Merge pull request #6582 from edx/waheed/tnl935-fix-transcript-skip-first-line

Fixed transcript upload skipping the first line when the file contains a BOM (Byte Order Mark).
2015-01-16 15:00:21 +05:00
parent 2db00c5723 47a372282a
commit 5958ffb33d
2 changed files with 44 additions and 1 deletions
--- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py
+++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
@@ -116,6 +116,8 @@ class TestUploadtranscripts(Basetranscripts):
        """))
        self.bad_name_srt_file.seek(0)

+        self.ufeff_srt_file = tempfile.NamedTemporaryFile(suffix='.srt')
+
    def test_success_video_module_source_subs_uploading(self):
        self.item.data = textwrap.dedent("""
            <video youtube="">
@@ -296,12 +298,52 @@ class TestUploadtranscripts(Basetranscripts):
        self.assertEqual(resp.status_code, 400)
        self.assertEqual(json.loads(resp.content).get('status'), 'Undefined file extension.')

+    def test_subs_uploading_with_byte_order_mark(self):
+        """
+        Test uploading subs that start with a UTF-8 BOM (Byte Order Mark, U+FEFF).
+        """
+        filedata = textwrap.dedent("""
+            1
+            00:00:10,500 --> 00:00:13,000
+            Test ufeff characters
+
+            2
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+        """).encode('utf-8-sig')
+
+        # Check the encoded BOM bytes; the text "ufeff" is in the subtitle body anyway.
+        self.assertTrue(filedata.startswith(b'\xef\xbb\xbf'))
+        self.ufeff_srt_file.write(filedata)
+        self.ufeff_srt_file.seek(0)
+
+        link = reverse('upload_transcripts')
+        filename = os.path.splitext(os.path.basename(self.ufeff_srt_file.name))[0]
+        resp = self.client.post(link, {
+            'locator': self.video_usage_key,
+            'transcript-file': self.ufeff_srt_file,
+            'video_list': json.dumps([{
+                'type': 'html5',
+                'video': filename,
+                'mode': 'mp4',
+            }])
+        })
+        self.assertEqual(resp.status_code, 200)
+
+        content_location = StaticContent.compute_location(
+            self.course.id, 'subs_{0}.srt.sjson'.format(filename))
+        self.assertTrue(contentstore().find(content_location))
+
+        subs_text = json.loads(contentstore().find(content_location).data).get('text')
+        self.assertIn("Test ufeff characters", subs_text)
+
    def tearDown(self):
        super(TestUploadtranscripts, self).tearDown()

        self.good_srt_file.close()
        self.bad_data_srt_file.close()
        self.bad_name_srt_file.close()
+        self.ufeff_srt_file.close()


 class TestDownloadtranscripts(Basetranscripts):
--- a/cms/djangoapps/contentstore/views/transcripts_ajax.py
+++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py
@@ -100,7 +100,8 @@ def upload_transcripts(request):
    except ValueError:
        return error_response(response, 'Invalid video_list JSON.')

-    source_subs_filedata = request.FILES['transcript-file'].read().decode('utf8')
+    # Decode with 'utf-8-sig' rather than 'utf-8' so that a leading BOM (Byte Order Mark, U+FEFF) is stripped.
+    source_subs_filedata = request.FILES['transcript-file'].read().decode('utf-8-sig')
    source_subs_filename = request.FILES['transcript-file'].name

    if '.' not in source_subs_filename: