diff --git a/cms/djangoapps/contentstore/views/tests/test_transcripts.py b/cms/djangoapps/contentstore/views/tests/test_transcripts.py index bc00f060a4..dbce92a022 100644 --- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py +++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py @@ -302,7 +302,7 @@ class TestUploadtranscripts(Basetranscripts): """ Test uploading subs containing BOM(Byte Order Mark), e.g. U+FEFF """ - filedate = textwrap.dedent(""" + filedata = textwrap.dedent(""" 1 00:00:10,500 --> 00:00:13,000 Test ufeff characters @@ -313,8 +313,8 @@ class TestUploadtranscripts(Basetranscripts): """).encode('utf-8-sig') # Verify that ufeff character is in filedata. - self.assertIn("ufeff", filedate) - self.ufeff_srt_file.write(filedate) + self.assertIn("ufeff", filedata) + self.ufeff_srt_file.write(filedata) self.ufeff_srt_file.seek(0) link = reverse('upload_transcripts') diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py index 2c1a10e71c..8dca188d9a 100644 --- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py +++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py @@ -398,10 +398,11 @@ def generate_sjson_for_all_speeds(item, user_filename, result_subs_dict, lang): if not lang: lang = item.transcript_language + # Used utf-8-sig encoding type instead of utf-8 to remove BOM(Byte Order Mark), e.g. U+FEFF generate_subs_from_source( result_subs_dict, os.path.splitext(user_filename)[1][1:], - srt_transcripts.data.decode('utf8'), + srt_transcripts.data.decode('utf-8-sig'), item, lang ) diff --git a/common/test/acceptance/tests/video/test_studio_video_editor.py b/common/test/acceptance/tests/video/test_studio_video_editor.py index d79a2ff4e9..31628bc2a1 100644 --- a/common/test/acceptance/tests/video/test_studio_video_editor.py +++ b/common/test/acceptance/tests/video/test_studio_video_editor.py @@ -489,3 +489,23 @@ class VideoEditorTest(CMSVideoBaseTest): self.assertIn(unicode_text, self.video.captions_text) self.assertEqual(self.video.caption_languages.keys(), [u'table', u'uk']) self.assertEqual(self.video.caption_languages.keys()[0], 'table') + + def test_upload_transcript_with_BOM(self): + """ + Scenario: User can upload transcript file with BOM(Byte Order Mark) in it. + Given I have created a Video component + And I edit the component + And I open tab "Advanced" + And I upload transcript file "chinese_transcripts_with_BOM.srt" for "zh" language code + And I save changes + Then when I view the video it does show the captions + And I see "莎拉·佩林 (Sarah Palin)" text in the captions + """ + self._create_video_component() + self.edit_component() + self.open_advanced_tab() + self.video.upload_translation('chinese_transcripts_with_BOM.srt', 'zh') + self.save_unit_settings() + self.assertTrue(self.video.is_captions_visible()) + unicode_text = "莎拉·佩林 (Sarah Palin)".decode('utf-8') + self.assertIn(unicode_text, self.video.captions_lines()) diff --git a/common/test/data/uploads/chinese_transcripts_with_BOM.srt b/common/test/data/uploads/chinese_transcripts_with_BOM.srt new file mode 100644 index 0000000000..dc7eb407cb --- /dev/null +++ b/common/test/data/uploads/chinese_transcripts_with_BOM.srt @@ -0,0 +1,19 @@ +1 +00:00:16,850 --> 00:00:23,850 +莎拉·佩林 (Sarah Palin) 的著作《我行我素》被乔纳森·拉班(Jonathan Raban) 评论为“400页对高尚无知的赞美” + +2 +00:00:24,040 --> 00:00:30,680 +他是什么意思呢?拉班所指的那种思想 + +3 +00:00:30,680 --> 00:00:35,660 +可以用“我不太懂艺术 但我知道我喜欢什么”做比喻 + +4 +00:00:35,660 --> 00:00:42,410 +他将其描述为“常识性保守派” + +5 +00:00:42,410 --> 00:00:47,510 +即占据道德制高点的外行人能比专家更好地评价 比方说