diff --git a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
index 799cdd5b55..57dcf3d93e 100644
--- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
+++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
@@ -650,6 +650,41 @@ class TestTranscript(unittest.TestCase):
         with self.assertRaises(NotFoundError):
             transcripts_utils.Transcript.asset(None, None, filename=transcripts_utils.NON_EXISTENT_TRANSCRIPT)
 
+    def test_latin1(self):
+        """
+        Test to make sure Latin-1 encoded transcripts work.
+        """
+        latin1_sjson_str = textwrap.dedent("""\
+            {
+                "start": [
+                    10500,
+                    15000
+                ],
+                "end": [
+                    13000,
+                    18000
+                ],
+                "text": [
+                    "û",
+                    "At the left we can see..."
+                ]
+            }
+        """)
+        latin1_sjson_bytes = latin1_sjson_str.encode('latin-1')
+
+        expected_result = textwrap.dedent("""\
+            0
+            00:00:10,500 --> 00:00:13,000
+            û
+
+            1
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+
+        """)
+        result = transcripts_utils.Transcript.convert(latin1_sjson_bytes, 'sjson', 'srt')
+        assert result == expected_result
+
 
 class TestSubsFilename(unittest.TestCase):
     """
diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
index 77c7f50fae..7abe134de1 100644
--- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
+++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
@@ -667,19 +667,27 @@ class Transcript:
                         error_handling=SubRipFile.ERROR_RAISE
                     )
                 except Error as ex:  # Base exception from pysrt
-                    raise TranscriptsGenerationException(str(ex))  # lint-amnesty, pylint: disable=raise-missing-from
+                    raise TranscriptsGenerationException(str(ex)) from ex
 
                 return json.dumps(generate_sjson_from_srt(srt_subs))
 
         if input_format == 'sjson':
+            # If the JSON file content is bytes, try UTF-8 (utf-8-sig also drops a leading BOM), then Latin-1
+            if isinstance(content, bytes):
+                try:
+                    content_str = content.decode('utf-8-sig')
+                except UnicodeDecodeError:
+                    content_str = content.decode('latin-1')
+            else:
+                content_str = content
 
             if output_format == 'txt':
-                text = json.loads(content)['text']
+                text = json.loads(content_str)['text']
                 text_without_none = [line if line else '' for line in text]
                 return html.unescape("\n".join(text_without_none))
 
             elif output_format == 'srt':
-                return generate_srt_from_sjson(json.loads(content), speed=1.0)
+                return generate_srt_from_sjson(json.loads(content_str), speed=1.0)
 
     @staticmethod
     def asset(location, subs_id, lang='en', filename=None):