From d2389fb7fb542d3d30e023af65f43006c3dc7731 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Thu, 25 Mar 2021 10:10:14 -0400 Subject: [PATCH] fix: Don't break export when transcript is Latin-1 encoded. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Video SJSON transcripts are supposed to be UTF-8 encoded, but SJSON is an ad hoc thing we made up to make it easier to build the transcripts viewer in the VideoBlock, and it's not well specified. Prior to this commit, if you had an SJSON file with Latin-1 encoded text outside the standard ASCII range (e.g. û), then we'd error out while trying to export it. This was blocking an effort to export some Old Mongo courses (TNL-8007). --- .../tests/test_transcripts_utils.py | 35 +++++++++++++++++++ .../xmodule/video_module/transcripts_utils.py | 14 ++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py index 799cdd5b55..57dcf3d93e 100644 --- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py +++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py @@ -650,6 +650,41 @@ class TestTranscript(unittest.TestCase): with self.assertRaises(NotFoundError): transcripts_utils.Transcript.asset(None, None, filename=transcripts_utils.NON_EXISTENT_TRANSCRIPT) + def test_latin1(self): + """ + Test to make sure Latin-1 encoded transcripts work. + """ + latin1_sjson_str = textwrap.dedent("""\ + { + "start": [ + 10500, + 15000 + ], + "end": [ + 13000, + 18000 + ], + "text": [ + "û", + "At the left we can see..." + ] + } + """) + latin1_sjson_bytes = latin1_sjson_str.encode('latin-1') + + expected_result = textwrap.dedent("""\ + 0 + 00:00:10,500 --> 00:00:13,000 + û + + 1 + 00:00:15,000 --> 00:00:18,000 + At the left we can see... + + """) + result = transcripts_utils.Transcript.convert(latin1_sjson_bytes, 'sjson', 'srt') + assert result == expected_result + class TestSubsFilename(unittest.TestCase): """ diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py index 77c7f50fae..7abe134de1 100644 --- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py +++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py @@ -667,19 +667,27 @@ class Transcript: error_handling=SubRipFile.ERROR_RAISE ) except Error as ex: # Base exception from pysrt - raise TranscriptsGenerationException(str(ex)) # lint-amnesty, pylint: disable=raise-missing-from + raise TranscriptsGenerationException(str(ex)) from ex return json.dumps(generate_sjson_from_srt(srt_subs)) if input_format == 'sjson': + # If the JSON file content is bytes, try UTF-8, then Latin-1 + if isinstance(content, bytes): + try: + content_str = content.decode('utf-8') + except UnicodeDecodeError: + content_str = content.decode('latin-1') + else: + content_str = content if output_format == 'txt': - text = json.loads(content)['text'] + text = json.loads(content_str)['text'] text_without_none = [line if line else '' for line in text] return html.unescape("\n".join(text_without_none)) elif output_format == 'srt': - return generate_srt_from_sjson(json.loads(content), speed=1.0) + return generate_srt_from_sjson(json.loads(content_str), speed=1.0) @staticmethod def asset(location, subs_id, lang='en', filename=None):