fix: Don't break export when transcript is Latin-1 encoded.

Video SJSON transcripts are supposed to be UTF-8 encoded, but SJSON is an ad hoc thing we made up to make it easier to build the transcripts viewer in the VideoBlock, and it's not well specified. Prior to this commit, if you had an SJSON file with Latin-1 encoded text outside the standard ASCII range (e.g. û), then we'd error out while trying to export it. This was blocking an effort to export some Old Mongo courses (TNL-8007). With this change, SJSON content supplied as bytes is decoded as UTF-8 first, falling back to Latin-1 if that fails.
2021-03-25 10:10:14 -04:00
parent 2641336fc1
commit d2389fb7fb
2 changed files with 46 additions and 3 deletions
--- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
+++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
@@ -650,6 +650,41 @@ class TestTranscript(unittest.TestCase):
        with self.assertRaises(NotFoundError):
            transcripts_utils.Transcript.asset(None, None, filename=transcripts_utils.NON_EXISTENT_TRANSCRIPT)

+    def test_latin1(self):
+        """
+        Test that Latin-1 encoded SJSON bytes are converted to SRT correctly.
+        """
+        latin1_sjson_str = textwrap.dedent("""\
+            {
+                "start": [
+                    10500,
+                    15000
+                ],
+                "end": [
+                    13000,
+                    18000
+                ],
+                "text": [
+                    "û",
+                    "At the left we can see..."
+                ]
+            }
+        """)
+        latin1_sjson_bytes = latin1_sjson_str.encode('latin-1')
+
+        expected_result = textwrap.dedent("""\
+            0
+            00:00:10,500 --> 00:00:13,000
+            û
+
+            1
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+
+        """)
+        result = transcripts_utils.Transcript.convert(latin1_sjson_bytes, 'sjson', 'srt')
+        assert result == expected_result
+

 class TestSubsFilename(unittest.TestCase):
    """
--- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
+++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
@@ -667,19 +667,27 @@ class Transcript:
                        error_handling=SubRipFile.ERROR_RAISE
                    )
                except Error as ex:   # Base exception from pysrt
-                    raise TranscriptsGenerationException(str(ex))  # lint-amnesty, pylint: disable=raise-missing-from
+                    raise TranscriptsGenerationException(str(ex)) from ex

                return json.dumps(generate_sjson_from_srt(srt_subs))

        if input_format == 'sjson':
+            # Bytes input: SJSON should be UTF-8 but is loosely specified, so try UTF-8 and fall back to Latin-1.
+            if isinstance(content, bytes):
+                try:
+                    content_str = content.decode('utf-8')
+                except UnicodeDecodeError:
+                    content_str = content.decode('latin-1')
+            else:
+                content_str = content

            if output_format == 'txt':
-                text = json.loads(content)['text']
+                text = json.loads(content_str)['text']
                text_without_none = [line if line else '' for line in text]
                return html.unescape("\n".join(text_without_none))

            elif output_format == 'srt':
-                return generate_srt_from_sjson(json.loads(content), speed=1.0)
+                return generate_srt_from_sjson(json.loads(content_str), speed=1.0)

    @staticmethod
    def asset(location, subs_id, lang='en', filename=None):