From d2389fb7fb542d3d30e023af65f43006c3dc7731 Mon Sep 17 00:00:00 2001
From: David Ormsbee <dave@edx.org>
Date: Thu, 25 Mar 2021 10:10:14 -0400
Subject: [PATCH] fix: Don't break export when transcript is Latin-1 encoded.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Video SJSON transcripts are supposed to be UTF-8 encoded, but SJSON
is an ad hoc thing we made up to make it easier to build the
transcripts viewer in the VideoBlock, and it's not well specified.
Prior to this commit, if you had an SJSON file with Latin-1 encoded
text outside the standard ASCII range (e.g. û), then we'd error out
while trying to export it. We now try to decode SJSON bytes as UTF-8
first and fall back to Latin-1 if that fails.

This was blocking an effort to export some Old Mongo courses (TNL-8007).
---
 .../tests/test_transcripts_utils.py           | 35 +++++++++++++++++++
 .../xmodule/video_module/transcripts_utils.py | 14 ++++++--
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
index 799cdd5b55..57dcf3d93e 100644
--- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
+++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
@@ -650,6 +650,41 @@ class TestTranscript(unittest.TestCase):
         with self.assertRaises(NotFoundError):
             transcripts_utils.Transcript.asset(None, None, filename=transcripts_utils.NON_EXISTENT_TRANSCRIPT)
 
+    def test_latin1(self):
+        """
+        Test that Latin-1 encoded SJSON bytes are converted to the expected SRT text.
+        """
+        latin1_sjson_str = textwrap.dedent("""\
+            {
+                "start": [
+                    10500,
+                    15000
+                ],
+                "end": [
+                    13000,
+                    18000
+                ],
+                "text": [
+                    "û",
+                    "At the left we can see..."
+                ]
+            }
+        """)
+        latin1_sjson_bytes = latin1_sjson_str.encode('latin-1')
+
+        expected_result = textwrap.dedent("""\
+            0
+            00:00:10,500 --> 00:00:13,000
+            û
+
+            1
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+
+        """)
+        result = transcripts_utils.Transcript.convert(latin1_sjson_bytes, 'sjson', 'srt')
+        assert result == expected_result
+
 
 class TestSubsFilename(unittest.TestCase):
     """
diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
index 77c7f50fae..7abe134de1 100644
--- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
+++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
@@ -667,19 +667,27 @@ class Transcript:
                         error_handling=SubRipFile.ERROR_RAISE
                     )
                 except Error as ex:   # Base exception from pysrt
-                    raise TranscriptsGenerationException(str(ex))  # lint-amnesty, pylint: disable=raise-missing-from
+                    raise TranscriptsGenerationException(str(ex)) from ex
 
                 return json.dumps(generate_sjson_from_srt(srt_subs))
 
         if input_format == 'sjson':
+            # If the JSON file content is bytes, try UTF-8 (with or without a BOM), then Latin-1
+            if isinstance(content, bytes):
+                try:
+                    content_str = content.decode('utf-8-sig')  # json.loads() rejects a BOM in str input
+                except UnicodeDecodeError:
+                    content_str = content.decode('latin-1')
+            else:
+                content_str = content
 
             if output_format == 'txt':
-                text = json.loads(content)['text']
+                text = json.loads(content_str)['text']
                 text_without_none = [line if line else '' for line in text]
                 return html.unescape("\n".join(text_without_none))
 
             elif output_format == 'srt':
-                return generate_srt_from_sjson(json.loads(content), speed=1.0)
+                return generate_srt_from_sjson(json.loads(content_str), speed=1.0)
 
     @staticmethod
     def asset(location, subs_id, lang='en', filename=None):