fix: Don't break export when transcript is Latin-1 encoded.

Video SJSON transcripts are supposed to be UTF-8 encoded, but SJSON
is an ad hoc thing we made up to make it easier to build the
transcripts viewer in the VideoBlock, and it's not well specified.
Prior to this commit, if you had an SJSON file with Latin-1 encoded
text outside the standard ASCII range (e.g. û), then we'd error out
while trying to export it.

This was blocking an effort to export some Old Mongo courses (TNL-8007).
This commit is contained in:
David Ormsbee
2021-03-25 10:10:14 -04:00
parent 2641336fc1
commit d2389fb7fb
2 changed files with 46 additions and 3 deletions

View File

@@ -650,6 +650,41 @@ class TestTranscript(unittest.TestCase):
with self.assertRaises(NotFoundError):
transcripts_utils.Transcript.asset(None, None, filename=transcripts_utils.NON_EXISTENT_TRANSCRIPT)
def test_latin1(self):
    """
    Check that an SJSON transcript supplied as Latin-1 bytes converts to SRT.

    The first caption is "û", which lies outside ASCII; its single-byte
    Latin-1 encoding is not valid UTF-8, so the payload cannot be decoded
    as UTF-8 and the converter has to cope with Latin-1 input.
    """
    # NOTE(review): SubRip cues are conventionally separated by blank
    # lines; none appear in this literal -- confirm it matches what
    # generate_srt_from_sjson actually emits.
    expected_srt = textwrap.dedent("""\
        0
        00:00:10,500 --> 00:00:13,000
        û
        1
        00:00:15,000 --> 00:00:18,000
        At the left we can see...
        """)

    # Two captions with start/end times in milliseconds.
    sjson_text = textwrap.dedent("""\
        {
        "start": [
        10500,
        15000
        ],
        "end": [
        13000,
        18000
        ],
        "text": [
        "û",
        "At the left we can see..."
        ]
        }
        """)

    # Hand the converter raw Latin-1 bytes rather than a str.
    actual_srt = transcripts_utils.Transcript.convert(
        sjson_text.encode('latin-1'),
        'sjson',
        'srt',
    )

    assert actual_srt == expected_srt
class TestSubsFilename(unittest.TestCase):
"""

View File

@@ -667,19 +667,27 @@ class Transcript:
error_handling=SubRipFile.ERROR_RAISE
)
except Error as ex: # Base exception from pysrt
raise TranscriptsGenerationException(str(ex)) # lint-amnesty, pylint: disable=raise-missing-from
raise TranscriptsGenerationException(str(ex)) from ex
return json.dumps(generate_sjson_from_srt(srt_subs))
if input_format == 'sjson':
# If the JSON file content is bytes, try UTF-8, then Latin-1
if isinstance(content, bytes):
try:
content_str = content.decode('utf-8')
except UnicodeDecodeError:
content_str = content.decode('latin-1')
else:
content_str = content
if output_format == 'txt':
text = json.loads(content)['text']
text = json.loads(content_str)['text']
text_without_none = [line if line else '' for line in text]
return html.unescape("\n".join(text_without_none))
elif output_format == 'srt':
return generate_srt_from_sjson(json.loads(content), speed=1.0)
return generate_srt_from_sjson(json.loads(content_str), speed=1.0)
@staticmethod
def asset(location, subs_id, lang='en', filename=None):