fix: Don't break export when transcript is Latin-1 encoded.
Video SJSON transcripts are supposed to be UTF-8 encoded, but SJSON is an ad hoc format we made up to make it easier to build the transcripts viewer in the VideoBlock, and it is not well specified. Prior to this commit, an SJSON file containing Latin-1 encoded text outside the standard ASCII range (e.g. û) caused an error when we tried to export it. The conversion code now tries to decode SJSON bytes as UTF-8 first and falls back to Latin-1 if that fails. This was blocking an effort to export some Old Mongo courses (TNL-8007).
This commit is contained in:
@@ -650,6 +650,41 @@ class TestTranscript(unittest.TestCase):
|
||||
with self.assertRaises(NotFoundError):
|
||||
transcripts_utils.Transcript.asset(None, None, filename=transcripts_utils.NON_EXISTENT_TRANSCRIPT)
|
||||
|
||||
def test_latin1(self):
    """
    Test that Latin-1 encoded SJSON transcripts can be converted.

    The fixture contains a non-ASCII character (û) encoded as Latin-1, so
    the bytes are not valid UTF-8. Both SJSON conversion targets ('srt' and
    'txt') are exercised with the same bytes.
    """
    latin1_sjson_str = textwrap.dedent("""\
        {
            "start": [
                10500,
                15000
            ],
            "end": [
                13000,
                18000
            ],
            "text": [
                "û",
                "At the left we can see..."
            ]
        }
    """)
    latin1_sjson_bytes = latin1_sjson_str.encode('latin-1')

    # Guard the fixture itself: if these bytes ever became valid UTF-8, the
    # test would pass without exercising the Latin-1 handling at all.
    with self.assertRaises(UnicodeDecodeError):
        latin1_sjson_bytes.decode('utf-8')

    expected_srt = textwrap.dedent("""\
        0
        00:00:10,500 --> 00:00:13,000
        û

        1
        00:00:15,000 --> 00:00:18,000
        At the left we can see...

    """)
    srt_result = transcripts_utils.Transcript.convert(latin1_sjson_bytes, 'sjson', 'srt')
    assert srt_result == expected_srt

    # The 'txt' output joins the "text" entries with newlines.
    txt_result = transcripts_utils.Transcript.convert(latin1_sjson_bytes, 'sjson', 'txt')
    assert txt_result == "û\nAt the left we can see..."
|
||||
|
||||
|
||||
class TestSubsFilename(unittest.TestCase):
|
||||
"""
|
||||
|
||||
@@ -667,19 +667,27 @@ class Transcript:
|
||||
error_handling=SubRipFile.ERROR_RAISE
|
||||
)
|
||||
except Error as ex: # Base exception from pysrt
|
||||
raise TranscriptsGenerationException(str(ex)) # lint-amnesty, pylint: disable=raise-missing-from
|
||||
raise TranscriptsGenerationException(str(ex)) from ex
|
||||
|
||||
return json.dumps(generate_sjson_from_srt(srt_subs))
|
||||
|
||||
if input_format == 'sjson':
|
||||
# If the JSON file content is bytes, try UTF-8, then Latin-1
|
||||
if isinstance(content, bytes):
|
||||
try:
|
||||
content_str = content.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
content_str = content.decode('latin-1')
|
||||
else:
|
||||
content_str = content
|
||||
|
||||
if output_format == 'txt':
|
||||
text = json.loads(content)['text']
|
||||
text = json.loads(content_str)['text']
|
||||
text_without_none = [line if line else '' for line in text]
|
||||
return html.unescape("\n".join(text_without_none))
|
||||
|
||||
elif output_format == 'srt':
|
||||
return generate_srt_from_sjson(json.loads(content), speed=1.0)
|
||||
return generate_srt_from_sjson(json.loads(content_str), speed=1.0)
|
||||
|
||||
@staticmethod
|
||||
def asset(location, subs_id, lang='en', filename=None):
|
||||
|
||||
Reference in New Issue
Block a user