Merge pull request #22532 from edx/diana/transcript-handling

Standardize input into SubRipFile
This commit is contained in:
Diana Huang
2019-12-13 14:07:15 -05:00
committed by GitHub

View File

@@ -652,17 +652,16 @@ class Transcript(object):
return content
if input_format == 'srt':
# Standardize content into bytes for later decoding.
if isinstance(content, text_type):
content = content.encode('utf-8')
if output_format == 'txt':
text = SubRipFile.from_string(content).text
text = SubRipFile.from_string(content.decode('utf-8')).text
return HTMLParser().unescape(text)
elif output_format == 'sjson':
try:
# With error handling (set to 'ERROR_RAISE'), we will be getting
# the exception if something went wrong in parsing the transcript.
if isinstance(content, text_type):
content = content.encode('utf-8')
srt_subs = SubRipFile.from_string(
# Skip byte order mark(BOM) character
content.decode('utf-8-sig'),