Fix all transcript-related tests.

Treat transcript content as unicode strings and convert it at any edge
where we encounter bytes.  One decision made here was to not update
edx-val: it treats transcripts as bytes, but it is much closer to the
actual files, so bytes make more sense over there.  Within the platform,
transcripts are generally passed around as serialized JSON, so it's much
better for them to be unicode.
Feanil Patel
2019-10-02 16:32:41 -04:00
parent 7905640df2
commit 66382961a7
6 changed files with 27 additions and 18 deletions
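
The conversions in the diffs below all follow the same edge pattern: decode to
unicode as soon as bytes arrive from a byte-oriented store (GridFS, edx-val, an
uploaded file) and encode back to UTF-8 only when handing data to such a store.
A minimal sketch of that pattern follows; read_edge and write_edge are
hypothetical helpers used for illustration, not functions from the platform.

    import six

    def read_edge(raw):
        # Bytes arriving from storage are decoded once, at the boundary.
        if isinstance(raw, six.binary_type):
            return raw.decode('utf-8')
        return raw  # already unicode inside the platform

    def write_edge(text):
        # Unicode leaving for a byte-oriented store is encoded at the boundary.
        if isinstance(text, six.text_type):
            return text.encode('utf-8')
        return text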

View File

@@ -307,7 +307,7 @@ class TranscriptUploadTest(CourseTestCase):
"""
Tests that transcript upload handler works as expected.
"""
-transcript_file_stream = BytesIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')
+transcript_file_stream = six.StringIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')
# Make request to transcript upload handler
response = self.client.post(
self.view_url,
@@ -422,7 +422,7 @@ class TranscriptUploadTest(CourseTestCase):
"""
Tests the transcript upload handler with an invalid transcript file.
"""
-transcript_file_stream = BytesIO('An invalid transcript SubRip file content')
+transcript_file_stream = six.StringIO('An invalid transcript SubRip file content')
# Make request to transcript upload handler
response = self.client.post(
self.view_url,

View File

@@ -35,7 +35,7 @@ from xmodule.video_module.transcripts_utils import (
TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE)
TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex
-SRT_TRANSCRIPT_CONTENT = b"""0
+SRT_TRANSCRIPT_CONTENT = u"""0
00:00:10,500 --> 00:00:13,000
Elephant's Dream
@@ -160,7 +160,7 @@ class TestUploadTranscripts(BaseTranscripts):
super(TestUploadTranscripts, self).setUp()
self.contents = {
'good': SRT_TRANSCRIPT_CONTENT,
-'bad': b'Some BAD data',
+'bad': u'Some BAD data',
}
# Create temporary transcript files
self.good_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.srt')
@@ -186,7 +186,7 @@ class TestUploadTranscripts(BaseTranscripts):
Setup a transcript file with suffix and content.
"""
transcript_file = tempfile.NamedTemporaryFile(suffix=suffix)
-wrapped_content = textwrap.dedent(content.decode('utf-8'))
+wrapped_content = textwrap.dedent(content)
if include_bom:
wrapped_content = wrapped_content.encode('utf-8-sig')
# Verify that ufeff(BOM) character is in content.
@@ -791,7 +791,7 @@ class TestDownloadTranscripts(BaseTranscripts):
"""
self.assertEqual(response.status_code, expected_status_code)
if expected_content:
-self.assertEqual(response.content, expected_content)
+assert response.content.decode('utf-8') == expected_content
def test_download_youtube_transcript_success(self):
"""

View File

@@ -229,7 +229,7 @@ def transcript_upload_handler(request):
# Convert SRT transcript into an SJSON format
# and upload it to S3.
sjson_subs = Transcript.convert(
-content=transcript_file.read(),
+content=transcript_file.read().decode('utf-8'),
input_format=Transcript.SRT,
output_format=Transcript.SJSON
)

View File

@@ -219,7 +219,7 @@ def upload_transcripts(request):
# Convert 'srt' transcript into the 'sjson' and upload it to
# configured transcript storage. For example, S3.
sjson_subs = Transcript.convert(
-content=transcript_file.read(),
+content=transcript_file.read().decode('utf-8'),
input_format=Transcript.SRT,
output_format=Transcript.SJSON
)
@@ -322,7 +322,7 @@ def check_transcripts(request):
filename = 'subs_{0}.srt.sjson'.format(item.sub)
content_location = StaticContent.compute_location(item.location.course_key, filename)
try:
-local_transcripts = contentstore().find(content_location).data
+local_transcripts = contentstore().find(content_location).data.decode('utf-8')
transcripts_presence['current_item_subs'] = item.sub
except NotFoundError:
pass
@@ -336,7 +336,7 @@ def check_transcripts(request):
filename = 'subs_{0}.srt.sjson'.format(youtube_id)
content_location = StaticContent.compute_location(item.location.course_key, filename)
try:
-local_transcripts = contentstore().find(content_location).data
+local_transcripts = contentstore().find(content_location).data.decode('utf-8')
transcripts_presence['youtube_local'] = True
except NotFoundError:
log.debug(u"Can't find transcripts in storage for youtube id: %s", youtube_id)

View File

@@ -101,12 +101,19 @@ class MongoContentStore(ContentStore):
locked=getattr(content, 'locked', False)) as fp:
# It seems that this code thought that only some specific object would have the `__iter__` attribute
-# but the bytes object in python 3 has one and should not use the chunking logic.
-if hasattr(content.data, '__iter__') and not isinstance(content.data, six.binary_type):
+# but many more objects have this in python3 and shouldn't be using the chunking logic. For string and
+# byte streams we write them directly to gridfs and convert them to bytes if necessary.
+if hasattr(content.data, '__iter__') and not isinstance(content.data, (six.binary_type, six.string_types)):
for chunk in content.data:
fp.write(chunk)
else:
-fp.write(content.data)
+# Ideally we could just ensure that we don't get strings in here and only byte streams
+# but being confident of that would be a lot more work than we have time for so we just
+# handle both cases here.
+if isinstance(content.data, six.text_type):
+fp.write(content.data.encode('utf-8'))
+else:
+fp.write(content.data)
return content
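
Taken out of the class, the new write dispatch amounts to the standalone sketch
below; write_to_fs and the BytesIO file object are stand-ins for illustration,
not the platform's own code.

    import six
    from io import BytesIO

    def write_to_fs(fp, data):
        # Stream chunked iterables, but never treat str/bytes themselves as chunks.
        if hasattr(data, '__iter__') and not isinstance(data, (six.binary_type, six.string_types)):
            for chunk in data:
                fp.write(chunk)
        # Unicode text (e.g. SJSON transcripts) is encoded at this edge.
        elif isinstance(data, six.text_type):
            fp.write(data.encode('utf-8'))
        # Bytes pass straight through.
        else:
            fp.write(data)

    fp = BytesIO()
    write_to_fs(fp, u'{"text": ["hello"]}')  # unicode is encoded before writing
    write_to_fs(fp, b'raw bytes')            # bytes are written unchanged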

View File

@@ -145,7 +145,7 @@ def youtube_video_transcript_name(youtube_text_api):
# http://video.google.com/timedtext?type=list&v={VideoId}
youtube_response = requests.get('http://' + youtube_text_api['url'], params=transcripts_param)
if youtube_response.status_code == 200 and youtube_response.text:
-youtube_data = etree.fromstring(youtube_response.content.encode('utf-8'), parser=utf8_parser)
+youtube_data = etree.fromstring(youtube_response.text.encode('utf-8'), parser=utf8_parser)
# iterate all transcripts information from youtube server
for element in youtube_data:
# search specific language code such as 'en' in transcripts info list
@@ -579,6 +579,8 @@ def get_video_transcript_content(edx_video_id, language_code):
edx_video_id = clean_video_id(edx_video_id)
if edxval_api and edx_video_id:
transcript = edxval_api.get_video_transcript_data(edx_video_id, language_code)
+if transcript and 'content' in transcript:
+transcript['content'] = transcript['content'].decode('utf-8')
return transcript
@@ -654,7 +656,7 @@ class Transcript(object):
if input_format == 'srt':
if output_format == 'txt':
-text = SubRipFile.from_string(content.decode('utf-8')).text
+text = SubRipFile.from_string(content).text
return HTMLParser().unescape(text)
elif output_format == 'sjson':
@@ -663,7 +665,7 @@ class Transcript(object):
# the exception if something went wrong in parsing the transcript.
srt_subs = SubRipFile.from_string(
# Skip byte order mark(BOM) character
-content.decode('utf-8-sig') if six.PY2 else content.encode('utf-8').decode('utf-8-sig'),
+content.encode('utf-8').decode('utf-8-sig'),
error_handling=SubRipFile.ERROR_RAISE
)
except Error as ex: # Base exception from pysrt
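
The encode/decode round trip above is what strips a leading byte order mark
from unicode content: decoding with 'utf-8-sig' removes the encoded BOM when
one is present and changes nothing otherwise.  A quick illustration, assuming
Python 3 string literals:

    bom_srt = u'\ufeff0\n00:00:00,010 --> 00:00:00,100\nHi.\n'
    cleaned = bom_srt.encode('utf-8').decode('utf-8-sig')
    assert cleaned == u'0\n00:00:00,010 --> 00:00:00,100\nHi.\n'
    # A second round trip on BOM-free content is a no-op.
    assert cleaned.encode('utf-8').decode('utf-8-sig') == cleaned
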
@@ -925,11 +927,11 @@ def get_transcript_for_video(video_location, subs_id, file_name, language):
try:
if subs_id is None:
raise NotFoundError
-content = Transcript.asset(video_location, subs_id, language).data
+content = Transcript.asset(video_location, subs_id, language).data.decode('utf-8')
base_name = subs_id
input_format = Transcript.SJSON
except NotFoundError:
-content = Transcript.asset(video_location, None, language, file_name).data
+content = Transcript.asset(video_location, None, language, file_name).data.decode('utf-8')
base_name = os.path.splitext(file_name)[0]
input_format = Transcript.SRT