Fix all transcript-related tests.

Treat transcript content as unicode strings and convert it at any edge
where we encounter bytes.  One decision made here was to not update
edx-val: it treats transcripts as bytes, but it is much closer to the
actual files, so bytes make more sense over there.  Within the platform,
transcripts are generally passed around as serialized JSON, so it's much
better for them to be unicode.
Feanil Patel
2019-10-02 16:32:41 -04:00
parent 7905640df2
commit 66382961a7
6 changed files with 27 additions and 18 deletions
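
The conversions in the diffs below all follow the same edge pattern: decode to
unicode as soon as bytes arrive from a byte-oriented store (GridFS, edx-val, an
uploaded file) and encode back to UTF-8 only when handing data to such a store.
A minimal sketch of that pattern follows; read_edge and write_edge are
hypothetical helpers used for illustration, not functions from the platform.

    import six

    def read_edge(raw):
        # Bytes arriving from storage are decoded once, at the boundary.
        if isinstance(raw, six.binary_type):
            return raw.decode('utf-8')
        return raw  # already unicode inside the platform

    def write_edge(text):
        # Unicode leaving for a byte-oriented store is encoded at the boundary.
        if isinstance(text, six.text_type):
            return text.encode('utf-8')
        return text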

View File

@@ -307,7 +307,7 @@ class TranscriptUploadTest(CourseTestCase):
"""
Tests that transcript upload handler works as expected.
"""
-transcript_file_stream = BytesIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')
+transcript_file_stream = six.StringIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')
# Make request to transcript upload handler
response = self.client.post(
self.view_url,
@@ -422,7 +422,7 @@ class TranscriptUploadTest(CourseTestCase):
"""
Tests the transcript upload handler with an invalid transcript file.
"""
-transcript_file_stream = BytesIO('An invalid transcript SubRip file content')
+transcript_file_stream = six.StringIO('An invalid transcript SubRip file content')
# Make request to transcript upload handler
response = self.client.post(
self.view_url,

View File

@@ -35,7 +35,7 @@ from xmodule.video_module.transcripts_utils import (
TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE)
TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex
-SRT_TRANSCRIPT_CONTENT = b"""0
+SRT_TRANSCRIPT_CONTENT = u"""0
00:00:10,500 --> 00:00:13,000
Elephant's Dream
@@ -160,7 +160,7 @@ class TestUploadTranscripts(BaseTranscripts):
super(TestUploadTranscripts, self).setUp()
self.contents = {
'good': SRT_TRANSCRIPT_CONTENT,
-'bad': b'Some BAD data',
+'bad': u'Some BAD data',
}
# Create temporary transcript files
self.good_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.srt')
@@ -186,7 +186,7 @@ class TestUploadTranscripts(BaseTranscripts):
Setup a transcript file with suffix and content.
"""
transcript_file = tempfile.NamedTemporaryFile(suffix=suffix)
-wrapped_content = textwrap.dedent(content.decode('utf-8'))
+wrapped_content = textwrap.dedent(content)
if include_bom:
wrapped_content = wrapped_content.encode('utf-8-sig')
# Verify that ufeff(BOM) character is in content.
@@ -791,7 +791,7 @@ class TestDownloadTranscripts(BaseTranscripts):
"""
self.assertEqual(response.status_code, expected_status_code)
if expected_content:
-self.assertEqual(response.content, expected_content)
+assert response.content.decode('utf-8') == expected_content
def test_download_youtube_transcript_success(self):
"""

View File

@@ -229,7 +229,7 @@ def transcript_upload_handler(request):
# Convert SRT transcript into an SJSON format
# and upload it to S3.
sjson_subs = Transcript.convert(
-content=transcript_file.read(),
+content=transcript_file.read().decode('utf-8'),
input_format=Transcript.SRT,
output_format=Transcript.SJSON
)

View File

@@ -219,7 +219,7 @@ def upload_transcripts(request):
# Convert 'srt' transcript into the 'sjson' and upload it to
# configured transcript storage. For example, S3.
sjson_subs = Transcript.convert(
-content=transcript_file.read(),
+content=transcript_file.read().decode('utf-8'),
input_format=Transcript.SRT,
output_format=Transcript.SJSON
)
@@ -322,7 +322,7 @@ def check_transcripts(request):
filename = 'subs_{0}.srt.sjson'.format(item.sub)
content_location = StaticContent.compute_location(item.location.course_key, filename)
try:
-local_transcripts = contentstore().find(content_location).data
+local_transcripts = contentstore().find(content_location).data.decode('utf-8')
transcripts_presence['current_item_subs'] = item.sub
except NotFoundError:
pass
@@ -336,7 +336,7 @@ def check_transcripts(request):
filename = 'subs_{0}.srt.sjson'.format(youtube_id)
content_location = StaticContent.compute_location(item.location.course_key, filename)
try:
-local_transcripts = contentstore().find(content_location).data
+local_transcripts = contentstore().find(content_location).data.decode('utf-8')
transcripts_presence['youtube_local'] = True
except NotFoundError:
log.debug(u"Can't find transcripts in storage for youtube id: %s", youtube_id)

View File

@@ -101,12 +101,19 @@ class MongoContentStore(ContentStore):
locked=getattr(content, 'locked', False)) as fp:
# It seems that this code thought that only some specific object would have the `__iter__` attribute
-# but the bytes object in python 3 has one and should not use the chunking logic.
-if hasattr(content.data, '__iter__') and not isinstance(content.data, six.binary_type):
+# but many more objects have this in python3 and shouldn't be using the chunking logic. For string and
+# byte streams we write them directly to gridfs and convert them to bytes if necessary.
+if hasattr(content.data, '__iter__') and not isinstance(content.data, (six.binary_type, six.string_types)):
for chunk in content.data:
fp.write(chunk)
else:
-fp.write(content.data)
+# Ideally we could just ensure that we don't get strings in here and only byte streams
+# but being confident of that would be a lot more work than we have time for so we just
+# handle both cases here.
+if isinstance(content.data, six.text_type):
+fp.write(content.data.encode('utf-8'))
+else:
+fp.write(content.data)
return content
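
Taken out of the class, the new write dispatch amounts to the standalone sketch
below; write_to_fs and the BytesIO file object are stand-ins for illustration,
not the platform's own code.

    import six
    from io import BytesIO

    def write_to_fs(fp, data):
        # Stream chunked iterables, but never treat str/bytes themselves as chunks.
        if hasattr(data, '__iter__') and not isinstance(data, (six.binary_type, six.string_types)):
            for chunk in data:
                fp.write(chunk)
        # Unicode text (e.g. SJSON transcripts) is encoded at this edge.
        elif isinstance(data, six.text_type):
            fp.write(data.encode('utf-8'))
        # Bytes pass straight through.
        else:
            fp.write(data)

    fp = BytesIO()
    write_to_fs(fp, u'{"text": ["hello"]}')  # unicode is encoded before writing
    write_to_fs(fp, b'raw bytes')            # bytes are written unchanged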

View File

@@ -145,7 +145,7 @@ def youtube_video_transcript_name(youtube_text_api):
# http://video.google.com/timedtext?type=list&v={VideoId}
youtube_response = requests.get('http://' + youtube_text_api['url'], params=transcripts_param)
if youtube_response.status_code == 200 and youtube_response.text:
-youtube_data = etree.fromstring(youtube_response.content.encode('utf-8'), parser=utf8_parser)
+youtube_data = etree.fromstring(youtube_response.text.encode('utf-8'), parser=utf8_parser)
# iterate all transcripts information from youtube server
for element in youtube_data:
# search specific language code such as 'en' in transcripts info list
@@ -579,6 +579,8 @@ def get_video_transcript_content(edx_video_id, language_code):
edx_video_id = clean_video_id(edx_video_id)
if edxval_api and edx_video_id:
transcript = edxval_api.get_video_transcript_data(edx_video_id, language_code)
+if transcript and 'content' in transcript:
+transcript['content'] = transcript['content'].decode('utf-8')
return transcript
@@ -654,7 +656,7 @@ class Transcript(object):
if input_format == 'srt':
if output_format == 'txt':
-text = SubRipFile.from_string(content.decode('utf-8')).text
+text = SubRipFile.from_string(content).text
return HTMLParser().unescape(text)
elif output_format == 'sjson':
@@ -663,7 +665,7 @@ class Transcript(object):
# the exception if something went wrong in parsing the transcript.
srt_subs = SubRipFile.from_string(
# Skip byte order mark(BOM) character
-content.decode('utf-8-sig') if six.PY2 else content.encode('utf-8').decode('utf-8-sig'),
+content.encode('utf-8').decode('utf-8-sig'),
error_handling=SubRipFile.ERROR_RAISE
)
except Error as ex: # Base exception from pysrt
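
The encode/decode round trip above is what strips a leading byte order mark
from unicode content: decoding with 'utf-8-sig' removes the encoded BOM when
one is present and changes nothing otherwise.  A quick illustration, assuming
Python 3 string literals:

    bom_srt = u'\ufeff0\n00:00:00,010 --> 00:00:00,100\nHi.\n'
    cleaned = bom_srt.encode('utf-8').decode('utf-8-sig')
    assert cleaned == u'0\n00:00:00,010 --> 00:00:00,100\nHi.\n'
    # A second round trip on BOM-free content is a no-op.
    assert cleaned.encode('utf-8').decode('utf-8-sig') == cleaned
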
@@ -925,11 +927,11 @@ def get_transcript_for_video(video_location, subs_id, file_name, language):
try:
if subs_id is None:
raise NotFoundError
-content = Transcript.asset(video_location, subs_id, language).data
+content = Transcript.asset(video_location, subs_id, language).data.decode('utf-8')
base_name = subs_id
input_format = Transcript.SJSON
except NotFoundError:
-content = Transcript.asset(video_location, None, language, file_name).data
+content = Transcript.asset(video_location, None, language, file_name).data.decode('utf-8')
base_name = os.path.splitext(file_name)[0]
input_format = Transcript.SRT