From 66382961a782dfc1cd33cd6f15c75b600f7fb314 Mon Sep 17 00:00:00 2001
From: Feanil Patel <feanil@edx.org>
Date: Wed, 2 Oct 2019 16:32:41 -0400
Subject: [PATCH] Fix all transcript related tests.

Treat transcript content as unicode strings and convert them at any edge
where we encounter them.  One decision made here was to not update
edx-val it treats transcripts as byte but is much closer to the actual
files so it makes more sense over there. But within the platform they
are generally passed around as serialized json and so it's much better
for them to be unicode.
---
 .../views/tests/test_transcript_settings.py         |  4 ++--
 .../contentstore/views/tests/test_transcripts.py    |  8 ++++----
 .../contentstore/views/transcript_settings.py       |  2 +-
 .../contentstore/views/transcripts_ajax.py          |  6 +++---
 common/lib/xmodule/xmodule/contentstore/mongo.py    | 13 ++++++++++---
 .../xmodule/video_module/transcripts_utils.py       | 12 +++++++-----
 6 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py b/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py
index b89e7992a0..f13705a121 100644
--- a/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py
+++ b/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py
@@ -307,7 +307,7 @@ class TranscriptUploadTest(CourseTestCase):
         """
         Tests that transcript upload handler works as expected.
         """
-        transcript_file_stream = BytesIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')
+        transcript_file_stream = six.StringIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n')
         # Make request to transcript upload handler
         response = self.client.post(
             self.view_url,
@@ -422,7 +422,7 @@ class TranscriptUploadTest(CourseTestCase):
         """
         Tests the transcript upload handler with an invalid transcript file.
         """
-        transcript_file_stream = BytesIO('An invalid transcript SubRip file content')
+        transcript_file_stream = six.StringIO('An invalid transcript SubRip file content')
         # Make request to transcript upload handler
         response = self.client.post(
             self.view_url,
diff --git a/cms/djangoapps/contentstore/views/tests/test_transcripts.py b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
index 3557c7a21a..4273daef9b 100644
--- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py
+++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
@@ -35,7 +35,7 @@ from xmodule.video_module.transcripts_utils import (
 TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE)
 TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex
 
-SRT_TRANSCRIPT_CONTENT = b"""0
+SRT_TRANSCRIPT_CONTENT = u"""0
 00:00:10,500 --> 00:00:13,000
 Elephant's Dream
 
@@ -160,7 +160,7 @@ class TestUploadTranscripts(BaseTranscripts):
         super(TestUploadTranscripts, self).setUp()
         self.contents = {
             'good': SRT_TRANSCRIPT_CONTENT,
-            'bad': b'Some BAD data',
+            'bad': u'Some BAD data',
         }
         # Create temporary transcript files
         self.good_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.srt')
@@ -186,7 +186,7 @@ class TestUploadTranscripts(BaseTranscripts):
         Setup a transcript file with suffix and content.
         """
         transcript_file = tempfile.NamedTemporaryFile(suffix=suffix)
-        wrapped_content = textwrap.dedent(content.decode('utf-8'))
+        wrapped_content = textwrap.dedent(content)
         if include_bom:
             wrapped_content = wrapped_content.encode('utf-8-sig')
             # Verify that ufeff(BOM) character is in content.
@@ -791,7 +791,7 @@ class TestDownloadTranscripts(BaseTranscripts):
         """
         self.assertEqual(response.status_code, expected_status_code)
         if expected_content:
-            self.assertEqual(response.content, expected_content)
+            assert response.content.decode('utf-8') == expected_content
 
     def test_download_youtube_transcript_success(self):
         """
diff --git a/cms/djangoapps/contentstore/views/transcript_settings.py b/cms/djangoapps/contentstore/views/transcript_settings.py
index d4457a63f6..03105f005b 100644
--- a/cms/djangoapps/contentstore/views/transcript_settings.py
+++ b/cms/djangoapps/contentstore/views/transcript_settings.py
@@ -229,7 +229,7 @@ def transcript_upload_handler(request):
             # Convert SRT transcript into an SJSON format
             # and upload it to S3.
             sjson_subs = Transcript.convert(
-                content=transcript_file.read(),
+                content=transcript_file.read().decode('utf-8'),
                 input_format=Transcript.SRT,
                 output_format=Transcript.SJSON
             )
diff --git a/cms/djangoapps/contentstore/views/transcripts_ajax.py b/cms/djangoapps/contentstore/views/transcripts_ajax.py
index 8c1a76b722..4510aa84ae 100644
--- a/cms/djangoapps/contentstore/views/transcripts_ajax.py
+++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py
@@ -219,7 +219,7 @@ def upload_transcripts(request):
             # Convert 'srt' transcript into the 'sjson' and upload it to
             # configured transcript storage. For example, S3.
             sjson_subs = Transcript.convert(
-                content=transcript_file.read(),
+                content=transcript_file.read().decode('utf-8'),
                 input_format=Transcript.SRT,
                 output_format=Transcript.SJSON
             )
@@ -322,7 +322,7 @@ def check_transcripts(request):
         filename = 'subs_{0}.srt.sjson'.format(item.sub)
         content_location = StaticContent.compute_location(item.location.course_key, filename)
         try:
-            local_transcripts = contentstore().find(content_location).data
+            local_transcripts = contentstore().find(content_location).data.decode('utf-8')
             transcripts_presence['current_item_subs'] = item.sub
         except NotFoundError:
             pass
@@ -336,7 +336,7 @@ def check_transcripts(request):
             filename = 'subs_{0}.srt.sjson'.format(youtube_id)
             content_location = StaticContent.compute_location(item.location.course_key, filename)
             try:
-                local_transcripts = contentstore().find(content_location).data
+                local_transcripts = contentstore().find(content_location).data.decode('utf-8')
                 transcripts_presence['youtube_local'] = True
             except NotFoundError:
                 log.debug(u"Can't find transcripts in storage for youtube id: %s", youtube_id)
diff --git a/common/lib/xmodule/xmodule/contentstore/mongo.py b/common/lib/xmodule/xmodule/contentstore/mongo.py
index 5ff01ddfae..ae747f97fa 100644
--- a/common/lib/xmodule/xmodule/contentstore/mongo.py
+++ b/common/lib/xmodule/xmodule/contentstore/mongo.py
@@ -101,12 +101,19 @@ class MongoContentStore(ContentStore):
                               locked=getattr(content, 'locked', False)) as fp:
 
             # It seems that this code thought that only some specific object would have the `__iter__` attribute
-            # but the bytes object in python 3 has one and should not use the chunking logic.
-            if hasattr(content.data, '__iter__') and not isinstance(content.data, six.binary_type):
+            # but many more objects have this in python3 and shouldn't be using the chunking logic. For string and
+            # byte streams we write them directly to gridfs and convert them to byetarrys if necessary.
+            if hasattr(content.data, '__iter__') and not isinstance(content.data, (six.binary_type, six.string_types)):
                 for chunk in content.data:
                     fp.write(chunk)
             else:
-                fp.write(content.data)
+                # Ideally we could just ensure that we don't get strings in here and only byte streams
+                # but being confident of that wolud be a lot more work than we have time for so we just
+                # handle both cases here.
+                if isinstance(content.data, six.text_type):
+                    fp.write(content.data.encode('utf-8'))
+                else:
+                    fp.write(content.data)
 
         return content
 
diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
index 1e334ffc20..aa1b34c101 100644
--- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
+++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
@@ -145,7 +145,7 @@ def youtube_video_transcript_name(youtube_text_api):
     # http://video.google.com/timedtext?type=list&v={VideoId}
     youtube_response = requests.get('http://' + youtube_text_api['url'], params=transcripts_param)
     if youtube_response.status_code == 200 and youtube_response.text:
-        youtube_data = etree.fromstring(youtube_response.content.encode('utf-8'), parser=utf8_parser)
+        youtube_data = etree.fromstring(youtube_response.text.encode('utf-8'), parser=utf8_parser)
         # iterate all transcripts information from youtube server
         for element in youtube_data:
             # search specific language code such as 'en' in transcripts info list
@@ -579,6 +579,8 @@ def get_video_transcript_content(edx_video_id, language_code):
     edx_video_id = clean_video_id(edx_video_id)
     if edxval_api and edx_video_id:
         transcript = edxval_api.get_video_transcript_data(edx_video_id, language_code)
+        if transcript and 'content' in transcript:
+            transcript['content'] = transcript['content'].decode('utf-8')
 
     return transcript
 
@@ -654,7 +656,7 @@ class Transcript(object):
         if input_format == 'srt':
 
             if output_format == 'txt':
-                text = SubRipFile.from_string(content.decode('utf-8')).text
+                text = SubRipFile.from_string(content).text
                 return HTMLParser().unescape(text)
 
             elif output_format == 'sjson':
@@ -663,7 +665,7 @@ class Transcript(object):
                     # the exception if something went wrong in parsing the transcript.
                     srt_subs = SubRipFile.from_string(
                         # Skip byte order mark(BOM) character
-                        content.decode('utf-8-sig') if six.PY2 else content.encode('utf-8').decode('utf-8-sig'),
+                        content.encode('utf-8').decode('utf-8-sig'),
                         error_handling=SubRipFile.ERROR_RAISE
                     )
                 except Error as ex:   # Base exception from pysrt
@@ -925,11 +927,11 @@ def get_transcript_for_video(video_location, subs_id, file_name, language):
     try:
         if subs_id is None:
             raise NotFoundError
-        content = Transcript.asset(video_location, subs_id, language).data
+        content = Transcript.asset(video_location, subs_id, language).data.decode('utf-8')
         base_name = subs_id
         input_format = Transcript.SJSON
     except NotFoundError:
-        content = Transcript.asset(video_location, None, language, file_name).data
+        content = Transcript.asset(video_location, None, language, file_name).data.decode('utf-8')
         base_name = os.path.splitext(file_name)[0]
         input_format = Transcript.SRT