diff --git a/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py b/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py index b89e7992a0..f13705a121 100644 --- a/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py +++ b/cms/djangoapps/contentstore/views/tests/test_transcript_settings.py @@ -307,7 +307,7 @@ class TranscriptUploadTest(CourseTestCase): """ Tests that transcript upload handler works as expected. """ - transcript_file_stream = BytesIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n') + transcript_file_stream = six.StringIO('0\n00:00:00,010 --> 00:00:00,100\nПривіт, edX вітає вас.\n\n') # Make request to transcript upload handler response = self.client.post( self.view_url, @@ -422,7 +422,7 @@ class TranscriptUploadTest(CourseTestCase): """ Tests the transcript upload handler with an invalid transcript file. """ - transcript_file_stream = BytesIO('An invalid transcript SubRip file content') + transcript_file_stream = six.StringIO('An invalid transcript SubRip file content') # Make request to transcript upload handler response = self.client.post( self.view_url, diff --git a/cms/djangoapps/contentstore/views/tests/test_transcripts.py b/cms/djangoapps/contentstore/views/tests/test_transcripts.py index 3557c7a21a..4273daef9b 100644 --- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py +++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py @@ -35,7 +35,7 @@ from xmodule.video_module.transcripts_utils import ( TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE) TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex -SRT_TRANSCRIPT_CONTENT = b"""0 +SRT_TRANSCRIPT_CONTENT = u"""0 00:00:10,500 --> 00:00:13,000 Elephant's Dream @@ -160,7 +160,7 @@ class TestUploadTranscripts(BaseTranscripts): super(TestUploadTranscripts, self).setUp() self.contents = { 'good': SRT_TRANSCRIPT_CONTENT, - 'bad': b'Some BAD data', + 'bad': u'Some BAD data', } # Create temporary transcript files self.good_srt_file = self.create_transcript_file(content=self.contents['good'], suffix='.srt') @@ -186,7 +186,7 @@ class TestUploadTranscripts(BaseTranscripts): Setup a transcript file with suffix and content. """ transcript_file = tempfile.NamedTemporaryFile(suffix=suffix) - wrapped_content = textwrap.dedent(content.decode('utf-8')) + wrapped_content = textwrap.dedent(content) if include_bom: wrapped_content = wrapped_content.encode('utf-8-sig') # Verify that ufeff(BOM) character is in content. @@ -791,7 +791,7 @@ class TestDownloadTranscripts(BaseTranscripts): """ self.assertEqual(response.status_code, expected_status_code) if expected_content: - self.assertEqual(response.content, expected_content) + assert response.content.decode('utf-8') == expected_content def test_download_youtube_transcript_success(self): """ diff --git a/cms/djangoapps/contentstore/views/transcript_settings.py b/cms/djangoapps/contentstore/views/transcript_settings.py index d4457a63f6..03105f005b 100644 --- a/cms/djangoapps/contentstore/views/transcript_settings.py +++ b/cms/djangoapps/contentstore/views/transcript_settings.py @@ -229,7 +229,7 @@ def transcript_upload_handler(request): # Convert SRT transcript into an SJSON format # and upload it to S3. sjson_subs = Transcript.convert( - content=transcript_file.read(), + content=transcript_file.read().decode('utf-8'), input_format=Transcript.SRT, output_format=Transcript.SJSON ) diff --git a/cms/djangoapps/contentstore/views/transcripts_ajax.py b/cms/djangoapps/contentstore/views/transcripts_ajax.py index 8c1a76b722..4510aa84ae 100644 --- a/cms/djangoapps/contentstore/views/transcripts_ajax.py +++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py @@ -219,7 +219,7 @@ def upload_transcripts(request): # Convert 'srt' transcript into the 'sjson' and upload it to # configured transcript storage. For example, S3. sjson_subs = Transcript.convert( - content=transcript_file.read(), + content=transcript_file.read().decode('utf-8'), input_format=Transcript.SRT, output_format=Transcript.SJSON ) @@ -322,7 +322,7 @@ def check_transcripts(request): filename = 'subs_{0}.srt.sjson'.format(item.sub) content_location = StaticContent.compute_location(item.location.course_key, filename) try: - local_transcripts = contentstore().find(content_location).data + local_transcripts = contentstore().find(content_location).data.decode('utf-8') transcripts_presence['current_item_subs'] = item.sub except NotFoundError: pass @@ -336,7 +336,7 @@ def check_transcripts(request): filename = 'subs_{0}.srt.sjson'.format(youtube_id) content_location = StaticContent.compute_location(item.location.course_key, filename) try: - local_transcripts = contentstore().find(content_location).data + local_transcripts = contentstore().find(content_location).data.decode('utf-8') transcripts_presence['youtube_local'] = True except NotFoundError: log.debug(u"Can't find transcripts in storage for youtube id: %s", youtube_id) diff --git a/common/lib/xmodule/xmodule/contentstore/mongo.py b/common/lib/xmodule/xmodule/contentstore/mongo.py index 5ff01ddfae..ae747f97fa 100644 --- a/common/lib/xmodule/xmodule/contentstore/mongo.py +++ b/common/lib/xmodule/xmodule/contentstore/mongo.py @@ -101,12 +101,19 @@ class MongoContentStore(ContentStore): locked=getattr(content, 'locked', False)) as fp: # It seems that this code thought that only some specific object would have the `__iter__` attribute - # but the bytes object in python 3 has one and should not use the chunking logic. - if hasattr(content.data, '__iter__') and not isinstance(content.data, six.binary_type): + # but many more objects have this in python3 and shouldn't be using the chunking logic. For string and + # byte streams we write them directly to gridfs and convert them to byetarrys if necessary. + if hasattr(content.data, '__iter__') and not isinstance(content.data, (six.binary_type, six.string_types)): for chunk in content.data: fp.write(chunk) else: - fp.write(content.data) + # Ideally we could just ensure that we don't get strings in here and only byte streams + # but being confident of that wolud be a lot more work than we have time for so we just + # handle both cases here. + if isinstance(content.data, six.text_type): + fp.write(content.data.encode('utf-8')) + else: + fp.write(content.data) return content diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py index 1e334ffc20..aa1b34c101 100644 --- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py +++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py @@ -145,7 +145,7 @@ def youtube_video_transcript_name(youtube_text_api): # http://video.google.com/timedtext?type=list&v={VideoId} youtube_response = requests.get('http://' + youtube_text_api['url'], params=transcripts_param) if youtube_response.status_code == 200 and youtube_response.text: - youtube_data = etree.fromstring(youtube_response.content.encode('utf-8'), parser=utf8_parser) + youtube_data = etree.fromstring(youtube_response.text.encode('utf-8'), parser=utf8_parser) # iterate all transcripts information from youtube server for element in youtube_data: # search specific language code such as 'en' in transcripts info list @@ -579,6 +579,8 @@ def get_video_transcript_content(edx_video_id, language_code): edx_video_id = clean_video_id(edx_video_id) if edxval_api and edx_video_id: transcript = edxval_api.get_video_transcript_data(edx_video_id, language_code) + if transcript and 'content' in transcript: + transcript['content'] = transcript['content'].decode('utf-8') return transcript @@ -654,7 +656,7 @@ class Transcript(object): if input_format == 'srt': if output_format == 'txt': - text = SubRipFile.from_string(content.decode('utf-8')).text + text = SubRipFile.from_string(content).text return HTMLParser().unescape(text) elif output_format == 'sjson': @@ -663,7 +665,7 @@ class Transcript(object): # the exception if something went wrong in parsing the transcript. srt_subs = SubRipFile.from_string( # Skip byte order mark(BOM) character - content.decode('utf-8-sig') if six.PY2 else content.encode('utf-8').decode('utf-8-sig'), + content.encode('utf-8').decode('utf-8-sig'), error_handling=SubRipFile.ERROR_RAISE ) except Error as ex: # Base exception from pysrt @@ -925,11 +927,11 @@ def get_transcript_for_video(video_location, subs_id, file_name, language): try: if subs_id is None: raise NotFoundError - content = Transcript.asset(video_location, subs_id, language).data + content = Transcript.asset(video_location, subs_id, language).data.decode('utf-8') base_name = subs_id input_format = Transcript.SJSON except NotFoundError: - content = Transcript.asset(video_location, None, language, file_name).data + content = Transcript.asset(video_location, None, language, file_name).data.decode('utf-8') base_name = os.path.splitext(file_name)[0] input_format = Transcript.SRT