From 0d26738fcedec1911840b18a4a6e6e01103159bb Mon Sep 17 00:00:00 2001 From: connorhaugh <49422820+connorhaugh@users.noreply.github.com> Date: Wed, 21 Jul 2021 15:52:54 -0400 Subject: [PATCH] fix: Prevent Transcripts from Failing Export, Edit (#28242) Instead of having json errors in transcript acquisition and conversion cause errors, have transcription conversion and acquisition simply return an error message in the transcription which can prompt a change from the user. Although not uploading a transcript is handled, transcripts can often cause errors in edit, export, and other activities due to json errors. These errors block the entire use of these features, so to allow for reupload, etc, we add an error message instead of transcript and log the event. In response to [TNL-8539](https://openedx.atlassian.net/secure/RapidBoard.jspa?rapidView=580&projectKey=TNL&modal=detail&selectedIssue=TNL-8539) Testing: Unit tests coverage is included in the PR. Upload, import, and export of courses with transcriptions is also easily hand-testable. Just create a video in studio, add an irrelevant transcript. Then try to import, export, and edit the problem. Expected behavior is success. --- .../tests/test_transcripts_utils.py | 16 +++++++++++ .../xmodule/video_module/transcripts_utils.py | 27 ++++++++++++++----- .../courseware/tests/test_video_handlers.py | 7 ++--- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py index d4d8277632..216b31651b 100644 --- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py +++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py @@ -640,6 +640,12 @@ class TestTranscript(unittest.TestCase): with self.assertRaises(transcripts_utils.TranscriptsGenerationException): transcripts_utils.Transcript.convert(invalid_srt_transcript, 'srt', 'sjson') + def test_convert_invalid_invalid_sjson_to_srt(self): + invalid_content = "Text with special character /\"\'\b\f\t\r\n." + error_transcript = {"start": [1], "end": [2], "text": ["An error occured obtaining the transcript."]} + assert transcripts_utils.Transcript.convert(invalid_content, 'sjson', 'txt') == error_transcript['text'][0] + assert error_transcript["text"][0] in transcripts_utils.Transcript.convert(invalid_content, 'sjson', 'srt') + def test_dummy_non_existent_transcript(self): """ Test `Transcript.asset` raises `NotFoundError` for dummy non-existent transcript. @@ -995,6 +1001,16 @@ class TestGetTranscript(SharedModuleStoreTestCase): exception_message = str(no_en_transcript_exception.exception) self.assertEqual(exception_message, 'No transcript for `en` language') + @patch('xmodule.video_module.transcripts_utils.edxval_api.get_video_transcript_data') + def test_get_transcript_incorrect_json_(self, mock_get_video_transcript_data): + """ + Verify that `get transcript` function returns a working json file if the original throws an error + """ + error_transcript = {"start": [1], "end": [2], "text": ["An error occured obtaining the transcript."]} + mock_get_video_transcript_data.side_effect = ValueError + content, _, _ = transcripts_utils.get_transcript(self.video, 'zh') + assert error_transcript["text"][0] in content + @ddt.data( transcripts_utils.TranscriptsGenerationException, UnicodeDecodeError('aliencodec', b'\x02\x01', 1, 2, 'alien codec found!') diff --git a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py index 7abe134de1..c33b70a2ad 100644 --- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py +++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py @@ -577,8 +577,17 @@ def get_video_transcript_content(edx_video_id, language_code): transcript = None edx_video_id = clean_video_id(edx_video_id) if edxval_api and edx_video_id: - transcript = edxval_api.get_video_transcript_data(edx_video_id, language_code) - + try: + transcript = edxval_api.get_video_transcript_data(edx_video_id, language_code) + except ValueError: + log.exception( + f"Error getting transcript from edx-val id: {edx_video_id}: language code {language_code}" + ) + content = '{"start": [1],"end": [2],"text": ["An error occured obtaining the transcript."]}' + transcript = dict( + file_name='error-{edx_video_id}-{language_code}.srt', + content=Transcript.convert(content, 'sjson', 'srt') + ) return transcript @@ -680,14 +689,20 @@ class Transcript: content_str = content.decode('latin-1') else: content_str = content - + try: + content_dict = json.loads(content_str) + except ValueError: + truncated = content_str[:100].strip() + log.exception( + f"Failed to convert {input_format} to {output_format} for {repr(truncated)}..." + ) + content_dict = {"start": [1], "end": [2], "text": ["An error occured obtaining the transcript."]} if output_format == 'txt': - text = json.loads(content_str)['text'] + text = content_dict['text'] text_without_none = [line if line else '' for line in text] return html.unescape("\n".join(text_without_none)) - elif output_format == 'srt': - return generate_srt_from_sjson(json.loads(content_str), speed=1.0) + return generate_srt_from_sjson(content_dict, speed=1.0) @staticmethod def asset(location, subs_id, lang='en', filename=None): diff --git a/lms/djangoapps/courseware/tests/test_video_handlers.py b/lms/djangoapps/courseware/tests/test_video_handlers.py index 0d266f5e8f..06db753d12 100644 --- a/lms/djangoapps/courseware/tests/test_video_handlers.py +++ b/lms/djangoapps/courseware/tests/test_video_handlers.py @@ -1296,15 +1296,16 @@ class TestGetTranscript(TestVideo): # lint-amnesty, pylint: disable=test-inheri assert filename == 'zh_塞.srt' assert mime_type == 'application/x-subrip; charset=utf-8' - def test_value_error(self): + def test_value_error_handled(self): good_sjson = _create_file(content='bad content') _upload_sjson_file(good_sjson, self.item.location) self.item.sub = _get_subs_id(good_sjson.name) transcripts = self.item.get_transcripts_info() # lint-amnesty, pylint: disable=unused-variable - with pytest.raises(ValueError): - get_transcript(self.item) + error_transcript = {"start": [], "end": [], "text": ["An error occured obtaining the transcript."]} + content, _, _ = get_transcript(self.item) + assert error_transcript["text"][0] in content def test_key_error(self): good_sjson = _create_file(content="""