Merge pull request #17976 from edx/mushtaq/html5_sources_transcript

Update the get_transcript util with html5_sources
2018-04-25 21:41:34 +05:00
parent d428fa96b8 b181e6bb45
commit 1ba833f3f7
3 changed files with 81 additions and 43 deletions
--- a/cms/djangoapps/contentstore/management/commands/tests/test_migrate_transcripts.py
+++ b/cms/djangoapps/contentstore/management/commands/tests/test_migrate_transcripts.py
@@ -261,10 +261,7 @@ class TestMigrateTranscripts(ModuleStoreTestCase):
             u'[Transcript migration] process for ge transcript started'),
            (LOGGER_NAME,
             'ERROR',
-             '[Transcript migration] Exception: u"SON(['
-             '(\'category\', \'asset\'), (\'name\', u\'not_found.srt\'),'
-             ' (\'course\', u\'{}\'), (\'tag\', \'c4x\'), (\'org\', u\'{}\'),'
-             ' (\'revision\', None)])"'.format(self.course_2.id.course, self.course_2.id.org)),
+             "[Transcript migration] Exception: u'No transcript for `ge` language'"),
            (LOGGER_NAME,
             'INFO',
             u'[Transcript migration] process for course {} ended. Processed 1 transcripts'.format(
@@ -272,11 +269,8 @@ class TestMigrateTranscripts(ModuleStoreTestCase):
             )),
            (LOGGER_NAME,
             'INFO',
-             "[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception SON(["
-             "('category', 'asset'), ('name', u'not_found.srt'), ('course', u'{}'),"
-             " ('tag', 'c4x'), ('org', u'{}'), ('revision', None)])".format(
-                 self.course_2.id.course, self.course_2.id.org)
-             )
+             "[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception "
+             "No transcript for `ge` language")
        )

        with LogCapture(LOGGER_NAME, level=logging.INFO) as logger:
--- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
+++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
@@ -744,7 +744,7 @@ class TestGetTranscript(SharedModuleStoreTestCase):
            edx_video_id=u'1234-5678-90'
        )

-    def create_transcript(self, subs_id, language=u'en', filename='video.srt'):
+    def create_transcript(self, subs_id, language=u'en', filename='video.srt', youtube_id_1_0='', html5_sources=None):
        """
        create transcript.
        """
@@ -752,21 +752,26 @@ class TestGetTranscript(SharedModuleStoreTestCase):
        if language != u'en':
            transcripts = {language: filename}

+        html5_sources = html5_sources or []
        self.video = ItemFactory.create(
            category='video',
            parent_location=self.vertical.location,
            sub=subs_id,
+            youtube_id_1_0=youtube_id_1_0,
            transcripts=transcripts,
-            edx_video_id=u'1234-5678-90'
+            edx_video_id=u'1234-5678-90',
+            html5_sources=html5_sources
        )

-        if subs_id:
-            transcripts_utils.save_subs_to_store(
-                self.subs_sjson,
-                subs_id,
-                self.video,
-                language=language,
-            )
+        possible_subs = [subs_id, youtube_id_1_0] + transcripts_utils.get_html5_ids(html5_sources)
+        for possible_sub in possible_subs:
+            if possible_sub:
+                transcripts_utils.save_subs_to_store(
+                    self.subs_sjson,
+                    possible_sub,
+                    self.video,
+                    language=language,
+                )

    def create_srt_file(self, content):
        """
@@ -812,31 +817,69 @@ class TestGetTranscript(SharedModuleStoreTestCase):
            )

    @ddt.data(
+        # video.sub transcript
        {
            'language': u'en',
            'subs_id': 'video_101',
-            'filename': 'en_video_101.srt',
+            'youtube_id_1_0': '',
+            'html5_sources': [],
+            'expected_filename': 'en_video_101.srt',
        },
+        # if video.sub is present, rest will be skipped.
+        {
+            'language': u'en',
+            'subs_id': 'video_101',
+            'youtube_id_1_0': 'test_yt_id',
+            'html5_sources': ['www.abc.com/foo.mp4'],
+            'expected_filename': 'en_video_101.srt',
+        },
+        # video.youtube_id_1_0 transcript
+        {
+            'language': u'en',
+            'subs_id': '',
+            'youtube_id_1_0': 'test_yt_id',
+            'html5_sources': [],
+            'expected_filename': 'en_test_yt_id.srt',
+        },
+        # video.html5_sources transcript
+        {
+            'language': u'en',
+            'subs_id': '',
+            'youtube_id_1_0': '',
+            'html5_sources': ['www.abc.com/foo.mp4'],
+            'expected_filename': 'en_foo.srt',
+        },
+        # non-english transcript
        {
            'language': u'ur',
            'subs_id': '',
-            'filename': 'ur_video_101.srt',
+            'youtube_id_1_0': '',
+            'html5_sources': [],
+            'expected_filename': 'ur_video_101.srt',
        },
    )
    @ddt.unpack
-    def test_get_transcript_from_content_store(self, language, subs_id, filename):
+    def test_get_transcript_from_contentstore(
+        self,
+        language,
+        subs_id,
+        youtube_id_1_0,
+        html5_sources,
+        expected_filename
+    ):
        """
        Verify that `get_transcript` function returns correct data when transcript is in content store.
        """
-        self.upload_file(self.create_srt_file(self.subs_srt), self.video.location, filename)
-        self.create_transcript(subs_id, language, filename)
-        content, filename, mimetype = transcripts_utils.get_transcript(
+        base_filename = 'video_101.srt'
+        self.upload_file(self.create_srt_file(self.subs_srt), self.video.location, base_filename)
+        self.create_transcript(subs_id, language, base_filename, youtube_id_1_0, html5_sources)
+        content, file_name, mimetype = transcripts_utils.get_transcript(
            self.video,
            language
        )

        self.assertEqual(content, self.subs[language])
-        self.assertEqual(filename, filename)
+        self.assertEqual(file_name, expected_filename)
        self.assertEqual(mimetype, self.srt_mime_type)

    def test_get_transcript_from_content_store_for_ur(self):
--- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
+++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
@@ -859,7 +859,7 @@ def get_transcript_from_val(edx_video_id, lang=None, output_format=Transcript.SR
    """
    Get video transcript from edx-val.
    Arguments:
-        edx_video_id (unicode): course identifier
+        edx_video_id (unicode): video identifier
        lang (unicode): transcript language
        output_format (unicode): transcript output format
    Returns:
@@ -923,6 +923,7 @@ def get_transcript_from_contentstore(video, language, output_format, transcripts
    Returns:
        tuple containing content, filename, mimetype
    """
+    input_format, base_name, transcript_content = None, None, None
    if output_format not in (Transcript.SRT, Transcript.SJSON, Transcript.TXT):
        raise NotFoundError('Invalid transcript format `{output_format}`'.format(output_format=output_format))

@@ -930,24 +931,24 @@ def get_transcript_from_contentstore(video, language, output_format, transcripts
    transcripts = dict(other_languages)

    # this is sent in case of a translation dispatch and we need to use it as our subs_id.
-    if youtube_id:
-        transcripts['en'] = youtube_id
-    elif sub:
-        transcripts['en'] = sub
-    elif video.youtube_id_1_0:
-        transcripts['en'] = video.youtube_id_1_0
-    elif language == u'en':
-        raise NotFoundError('No transcript for `en` language')
+    possible_sub_ids = [youtube_id, sub, video.youtube_id_1_0] + get_html5_ids(video.html5_sources)
+    for sub_id in possible_sub_ids:
+        try:
+            transcripts[u'en'] = sub_id
+            input_format, base_name, transcript_content = get_transcript_for_video(
+                video.location,
+                subs_id=sub_id,
+                file_name=transcripts[language],
+                language=language
+            )
+            break
+        except (KeyError, NotFoundError):
+            continue

-    try:
-        input_format, base_name, transcript_content = get_transcript_for_video(
-            video.location,
-            subs_id=transcripts.get('en'),
-            file_name=transcripts[language],
-            language=language
-        )
-    except KeyError:
-        raise NotFoundError
+    if transcript_content is None:
+        raise NotFoundError('No transcript for `{lang}` language'.format(
+            lang=language
+        ))

    # add language prefix to transcript file only if language is not None
    language_prefix = '{}_'.format(language) if language else ''