Merge pull request #17976 from edx/mushtaq/html5_sources_transcript

Update the get_transcript util with html5_sources
This commit is contained in:
Mushtaq Ali
2018-04-25 21:41:34 +05:00
committed by GitHub
3 changed files with 81 additions and 43 deletions

View File

@@ -261,10 +261,7 @@ class TestMigrateTranscripts(ModuleStoreTestCase):
u'[Transcript migration] process for ge transcript started'),
(LOGGER_NAME,
'ERROR',
'[Transcript migration] Exception: u"SON(['
'(\'category\', \'asset\'), (\'name\', u\'not_found.srt\'),'
' (\'course\', u\'{}\'), (\'tag\', \'c4x\'), (\'org\', u\'{}\'),'
' (\'revision\', None)])"'.format(self.course_2.id.course, self.course_2.id.org)),
"[Transcript migration] Exception: u'No transcript for `ge` language'"),
(LOGGER_NAME,
'INFO',
u'[Transcript migration] process for course {} ended. Processed 1 transcripts'.format(
@@ -272,11 +269,8 @@ class TestMigrateTranscripts(ModuleStoreTestCase):
)),
(LOGGER_NAME,
'INFO',
"[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception SON(["
"('category', 'asset'), ('name', u'not_found.srt'), ('course', u'{}'),"
" ('tag', 'c4x'), ('org', u'{}'), ('revision', None)])".format(
self.course_2.id.course, self.course_2.id.org)
)
"[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception "
"No transcript for `ge` language")
)
with LogCapture(LOGGER_NAME, level=logging.INFO) as logger:

View File

@@ -744,7 +744,7 @@ class TestGetTranscript(SharedModuleStoreTestCase):
edx_video_id=u'1234-5678-90'
)
def create_transcript(self, subs_id, language=u'en', filename='video.srt'):
def create_transcript(self, subs_id, language=u'en', filename='video.srt', youtube_id_1_0='', html5_sources=None):
"""
create transcript.
"""
@@ -752,21 +752,26 @@ class TestGetTranscript(SharedModuleStoreTestCase):
if language != u'en':
transcripts = {language: filename}
html5_sources = html5_sources or []
self.video = ItemFactory.create(
category='video',
parent_location=self.vertical.location,
sub=subs_id,
youtube_id_1_0=youtube_id_1_0,
transcripts=transcripts,
edx_video_id=u'1234-5678-90'
edx_video_id=u'1234-5678-90',
html5_sources=html5_sources
)
if subs_id:
transcripts_utils.save_subs_to_store(
self.subs_sjson,
subs_id,
self.video,
language=language,
)
possible_subs = [subs_id, youtube_id_1_0] + transcripts_utils.get_html5_ids(html5_sources)
for possible_sub in possible_subs:
if possible_sub:
transcripts_utils.save_subs_to_store(
self.subs_sjson,
possible_sub,
self.video,
language=language,
)
def create_srt_file(self, content):
"""
@@ -812,31 +817,69 @@ class TestGetTranscript(SharedModuleStoreTestCase):
)
@ddt.data(
# video.sub transcript
{
'language': u'en',
'subs_id': 'video_101',
'filename': 'en_video_101.srt',
'youtube_id_1_0': '',
'html5_sources': [],
'expected_filename': 'en_video_101.srt',
},
# if video.sub is present, rest will be skipped.
{
'language': u'en',
'subs_id': 'video_101',
'youtube_id_1_0': 'test_yt_id',
'html5_sources': ['www.abc.com/foo.mp4'],
'expected_filename': 'en_video_101.srt',
},
# video.youtube_id_1_0 transcript
{
'language': u'en',
'subs_id': '',
'youtube_id_1_0': 'test_yt_id',
'html5_sources': [],
'expected_filename': 'en_test_yt_id.srt',
},
# video.html5_sources transcript
{
'language': u'en',
'subs_id': '',
'youtube_id_1_0': '',
'html5_sources': ['www.abc.com/foo.mp4'],
'expected_filename': 'en_foo.srt',
},
# non-english transcript
{
'language': u'ur',
'subs_id': '',
'filename': 'ur_video_101.srt',
'youtube_id_1_0': '',
'html5_sources': [],
'expected_filename': 'ur_video_101.srt',
},
)
@ddt.unpack
def test_get_transcript_from_content_store(self, language, subs_id, filename):
def test_get_transcript_from_contentstore(
self,
language,
subs_id,
youtube_id_1_0,
html5_sources,
expected_filename
):
"""
Verify that `get_transcript` function returns correct data when transcript is in content store.
"""
self.upload_file(self.create_srt_file(self.subs_srt), self.video.location, filename)
self.create_transcript(subs_id, language, filename)
content, filename, mimetype = transcripts_utils.get_transcript(
base_filename = 'video_101.srt'
self.upload_file(self.create_srt_file(self.subs_srt), self.video.location, base_filename)
self.create_transcript(subs_id, language, base_filename, youtube_id_1_0, html5_sources)
content, file_name, mimetype = transcripts_utils.get_transcript(
self.video,
language
)
self.assertEqual(content, self.subs[language])
self.assertEqual(filename, filename)
self.assertEqual(file_name, expected_filename)
self.assertEqual(mimetype, self.srt_mime_type)
def test_get_transcript_from_content_store_for_ur(self):

View File

@@ -859,7 +859,7 @@ def get_transcript_from_val(edx_video_id, lang=None, output_format=Transcript.SR
"""
Get video transcript from edx-val.
Arguments:
edx_video_id (unicode): course identifier
edx_video_id (unicode): video identifier
lang (unicode): transcript language
output_format (unicode): transcript output format
Returns:
@@ -923,6 +923,7 @@ def get_transcript_from_contentstore(video, language, output_format, transcripts
Returns:
tuple containing content, filename, mimetype
"""
input_format, base_name, transcript_content = None, None, None
if output_format not in (Transcript.SRT, Transcript.SJSON, Transcript.TXT):
raise NotFoundError('Invalid transcript format `{output_format}`'.format(output_format=output_format))
@@ -930,24 +931,24 @@ def get_transcript_from_contentstore(video, language, output_format, transcripts
transcripts = dict(other_languages)
# this is sent in case of a translation dispatch and we need to use it as our subs_id.
if youtube_id:
transcripts['en'] = youtube_id
elif sub:
transcripts['en'] = sub
elif video.youtube_id_1_0:
transcripts['en'] = video.youtube_id_1_0
elif language == u'en':
raise NotFoundError('No transcript for `en` language')
possible_sub_ids = [youtube_id, sub, video.youtube_id_1_0] + get_html5_ids(video.html5_sources)
for sub_id in possible_sub_ids:
try:
transcripts[u'en'] = sub_id
input_format, base_name, transcript_content = get_transcript_for_video(
video.location,
subs_id=sub_id,
file_name=transcripts[language],
language=language
)
break
except (KeyError, NotFoundError):
continue
try:
input_format, base_name, transcript_content = get_transcript_for_video(
video.location,
subs_id=transcripts.get('en'),
file_name=transcripts[language],
language=language
)
except KeyError:
raise NotFoundError
if transcript_content is None:
raise NotFoundError('No transcript for `{lang}` language'.format(
lang=language
))
# add language prefix to transcript file only if language is not None
language_prefix = '{}_'.format(language) if language else ''