Merge pull request #17605 from edx/iahmad/migrate_transcripts_S3
Iahmad/migrate transcripts s3
This commit is contained in:
@@ -0,0 +1,124 @@
|
||||
"""
|
||||
Command to migrate transcripts to django storage.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from django.core.management import BaseCommand, CommandError
|
||||
from opaque_keys import InvalidKeyError
|
||||
from opaque_keys.edx.keys import CourseKey
|
||||
from opaque_keys.edx.locator import CourseLocator
|
||||
from cms.djangoapps.contentstore.tasks import (
|
||||
DEFAULT_ALL_COURSES,
|
||||
DEFAULT_FORCE_UPDATE,
|
||||
DEFAULT_COMMIT,
|
||||
enqueue_async_migrate_transcripts_tasks
|
||||
)
|
||||
from openedx.core.lib.command_utils import get_mutually_exclusive_required_option, parse_course_keys
|
||||
from openedx.core.djangoapps.video_config.models import TranscriptMigrationSetting
|
||||
from xmodule.modulestore.django import modulestore
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """
    Management command that migrates video transcripts to django storage.

    Exactly one course-selection option (--course-id, --all-courses or
    --from-settings) must be supplied.

    Example usage:
        $ ./manage.py cms migrate_transcripts --all-courses --force-update --commit
        $ ./manage.py cms migrate_transcripts --course-id 'Course1' --course-id 'Course2' --commit
        $ ./manage.py cms migrate_transcripts --from-settings
    """
    args = '<course_id course_id ...>'
    help = 'Migrates transcripts to S3 for one or more courses.'

    def add_arguments(self, parser):
        """
        Add arguments to the command parser.
        """
        parser.add_argument(
            '--course-id', '--course_id',
            dest='course_ids',
            action='append',
            help=u'Migrates transcripts for the list of courses.'
        )
        parser.add_argument(
            '--all-courses', '--all', '--all_courses',
            dest='all_courses',
            action='store_true',
            default=DEFAULT_ALL_COURSES,
            help=u'Migrates transcripts to the configured django storage for all courses.'
        )
        parser.add_argument(
            '--from-settings', '--from_settings',
            dest='from_settings',
            help='Migrate Transcripts with settings set via django admin',
            action='store_true',
            default=False,
        )
        parser.add_argument(
            '--force-update', '--force_update',
            dest='force_update',
            action='store_true',
            default=DEFAULT_FORCE_UPDATE,
            help=u'Force migrate transcripts for the requested courses, overwrite if already present.'
        )
        parser.add_argument(
            '--commit',
            dest='commit',
            action='store_true',
            default=DEFAULT_COMMIT,
            help=u'Commits the discovered video transcripts to django storage. '
                 u'Without this flag, the command will return the transcripts discovered for migration.'
        )

    def _parse_course_key(self, raw_value):
        """
        Parse a course key from its string form.

        Raises:
            CommandError: if `raw_value` is not a valid key, or parses to a
                key that is not a CourseLocator.
        """
        try:
            result = CourseKey.from_string(raw_value)
        except InvalidKeyError:
            raise CommandError("Invalid course_key: '%s'." % raw_value)

        if not isinstance(result, CourseLocator):
            raise CommandError(u"Argument {0} is not a course key".format(raw_value))

        return result

    def _all_course_keys(self):
        """
        Return the keys of every course summary known to the modulestore.
        """
        return [course.id for course in modulestore().get_course_summaries()]

    def _get_migration_options(self, options):
        """
        Resolve which courses to migrate and with which flags.

        With --course-id or --all-courses the flags come from the command
        line; with --from-settings the course selection and both flags come
        from the current TranscriptMigrationSetting.

        Returns:
            tuple: (course_keys, force_update, commit)
        """
        force_update = options['force_update']
        commit = options['commit']
        courses_mode = get_mutually_exclusive_required_option(options, 'course_ids', 'all_courses', 'from_settings')
        if courses_mode == 'all_courses':
            course_keys = self._all_course_keys()
        elif courses_mode == 'course_ids':
            course_keys = [self._parse_course_key(raw_key) for raw_key in options['course_ids']]
        else:
            # Read the configuration once so that the course selection and the
            # flags are all taken from the same settings row.
            migration_settings = self._latest_settings()
            if migration_settings.all_courses:
                course_keys = self._all_course_keys()
            else:
                course_keys = parse_course_keys(migration_settings.course_ids.split())
            force_update = migration_settings.force_update
            commit = migration_settings.commit

        return course_keys, force_update, commit

    def _latest_settings(self):
        """
        Return the latest version of the TranscriptMigrationSetting
        """
        return TranscriptMigrationSetting.current()

    def handle(self, *args, **options):
        """
        Invokes the migrate transcripts enqueue function.

        Raises:
            CommandError: if the options are invalid or enqueueing reports an
                invalid course key.
        """
        course_keys, force_update, commit = self._get_migration_options(options)
        kwargs = {'force_update': force_update, 'commit': commit}
        try:
            enqueue_async_migrate_transcripts_tasks(
                course_keys,
                **kwargs
            )
        except InvalidKeyError as exc:
            raise CommandError(u'Invalid course key: ' + unicode(exc))
|
||||
@@ -0,0 +1,286 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for course transcript migration management command.
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import pytz
|
||||
from django.test import TestCase
|
||||
from django.core.management import call_command, CommandError
|
||||
from xmodule.modulestore.django import modulestore
|
||||
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
|
||||
from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory
|
||||
from xmodule.video_module.transcripts_utils import save_to_store
|
||||
from edxval import api as api
|
||||
from testfixtures import LogCapture
|
||||
|
||||
# Name of the logger captured by the logging tests in this module.
LOGGER_NAME = "cms.djangoapps.contentstore.tasks"

# SRT fixture stored in the contentstore as 'subs_grmtran1.srt' by the test setUp.
SRT_FILEDATA = '''
0
00:00:00,270 --> 00:00:02,720
sprechen sie deutsch?

1
00:00:02,720 --> 00:00:05,430
Ja, ich spreche Deutsch

2
00:00:6,500 --> 00:00:08,600
可以用“我不太懂艺术 但我知道我喜欢什么”做比喻
'''

# SRT fixture stored in the contentstore as 'subs_croatian1.srt' by the test setUp.
CRO_SRT_FILEDATA = '''
0
00:00:00,270 --> 00:00:02,720
Dobar dan!

1
00:00:02,720 --> 00:00:05,430
Kako ste danas?

2
00:00:6,500 --> 00:00:08,600
可以用“我不太懂艺术 但我知道我喜欢什么”做比喻
'''


# Sample video metadata. NOTE(review): not referenced by the tests visible in
# this module - confirm whether it is still needed.
VIDEO_DICT_STAR = dict(
    client_video_id='TWINKLE TWINKLE',
    duration=42.0,
    edx_video_id='test_edx_video_id',
    status='upload',
)
|
||||
|
||||
|
||||
class TestArgParsing(TestCase):
    """
    Argument-validation tests for the `migrate_transcripts` management command.
    """
    def test_no_args(self):
        """ The command refuses to run without a course-selection option. """
        expected_message = "Must specify exactly one of --course_ids, --all_courses, --from_settings"
        with self.assertRaisesRegexp(CommandError, expected_message):
            call_command('migrate_transcripts')

    def test_invalid_course(self):
        """ A malformed course id is reported as a CommandError. """
        expected_message = "Invalid course_key: 'invalid-course'."
        command_args = ('migrate_transcripts', '--course-id', 'invalid-course')
        with self.assertRaisesRegexp(CommandError, expected_message):
            call_command(*command_args)
|
||||
|
||||
|
||||
class TestMigrateTranscripts(ModuleStoreTestCase):
    """
    Tests migrating video transcripts in courses from contentstore to django storage
    """
    def setUp(self):
        """
        Create two courses with one video each.

        The first video references two transcripts that are saved to the
        contentstore; the second references a transcript file that is never saved.
        """
        super(TestMigrateTranscripts, self).setUp()
        self.store = modulestore()
        self.course = CourseFactory.create()
        self.course_2 = CourseFactory.create()

        api.create_video({
            'edx_video_id': 'test_edx_video_id',
            'client_video_id': 'test1.mp4',
            'duration': 42.0,
            'status': 'upload',
            'courses': [unicode(self.course.id)],
            'encoded_videos': [],
            'created': datetime.now(pytz.utc)
        })

        video_sample_xml = '''
            <video display_name="Test Video"
                   edx_video_id="test_edx_video_id"
                   youtube="1.0:p2Q6BrNhdh8,0.75:izygArpw-Qo,1.25:1EeWXzPdhSA,1.5:rABDYkeK0x8"
                   show_captions="false"
                   download_track="false"
                   start_time="00:00:01"
                   download_video="false"
                   end_time="00:01:00">
              <source src="http://www.example.com/source.mp4"/>
              <track src="http://www.example.com/track"/>
              <handout src="http://www.example.com/handout"/>
              <transcript language="ge" src="subs_grmtran1.srt" />
              <transcript language="hr" src="subs_croatian1.srt" />
            </video>
        '''

        video_sample_xml_2 = '''
            <video display_name="Test Video 2"
                   edx_video_id="test_edx_video_id_2"
                   youtube="1.0:p2Q6BrNhdh8,0.75:izygArpw-Qo,1.25:1EeWXzPdhSA,1.5:rABDYkeK0x8"
                   show_captions="false"
                   download_track="false"
                   start_time="00:00:01"
                   download_video="false"
                   end_time="00:01:00">
              <source src="http://www.example.com/source.mp4"/>
              <track src="http://www.example.com/track"/>
              <handout src="http://www.example.com/handout"/>
              <transcript language="ge" src="not_found.srt" />
            </video>
        '''
        self.video_descriptor = ItemFactory.create(
            parent_location=self.course.location, category='video',
            data={'data': video_sample_xml}
        )
        self.video_descriptor_2 = ItemFactory.create(
            parent_location=self.course_2.location, category='video',
            data={'data': video_sample_xml_2}
        )

        first_video_location = self.video_descriptor.location
        save_to_store(SRT_FILEDATA, 'subs_grmtran1.srt', 'text/srt', first_video_location)
        save_to_store(CRO_SRT_FILEDATA, 'subs_croatian1.srt', 'text/srt', first_video_location)

    def _migrate(self, course, *flags):
        """ Run the migrate_transcripts command for a single course with extra flags. """
        call_command('migrate_transcripts', '--course-id', unicode(course.id), *flags)

    def _assert_language_count(self, expected_count):
        """ Assert how many transcript languages edxval reports for the first video. """
        languages = api.get_available_transcript_languages(self.video_descriptor.edx_video_id)
        self.assertEqual(len(languages), expected_count)

    def _assert_transcripts_available(self, expected):
        """ Assert that the 'hr' and 'ge' transcripts of the first video are (un)available. """
        check = self.assertTrue if expected else self.assertFalse
        for language in ('hr', 'ge'):
            check(api.is_transcript_available(self.video_descriptor.edx_video_id, language))

    def _assert_descriptor_translations(self):
        """ Assert that the first video descriptor lists both contentstore translations. """
        translations = self.video_descriptor.available_translations(self.video_descriptor.get_transcripts_info())
        self.assertItemsEqual(translations, ['hr', 'ge'])

    def test_migrated_transcripts_count_with_commit(self):
        """
        Test migrating transcripts with commit
        """
        # nothing has been migrated yet
        self._assert_language_count(0)
        self._assert_transcripts_available(False)

        # after a committed run both transcripts are available
        self._migrate(self.course, '--commit')

        self._assert_language_count(2)
        self._assert_transcripts_available(True)

    def test_migrated_transcripts_without_commit(self):
        """
        Test migrating transcripts as a dry-run
        """
        # nothing has been migrated yet
        self._assert_language_count(0)
        self._assert_transcripts_available(False)

        # a run without --commit must not store anything
        self._migrate(self.course)

        self._assert_language_count(0)
        self._assert_transcripts_available(False)

    def test_migrate_transcripts_availability(self):
        """
        Test migrating transcripts
        """
        self._assert_descriptor_translations()
        self._assert_transcripts_available(False)

        # after a committed run both transcripts are available
        self._migrate(self.course, '--commit')

        self._assert_transcripts_available(True)

    def test_migrate_transcripts_idempotency(self):
        """
        Test migrating transcripts multiple times
        """
        self._assert_descriptor_translations()
        self._assert_transcripts_available(False)

        # first committed run
        self._migrate(self.course, '--commit')
        self._assert_transcripts_available(True)

        # a second committed run leaves the transcripts available
        self._migrate(self.course, '--commit')
        self._assert_transcripts_available(True)

        # and so does a run that forces an overwrite
        self._migrate(self.course, '--force-update', '--commit')
        self._assert_transcripts_available(True)

    def test_migrate_transcripts_logging(self):
        """
        Test migrate transcripts logging and output
        """
        course_id = unicode(self.course.id)
        expected_log = (
            (
                LOGGER_NAME,
                'INFO',
                u'[Transcript migration] process for course {} started. Migrating 1 videos'.format(course_id),
            ),
            (
                LOGGER_NAME,
                'INFO',
                '[Transcript migration] Migrating 2 transcripts',
            ),
            (
                LOGGER_NAME,
                'INFO',
                u'[Transcript migration] process for course {} ended. Processed 2 transcripts'.format(course_id),
            ),
            (
                LOGGER_NAME,
                'INFO',
                '[Transcript migration] Result: Language hr transcript of video test_edx_video_id will be migrated'
                '\nLanguage ge transcript of video test_edx_video_id will be migrated',
            ),
        )

        with LogCapture(LOGGER_NAME, level=logging.INFO) as logger:
            self._migrate(self.course)
            logger.check(*expected_log)

    def test_migrate_transcripts_exception_logging(self):
        """
        Test migrate transcripts exception logging
        """
        course_key = self.course_2.id
        course_id = unicode(course_key)
        expected_log = (
            (
                LOGGER_NAME,
                'INFO',
                u'[Transcript migration] process for course {} started. Migrating 1 videos'.format(course_id),
            ),
            (
                LOGGER_NAME,
                'INFO',
                '[Transcript migration] Migrating 1 transcripts',
            ),
            (
                LOGGER_NAME,
                'INFO',
                u'[Transcript migration] process for ge transcript started',
            ),
            (
                LOGGER_NAME,
                'ERROR',
                '[Transcript migration] Exception: u"SON(['
                '(\'category\', \'asset\'), (\'name\', u\'not_found.srt\'),'
                ' (\'course\', u\'{}\'), (\'tag\', \'c4x\'), (\'org\', u\'{}\'),'
                ' (\'revision\', None)])"'.format(course_key.course, course_key.org),
            ),
            (
                LOGGER_NAME,
                'INFO',
                u'[Transcript migration] process for course {} ended. Processed 1 transcripts'.format(course_id),
            ),
            (
                LOGGER_NAME,
                'INFO',
                "[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception SON(["
                "('category', 'asset'), ('name', u'not_found.srt'), ('course', u'{}'),"
                " ('tag', 'c4x'), ('org', u'{}'), ('revision', None)])".format(course_key.course, course_key.org),
            ),
        )

        with LogCapture(LOGGER_NAME, level=logging.INFO) as logger:
            self._migrate(self.course_2, '--commit')
            logger.check(*expected_log)
|
||||
@@ -13,10 +13,14 @@ from tempfile import NamedTemporaryFile, mkdtemp
|
||||
|
||||
from celery.task import task
|
||||
from celery.utils.log import get_task_logger
|
||||
from celery_utils.chordable_django_backend import chord, chord_task
|
||||
from celery_utils.persist_on_failure import LoggedPersistOnFailureTask
|
||||
from django.conf import settings
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.exceptions import SuspiciousOperation
|
||||
from django.core.files import File
|
||||
from django.core.files.base import ContentFile
|
||||
from django.test import RequestFactory
|
||||
from django.utils.text import get_valid_filename
|
||||
from django.utils.translation import ugettext as _
|
||||
@@ -47,10 +51,244 @@ from xmodule.modulestore.django import modulestore
|
||||
from xmodule.modulestore.exceptions import DuplicateCourseError, ItemNotFoundError
|
||||
from xmodule.modulestore.xml_exporter import export_course_to_xml, export_library_to_xml
|
||||
from xmodule.modulestore.xml_importer import import_course_from_xml, import_library_from_xml
|
||||
from xmodule.video_module.transcripts_utils import (
|
||||
Transcript,
|
||||
clean_video_id,
|
||||
get_transcript_from_contentstore,
|
||||
TranscriptsGenerationException
|
||||
)
|
||||
from xmodule.modulestore import ModuleStoreEnum
|
||||
from xmodule.exceptions import NotFoundError
|
||||
from edxval.api import (
|
||||
ValCannotCreateError,
|
||||
create_video_transcript,
|
||||
is_transcript_available,
|
||||
create_or_update_video_transcript,
|
||||
create_external_video,
|
||||
)
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
LOGGER = get_task_logger(__name__)
|
||||
FILE_READ_CHUNK = 1024 # bytes
|
||||
FULL_COURSE_REINDEX_THRESHOLD = 1
|
||||
# Defaults shared by the migrate_transcripts command options and the
# transcript migration tasks below.
DEFAULT_ALL_COURSES = False
DEFAULT_FORCE_UPDATE = False
DEFAULT_COMMIT = False

# Celery options for the transcript migration tasks: the default retry delay,
# and the time limits of the per-course task and the per-transcript subtask.
RETRY_DELAY_SECONDS = 30
COURSE_LEVEL_TIMEOUT_SECONDS = 1200
VIDEO_LEVEL_TIMEOUT_SECONDS = 300
|
||||
|
||||
|
||||
def enqueue_async_migrate_transcripts_tasks(
        course_keys,
        force_update=DEFAULT_FORCE_UPDATE,
        commit=DEFAULT_COMMIT
):
    """
    Fires one transcript migration Celery task per input course and logs
    the result reported for each course.

    Arguments:
        course_keys: Course ids as an iterable of CourseKey objects,
        force_update: Overwrite file in S3. Default is False,
        commit: Update S3 or dry-run the command to see which transcripts will be affected. Default is False.
    """
    kwargs = {
        'force_update': force_update,
        'commit': commit
    }

    tasks = [
        async_migrate_transcript.s(
            unicode(course_key),
            **kwargs
        ) for course_key in course_keys
    ]
    callback = task_status_callback.s()
    status = chord(tasks)(callback)
    for res in status.get():
        # A course task reports either a list of per-transcript messages or,
        # when the course itself could not be processed, one message string.
        # Joining a bare string would put a newline between every character.
        if isinstance(res, (str, text_type)):
            res = [res]
        LOGGER.info("[Transcript migration] Result: %s", '\n'.join(res))
|
||||
|
||||
|
||||
@chord_task
def task_status_callback(results):
    """
    Chord callback: hand back the collected task results unchanged.

    Arguments:
        results: the results gathered from the tasks of the chord
    """
    return results
|
||||
|
||||
|
||||
@chord_task(
    bind=True,
    base=LoggedPersistOnFailureTask,
    default_retry_delay=RETRY_DELAY_SECONDS,
    max_retries=1,
    time_limit=COURSE_LEVEL_TIMEOUT_SECONDS
)
def async_migrate_transcript(self, course_key, **kwargs):
    #pylint: disable=unused-argument
    """
    Migrates the transcripts of all videos in a course as a new celery task.

    Arguments:
        course_key: the course id as a string
        kwargs: must contain 'force_update'; all kwargs are forwarded to the
            per-transcript subtasks

    Returns:
        The collected subtask results, or a single 'Failed: ...' string when
        the course cannot be found.
    """
    try:
        # Parse the key once; it is reused for the video lookup below.
        parsed_course_key = CourseKey.from_string(course_key)
        if not modulestore().get_course(parsed_course_key):
            raise KeyError(u'Invalid course key: ' + unicode(course_key))
    except KeyError as exc:
        LOGGER.exception('[Transcript migration] Exception: %r', text_type(exc))
        return 'Failed: course {course_key} with exception {exception}'.format(
            course_key=course_key,
            exception=text_type(exc)
        )
    force_update = kwargs['force_update']
    sub_tasks = []

    all_videos = get_videos_from_store(parsed_course_key)
    LOGGER.info(
        "[Transcript migration] process for course %s started. Migrating %s videos",
        course_key,
        len(all_videos)
    )
    for video in all_videos:
        # NOTE(review): this updates the video's own `transcripts` mapping in
        # place, so the subtasks receive the video with the 'en' entry added.
        all_lang_transcripts = video.transcripts
        english_transcript = video.sub
        if english_transcript:
            all_lang_transcripts.update({'en': video.sub})
        for lang in all_lang_transcripts:
            transcript_already_present = is_transcript_available(
                clean_video_id(video.edx_video_id),
                lang
            )
            # Missing transcripts are always migrated; existing ones only when
            # an overwrite is forced. The third positional argument tells the
            # subtask whether it has to overwrite.
            if force_update or not transcript_already_present:
                sub_tasks.append(async_migrate_transcript_subtask.s(
                    video, lang, transcript_already_present, **kwargs
                ))
    LOGGER.info("[Transcript migration] Migrating %s transcripts", len(sub_tasks))
    callback = task_status_callback.s()
    status = chord(sub_tasks)(callback)
    # Fetch the chord result once and reuse it for the log line and the return value.
    results = status.get()
    LOGGER.info(
        "[Transcript migration] process for course %s ended. Processed %s transcripts",
        course_key,
        len(results)
    )
    return results
|
||||
|
||||
|
||||
def get_videos_from_store(course_key):
    """
    Collect the video items of a course into one list.

    The published-only items come first, followed by the draft-only items.

    Arguments:
        course_key: CourseKey object
    """
    store = modulestore()
    revisions = (
        ModuleStoreEnum.RevisionOption.published_only,
        ModuleStoreEnum.RevisionOption.draft_only,
    )
    videos = []
    for revision in revisions:
        videos.extend(
            store.get_items(
                course_key,
                qualifiers={'category': 'video'},
                revision=revision,
                include_orphans=False
            )
        )
    return videos
|
||||
|
||||
|
||||
@chord_task(
    bind=True,
    base=LoggedPersistOnFailureTask,
    default_retry_delay=RETRY_DELAY_SECONDS,
    max_retries=2,
    time_limit=VIDEO_LEVEL_TIMEOUT_SECONDS
)
def async_migrate_transcript_subtask(self, *args, **kwargs):
    #pylint: disable=unused-argument
    """
    Migrates a transcript of a given video in a course as a new celery task.

    Arguments:
        args: (video, language_code, force_update)
        kwargs: must contain 'commit'; unless it is True the task only
            reports what would be migrated.

    Returns:
        A human-readable status message for this transcript.
    """
    video, language_code, force_update = args
    commit = kwargs['commit']
    result = None
    if commit is not True:
        # Dry run: report the transcript without touching storage.
        return 'Language {0} transcript of video {1} will be migrated'.format(
            language_code,
            video.edx_video_id
        )
    LOGGER.info("[Transcript migration] process for %s transcript started", language_code)
    try:
        transcript_info = video.get_transcripts_info()
        transcript_content, _, _ = get_transcript_from_contentstore(
            video, language_code, Transcript.SJSON, transcript_info)
        edx_video_id = clean_video_id(video.edx_video_id)

        if not edx_video_id:
            # The video has no usable id: create an external video, record its
            # id on the descriptor and use that id for the transcript. Without
            # updating the local id the transcript would never be saved.
            edx_video_id = create_external_video('external-video')
            video.edx_video_id = edx_video_id
            # NOTE(review): relies on a user named 'staff' existing.
            video.save_with_metadata(user=User.objects.get(username='staff'))
        if edx_video_id:
            result = save_transcript_to_storage(
                edx_video_id,
                language_code,
                transcript_content,
                Transcript.SJSON,
                force_update
            )
    except (NotFoundError, TranscriptsGenerationException, ValCannotCreateError) as exc:
        LOGGER.exception('[Transcript migration] Exception: %r', text_type(exc))
        return 'Failed: language {language} of video {video} with exception {exception}'.format(
            language=language_code,
            video=video.edx_video_id,
            exception=text_type(exc)
        )
    LOGGER.info("[Transcript migration] process for %s transcript ended", language_code)
    if result is not None:
        return 'Success: language {0} of video {1}'.format(language_code, video.edx_video_id)
    else:
        return 'Failed: language {0} of video {1}'.format(language_code, video.edx_video_id)
|
||||
|
||||
|
||||
def save_transcript_to_storage(
        edx_video_id,
        language_code,
        transcript_content,
        file_format=Transcript.SJSON,
        force_update=False
):
    """
    Store one transcript's content for a video through the edxval API.

    With `force_update` the transcript is created or updated, otherwise it is
    only created. A ValCannotCreateError is logged and re-raised.

    Returns:
        Whatever the edxval call returned.
    """
    try:
        video_id = clean_video_id(edx_video_id)
        if not force_update:
            outcome = create_video_transcript(
                video_id,
                language_code,
                file_format,
                ContentFile(transcript_content)
            )
            LOGGER.info(
                "[Transcript migration] save_transcript_to_storage %s for %s with create method",
                outcome,
                video_id
            )
            return outcome

        outcome = create_or_update_video_transcript(
            video_id,
            language_code,
            {'file_format': file_format},
            ContentFile(transcript_content)
        )
        LOGGER.info(
            "[Transcript migration] save_transcript_to_storage %s for %s with create_or_update method",
            bool(outcome),
            video_id
        )
        return outcome
    except ValCannotCreateError as err:
        LOGGER.exception("[Transcript migration] save_transcript_to_storage_failed: %s", err)
        raise
|
||||
|
||||
|
||||
def clone_instance(instance, field_values):
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """
    Creates the TranscriptMigrationSetting model.
    """

    # Depends on the (swappable) user model, which `changed_by` points at, and
    # on migration 0002 of the video_config app.
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ('video_config', '0002_coursevideotranscriptenabledflag_videotranscriptenabledflag'),
    ]

    operations = [
        migrations.CreateModel(
            name='TranscriptMigrationSetting',
            fields=[
                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
                ('change_date', models.DateTimeField(auto_now_add=True, verbose_name='Change date')),
                ('enabled', models.BooleanField(default=False, verbose_name='Enabled')),
                ('force_update', models.BooleanField(default=False, help_text=b'Flag to force migrate transcripts for the requested courses, overwrite if already present.')),
                ('commit', models.BooleanField(default=False, help_text=b'Dry-run or commit.')),
                ('all_courses', models.BooleanField(default=False, help_text=b'Process all courses.')),
                ('course_ids', models.TextField(help_text=b'Whitespace-separated list of course keys for which to migrate transcripts.')),
                ('changed_by', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, editable=False, to=settings.AUTH_USER_MODEL, null=True, verbose_name='Changed by')),
            ],
            options={
                # Newest configuration row first.
                'ordering': ('-change_date',),
                'abstract': False,
            },
        ),
    ]
|
||||
@@ -2,7 +2,7 @@
|
||||
Configuration models for Video XModule
|
||||
"""
|
||||
from config_models.models import ConfigurationModel
|
||||
from django.db.models import BooleanField
|
||||
from django.db.models import BooleanField, TextField
|
||||
from opaque_keys.edx.django.models import CourseKeyField
|
||||
|
||||
|
||||
@@ -130,3 +130,34 @@ class CourseVideoTranscriptEnabledFlag(ConfigurationModel):
|
||||
course_key=unicode(self.course_id),
|
||||
not_enabled=not_en
|
||||
)
|
||||
|
||||
|
||||
class TranscriptMigrationSetting(ConfigurationModel):
    """
    Stored arguments for the transcript migration management command.
    """
    force_update = BooleanField(
        default=False,
        help_text="Flag to force migrate transcripts for the requested courses, overwrite if already present."
    )
    commit = BooleanField(
        default=False,
        help_text="Dry-run or commit."
    )
    all_courses = BooleanField(
        default=False,
        help_text="Process all courses."
    )
    course_ids = TextField(
        blank=False,
        help_text="Whitespace-separated list of course keys for which to migrate transcripts."
    )

    def __unicode__(self):
        # Show 'ALL' instead of the course list when every course is selected.
        selected_courses = 'ALL' if self.all_courses else self.course_ids
        template = (
            "[TranscriptMigrationSetting] Courses {courses} with update if already present as {force}"
            " and commit as {commit}"
        )
        return template.format(
            courses=selected_courses,
            force=self.force_update,
            commit=self.commit
        )
|
||||
|
||||
Reference in New Issue
Block a user