From 9f734a7a5efe734ce97e20ebedb5c8abad7c1ebb Mon Sep 17 00:00:00 2001 From: Kristin Aoki <42981026+KristinAoki@users.noreply.github.com> Date: Fri, 29 Mar 2024 08:28:18 -0400 Subject: [PATCH] feat: update youtube transcript fetch to allow all languages (#34436) * feat: allow all languages * feat: add youtube transcript import functions as drf --- .../rest_api/v0/serializers/__init__.py | 2 +- .../rest_api/v0/serializers/transcripts.py | 25 +++++++++ .../contentstore/rest_api/v0/urls.py | 21 ++++++-- .../rest_api/v0/views/__init__.py | 1 + .../rest_api/v0/views/transcripts.py | 51 ++++++++++++++++++- .../tests/test_transcripts_utils.py | 9 ++-- .../views/tests/test_transcripts.py | 2 +- .../contentstore/views/transcripts_ajax.py | 24 ++++++--- cms/lib/spectacular.py | 3 +- xmodule/tests/test_transcripts_utils.py | 15 +++--- xmodule/video_block/transcripts_utils.py | 43 +++++++++++----- 11 files changed, 158 insertions(+), 38 deletions(-) diff --git a/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py b/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py index 4ca1f5f05a..e90409ca86 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py +++ b/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py @@ -4,5 +4,5 @@ Serializers for v0 contentstore API. from .advanced_settings import AdvancedSettingsFieldSerializer, CourseAdvancedSettingsSerializer from .assets import AssetSerializer from .tabs import CourseTabSerializer, CourseTabUpdateSerializer, TabIDLocatorSerializer -from .transcripts import TranscriptSerializer +from .transcripts import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer from .xblock import XblockSerializer diff --git a/cms/djangoapps/contentstore/rest_api/v0/serializers/transcripts.py b/cms/djangoapps/contentstore/rest_api/v0/serializers/transcripts.py index bf6ea1d9f3..d0cb92e906 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/serializers/transcripts.py +++ b/cms/djangoapps/contentstore/rest_api/v0/serializers/transcripts.py @@ -13,3 +13,28 @@ class TranscriptSerializer(StrictSerializer): edx_video_id = serializers.CharField() language_code = serializers.CharField(required=False, allow_null=True) new_language_code = serializers.CharField(required=False, allow_null=True) + + +class YoutubeTranscriptCheckSerializer(StrictSerializer): + """ + Strict Serializer for YouTube transcripts check + """ + html5_local = serializers.ListField( + child=serializers.CharField() + ) + html5_equal = serializers.BooleanField() + is_youtube_mode = serializers.BooleanField() + youtube_local = serializers.BooleanField() + youtube_server = serializers.BooleanField() + youtube_diff = serializers.BooleanField() + current_item_subs = serializers.ListField(required=False, allow_null=True) + status = serializers.CharField() + command = serializers.CharField() + + +class YoutubeTranscriptUploadSerializer(StrictSerializer): + """ + Strict Serializer for YouTube transcripts upload + """ + edx_video_id = serializers.CharField() + status = serializers.CharField() diff --git a/cms/djangoapps/contentstore/rest_api/v0/urls.py b/cms/djangoapps/contentstore/rest_api/v0/urls.py index 9d3f872f70..2a7e400099 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/urls.py +++ b/cms/djangoapps/contentstore/rest_api/v0/urls.py @@ -5,9 +5,16 @@ from django.urls import re_path, path from openedx.core.constants import COURSE_ID_PATTERN -from .views import AdvancedCourseSettingsView, CourseTabSettingsView, CourseTabListView, CourseTabReorderView +from .views import ( + AdvancedCourseSettingsView, + CourseTabSettingsView, + CourseTabListView, + CourseTabReorderView, + TranscriptView, + YoutubeTranscriptCheckView, + YoutubeTranscriptUploadView, +) from .views import assets -from .views import transcripts from .views import authoring_videos from .views import xblock @@ -68,7 +75,7 @@ urlpatterns = [ ), re_path( fr'^video_transcripts/{settings.COURSE_ID_PATTERN}$', - transcripts.TranscriptView.as_view(), name='cms_api_video_transcripts' + TranscriptView.as_view(), name='cms_api_video_transcripts' ), re_path( fr'^xblock/{settings.COURSE_ID_PATTERN}$', @@ -78,4 +85,12 @@ urlpatterns = [ fr'^xblock/{settings.COURSE_ID_PATTERN}/{settings.USAGE_KEY_PATTERN}$', xblock.XblockView.as_view(), name='cms_api_xblock' ), + re_path( + fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/check?$', + YoutubeTranscriptCheckView.as_view(), name='cms_api_youtube_transcripts_check' + ), + re_path( + fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/upload?$', + YoutubeTranscriptUploadView.as_view(), name='cms_api_youtube_transcripts_upload' + ), ] diff --git a/cms/djangoapps/contentstore/rest_api/v0/views/__init__.py b/cms/djangoapps/contentstore/rest_api/v0/views/__init__.py index 91f4388cd8..b6403d53e3 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/views/__init__.py +++ b/cms/djangoapps/contentstore/rest_api/v0/views/__init__.py @@ -3,3 +3,4 @@ Views for v0 contentstore API. """ from .advanced_settings import AdvancedCourseSettingsView from .tabs import CourseTabSettingsView, CourseTabListView, CourseTabReorderView +from .transcripts import TranscriptView, YoutubeTranscriptCheckView, YoutubeTranscriptUploadView diff --git a/cms/djangoapps/contentstore/rest_api/v0/views/transcripts.py b/cms/djangoapps/contentstore/rest_api/v0/views/transcripts.py index c97261c429..9a63693e12 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/views/transcripts.py +++ b/cms/djangoapps/contentstore/rest_api/v0/views/transcripts.py @@ -14,14 +14,14 @@ from openedx.core.lib.api.view_utils import DeveloperErrorViewMixin, view_auth_c from common.djangoapps.util.json_request import expect_json_in_class_view from cms.djangoapps.contentstore.api import course_author_access_required - +from cms.djangoapps.contentstore.views.transcripts_ajax import check_transcripts, replace_transcripts from cms.djangoapps.contentstore.transcript_storage_handlers import ( upload_transcript, delete_video_transcript_or_404, handle_transcript_download, ) import cms.djangoapps.contentstore.toggles as contentstore_toggles -from ..serializers import TranscriptSerializer +from ..serializers import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer from rest_framework.parsers import (MultiPartParser, FormParser) from openedx.core.lib.api.parsers import TypedFileUploadParser @@ -68,3 +68,50 @@ class TranscriptView(DeveloperErrorViewMixin, CreateAPIView, RetrieveAPIView, De """ return delete_video_transcript_or_404(request) + + +@view_auth_classes() +class YoutubeTranscriptCheckView(DeveloperErrorViewMixin, RetrieveAPIView): + """ + public rest API endpoints for the CMS API YouTube transcripts. + youtube_id: required argument, needed to authorize course authors and identify the video. + edx_video_id: required argument, needed to identify the transcript. + xblock_id: required argument, needed to identify the transcript. + """ + serializer_class = YoutubeTranscriptCheckSerializer + parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser) + + def dispatch(self, request, *args, **kwargs): + if not toggles.use_studio_content_api(): + raise Http404 + return super().dispatch(request, *args, **kwargs) + + @course_author_access_required + def retrieve(self, request, course_key_string): # pylint: disable=arguments-differ + """ + Get the status of youtube transcripts for a given youtube video + """ + return check_transcripts(request) + + +@view_auth_classes() +class YoutubeTranscriptUploadView(DeveloperErrorViewMixin, RetrieveAPIView): + """ + public rest API endpoints for the CMS API YouTube transcripts. + youtube_id: required argument, needed to authorize course authors and identify the video. + xblock_id: required argument, needed to identify the transcript. + """ + serializer_class = YoutubeTranscriptUploadSerializer + parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser) + + def dispatch(self, request, *args, **kwargs): + if not toggles.use_studio_content_api(): + raise Http404 + return super().dispatch(request, *args, **kwargs) + + @course_author_access_required + def retrieve(self, request, course_key_string): # pylint: disable=arguments-differ + """ + Get the youtube transcripts for a give youtube video and add them to video block + """ + return replace_transcripts(request) diff --git a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py index c43a6be649..15a17faa0f 100644 --- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py +++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py @@ -464,14 +464,16 @@ class TestYoutubeTranscripts(unittest.TestCase): setup_caption_responses(mock_get, 'en', 'test', track_status_code) youtube_id = 'bad_youtube_id' with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException): - transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation) + link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation) + transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation) @patch('xmodule.video_block.transcripts_utils.requests.get') def test_youtube_empty_text(self, mock_get): setup_caption_responses(mock_get, 'en', '') youtube_id = 'bad_youtube_id' with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException): - transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation) + link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation) + transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation) def test_youtube_good_result(self): caption_response_string = textwrap.dedent(""" @@ -491,7 +493,8 @@ class TestYoutubeTranscripts(unittest.TestCase): language_code = 'en' with patch('xmodule.video_block.transcripts_utils.requests.get') as mock_get: setup_caption_responses(mock_get, language_code, caption_response_string) - transcripts = transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation) + link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation) + transcripts = transcripts_utils.get_transcript_from_youtube(link['en'], youtube_id, translation) self.assertEqual(transcripts, expected_transcripts) self.assertEqual(2, len(mock_get.mock_calls)) diff --git a/cms/djangoapps/contentstore/views/tests/test_transcripts.py b/cms/djangoapps/contentstore/views/tests/test_transcripts.py index 619cf6b775..95fbaccbab 100644 --- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py +++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py @@ -628,7 +628,7 @@ class TestRenameTranscripts(BaseTranscripts): @ddt.ddt @patch( 'cms.djangoapps.contentstore.views.transcripts_ajax.download_youtube_subs', - Mock(return_value=SJSON_TRANSCRIPT_CONTENT) + Mock(return_value=[['en', SJSON_TRANSCRIPT_CONTENT]]) ) class TestReplaceTranscripts(BaseTranscripts): """ diff --git a/cms/djangoapps/contentstore/views/transcripts_ajax.py b/cms/djangoapps/contentstore/views/transcripts_ajax.py index 61ea4da615..892b76caae 100644 --- a/cms/djangoapps/contentstore/views/transcripts_ajax.py +++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py @@ -39,8 +39,9 @@ from xmodule.video_block.transcripts_utils import ( # lint-amnesty, pylint: dis get_transcript, get_transcript_for_video, get_transcript_from_val, - get_transcripts_from_youtube, - get_transcript_link_from_youtube + get_transcript_from_youtube, + get_transcript_link_from_youtube, + get_transcript_links_from_youtube, ) __all__ = [ @@ -345,13 +346,17 @@ def check_transcripts(request): # lint-amnesty, pylint: disable=too-many-statem #check youtube local and server transcripts for equality if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']: try: - youtube_server_subs = get_transcripts_from_youtube( + transcript_links = get_transcript_links_from_youtube( youtube_id, settings, item.runtime.service(item, "i18n") ) - if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality - transcripts_presence['youtube_diff'] = False + for (_, link) in transcript_links.items(): + youtube_server_subs = get_transcript_from_youtube( + link, youtube_id, item.runtime.service(item, "i18n") + ) + if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality + transcripts_presence['youtube_diff'] = False except GetTranscriptsFromYouTubeException: pass @@ -450,7 +455,6 @@ def _validate_transcripts_data(request): data = json.loads(request.GET.get('data', '{}')) if not data: raise TranscriptsRequestValidationException(_('Incoming video data is empty.')) - try: item = _get_item(request, data) except (InvalidKeyError, ItemNotFoundError): @@ -512,7 +516,6 @@ def validate_transcripts_request(request, include_yt=False, include_html5=False) for video in videos if video['type'] != 'youtube' } - return error, validated_data @@ -622,8 +625,13 @@ def replace_transcripts(request): # 2. Link a video to video component if its not already linked to one. edx_video_id = link_video_to_component(video, request.user) + # for transcript in transcript_links: + # 3. Upload YT transcript to DS for the linked video ID. - success = save_video_transcript(edx_video_id, Transcript.SJSON, transcript_content, language_code='en') + success = True + for transcript in transcript_content: + [language_code, json_content] = transcript + success = save_video_transcript(edx_video_id, Transcript.SJSON, json_content, language_code) if success: response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200) else: diff --git a/cms/lib/spectacular.py b/cms/lib/spectacular.py index 4318269beb..c7445decc2 100644 --- a/cms/lib/spectacular.py +++ b/cms/lib/spectacular.py @@ -15,7 +15,8 @@ def cms_api_filter(endpoints): path.startswith("/api/contentstore/v0/xblock") or path.startswith("/api/contentstore/v0/videos") or path.startswith("/api/contentstore/v0/video_transcripts") or - path.startswith("/api/contentstore/v0/file_assets") + path.startswith("/api/contentstore/v0/file_assets") or + path.startswith("/api/contentstore/v0/youtube_transcripts") ): filtered.append((path, path_regex, method, callback)) return filtered diff --git a/xmodule/tests/test_transcripts_utils.py b/xmodule/tests/test_transcripts_utils.py index 4f33169b51..bb4141f06c 100644 --- a/xmodule/tests/test_transcripts_utils.py +++ b/xmodule/tests/test_transcripts_utils.py @@ -110,21 +110,24 @@ class TranscriptsUtilsTest(TestCase): """ @mock.patch('requests.get') - @ddt.data("en", "en-US", "en-GB") + @ddt.data("en", "en-US", "en-GB", 'fr') def test_get_transcript_link_from_youtube(self, language_code, mock_get): """ - Happy path test: english caption link returned when video page HTML has one english caption + Happy path test: dict of caption links returned when video page HTML has at least one caption """ mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code) language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID) - self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code)) + self.assertEqual( + language_specific_caption_link, + {language_code: CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code)} + ) @ mock.patch('requests.get') - @ddt.data("fr", None) - def test_get_caption_no_english_caption(self, language_code, mock_get): + @ddt.data(None) + def test_get_caption_no_caption(self, language_code, mock_get): """ - No caption link returned when video page HTML contains no caption in English + No caption link returned when video page HTML contains no caption """ mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code) diff --git a/xmodule/video_block/transcripts_utils.py b/xmodule/video_block/transcripts_utils.py index d81ce2ab08..3dd88c2f31 100644 --- a/xmodule/video_block/transcripts_utils.py +++ b/xmodule/video_block/transcripts_utils.py @@ -180,19 +180,22 @@ def get_transcript_link_from_youtube(youtube_id): try: youtube_html = requests.get(f"{youtube_url_base}{youtube_id}") caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX'] - allowed_language_codes = settings.YOUTUBE['TRANSCRIPTS']['ALLOWED_LANGUAGE_CODES'] caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8")) if caption_matched: caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]') + caption_links = {} for caption in caption_tracks: - if "languageCode" in caption.keys() and caption["languageCode"] in allowed_language_codes: - return caption.get("baseUrl") + language_code = caption.get('languageCode', None) + if language_code and not language_code == 'None': + link = caption.get("baseUrl") + caption_links[language_code] = link + return None if not caption_links else caption_links return None except ConnectionError: return None -def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name +def get_transcript_links_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name """ Gets transcripts from youtube for youtube_id. @@ -202,18 +205,29 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_ Returns (status, transcripts): bool, dict. """ _ = i18n.gettext + transcript_links = get_transcript_link_from_youtube(youtube_id) - utf8_parser = etree.XMLParser(encoding='utf-8') - - transcript_link = get_transcript_link_from_youtube(youtube_id) - - if not transcript_link: + if not transcript_links: msg = _("Can't get transcript link from Youtube for {youtube_id}.").format( youtube_id=youtube_id, ) raise GetTranscriptsFromYouTubeException(msg) - data = requests.get(transcript_link) + return transcript_links + + +def get_transcript_from_youtube(link, youtube_id, i18n): + """ + Gets transcripts from youtube for youtube_id. + + Parses only utf-8 encoded transcripts. + Other encodings are not supported at the moment. + + Returns (status, transcripts): bool, dict. + """ + _ = i18n.gettext + utf8_parser = etree.XMLParser(encoding='utf-8') + data = requests.get(link) if data.status_code != 200 or not data.text: msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format( @@ -258,9 +272,12 @@ def download_youtube_subs(youtube_id, video_block, settings): # lint-amnesty, p """ i18n = video_block.runtime.service(video_block, "i18n") _ = i18n.gettext - - subs = get_transcripts_from_youtube(youtube_id, settings, i18n) - return json.dumps(subs, indent=2) + transcript_links = get_transcript_links_from_youtube(youtube_id, settings, i18n) + subs = [] + for (language_code, link) in transcript_links.items(): + sub = get_transcript_from_youtube(link, youtube_id, i18n) + subs.append([language_code, json.dumps(sub, indent=2)]) + return subs def remove_subs_from_store(subs_id, item, lang='en'):