feat: update youtube transcript fetch to allow all languages (#34436)

* feat: allow all languages

* feat: add youtube transcript import functions as drf
This commit is contained in:
Kristin Aoki
2024-03-29 08:28:18 -04:00
committed by GitHub
parent 25409de9d1
commit 9f734a7a5e
11 changed files with 158 additions and 38 deletions

View File

@@ -4,5 +4,5 @@ Serializers for v0 contentstore API.
from .advanced_settings import AdvancedSettingsFieldSerializer, CourseAdvancedSettingsSerializer
from .assets import AssetSerializer
from .tabs import CourseTabSerializer, CourseTabUpdateSerializer, TabIDLocatorSerializer
from .transcripts import TranscriptSerializer
from .transcripts import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer
from .xblock import XblockSerializer

View File

@@ -13,3 +13,28 @@ class TranscriptSerializer(StrictSerializer):
edx_video_id = serializers.CharField()
language_code = serializers.CharField(required=False, allow_null=True)
new_language_code = serializers.CharField(required=False, allow_null=True)
class YoutubeTranscriptCheckSerializer(StrictSerializer):
    """
    Strict serializer for the YouTube transcripts check payload.

    Declares one list-of-strings field, five boolean flags, an optional
    nullable list and two string fields. Only ``current_item_subs`` may be
    omitted or null; every other field is required.
    """
    # List whose items are each validated as a string.
    html5_local = serializers.ListField(child=serializers.CharField())

    # Boolean flags.
    html5_equal = serializers.BooleanField()
    is_youtube_mode = serializers.BooleanField()
    youtube_local = serializers.BooleanField()
    youtube_server = serializers.BooleanField()
    youtube_diff = serializers.BooleanField()

    # Optional and nullable; no child field type is declared for its items.
    current_item_subs = serializers.ListField(required=False, allow_null=True)

    # Free-form string fields.
    status = serializers.CharField()
    command = serializers.CharField()
class YoutubeTranscriptUploadSerializer(StrictSerializer):
    """
    Strict serializer for the YouTube transcripts upload payload.

    Both fields are required strings.
    """
    edx_video_id = serializers.CharField()
    status = serializers.CharField()

View File

@@ -5,9 +5,16 @@ from django.urls import re_path, path
from openedx.core.constants import COURSE_ID_PATTERN
from .views import AdvancedCourseSettingsView, CourseTabSettingsView, CourseTabListView, CourseTabReorderView
from .views import (
AdvancedCourseSettingsView,
CourseTabSettingsView,
CourseTabListView,
CourseTabReorderView,
TranscriptView,
YoutubeTranscriptCheckView,
YoutubeTranscriptUploadView,
)
from .views import assets
from .views import transcripts
from .views import authoring_videos
from .views import xblock
@@ -68,7 +75,7 @@ urlpatterns = [
),
re_path(
fr'^video_transcripts/{settings.COURSE_ID_PATTERN}$',
transcripts.TranscriptView.as_view(), name='cms_api_video_transcripts'
TranscriptView.as_view(), name='cms_api_video_transcripts'
),
re_path(
fr'^xblock/{settings.COURSE_ID_PATTERN}$',
@@ -78,4 +85,12 @@ urlpatterns = [
fr'^xblock/{settings.COURSE_ID_PATTERN}/{settings.USAGE_KEY_PATTERN}$',
xblock.XblockView.as_view(), name='cms_api_xblock'
),
re_path(
fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/check?$',
YoutubeTranscriptCheckView.as_view(), name='cms_api_youtube_transcripts_check'
),
re_path(
fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/upload?$',
YoutubeTranscriptUploadView.as_view(), name='cms_api_youtube_transcripts_upload'
),
]

View File

@@ -3,3 +3,4 @@ Views for v0 contentstore API.
"""
from .advanced_settings import AdvancedCourseSettingsView
from .tabs import CourseTabSettingsView, CourseTabListView, CourseTabReorderView
from .transcripts import TranscriptView, YoutubeTranscriptCheckView, YoutubeTranscriptUploadView

View File

@@ -14,14 +14,14 @@ from openedx.core.lib.api.view_utils import DeveloperErrorViewMixin, view_auth_c
from common.djangoapps.util.json_request import expect_json_in_class_view
from cms.djangoapps.contentstore.api import course_author_access_required
from cms.djangoapps.contentstore.views.transcripts_ajax import check_transcripts, replace_transcripts
from cms.djangoapps.contentstore.transcript_storage_handlers import (
upload_transcript,
delete_video_transcript_or_404,
handle_transcript_download,
)
import cms.djangoapps.contentstore.toggles as contentstore_toggles
from ..serializers import TranscriptSerializer
from ..serializers import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer
from rest_framework.parsers import (MultiPartParser, FormParser)
from openedx.core.lib.api.parsers import TypedFileUploadParser
@@ -68,3 +68,50 @@ class TranscriptView(DeveloperErrorViewMixin, CreateAPIView, RetrieveAPIView, De
"""
return delete_video_transcript_or_404(request)
@view_auth_classes()
class YoutubeTranscriptCheckView(DeveloperErrorViewMixin, RetrieveAPIView):
    """
    Public REST API endpoint of the CMS API for checking YouTube transcripts.

    Request arguments, as documented by the original author:
        youtube_id: required, needed to authorize course authors and identify the video.
        edx_video_id: required, needed to identify the transcript.
        xblock_id: required, needed to identify the transcript.
    """

    serializer_class = YoutubeTranscriptCheckSerializer
    parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser)

    def dispatch(self, request, *args, **kwargs):
        """
        Handle the request only while the studio content API toggle is on.

        Raises:
            Http404: when ``toggles.use_studio_content_api()`` is falsy.
        """
        if toggles.use_studio_content_api():
            return super().dispatch(request, *args, **kwargs)
        raise Http404

    @course_author_access_required
    def retrieve(self, request, course_key_string):  # pylint: disable=arguments-differ
        """
        Get the status of YouTube transcripts for a given YouTube video.

        The work is delegated to ``check_transcripts``, whose return value is
        passed back unchanged; ``course_key_string`` is not used in this body.
        """
        return check_transcripts(request)
@view_auth_classes()
class YoutubeTranscriptUploadView(DeveloperErrorViewMixin, RetrieveAPIView):
    """
    Public REST API endpoint of the CMS API for importing YouTube transcripts.

    Request arguments, as documented by the original author:
        youtube_id: required, needed to authorize course authors and identify the video.
        xblock_id: required, needed to identify the transcript.
    """

    serializer_class = YoutubeTranscriptUploadSerializer
    parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser)

    def dispatch(self, request, *args, **kwargs):
        """
        Handle the request only while the studio content API toggle is on.

        Raises:
            Http404: when ``toggles.use_studio_content_api()`` is falsy.
        """
        if toggles.use_studio_content_api():
            return super().dispatch(request, *args, **kwargs)
        raise Http404

    @course_author_access_required
    def retrieve(self, request, course_key_string):  # pylint: disable=arguments-differ
        """
        Get the YouTube transcripts for a given YouTube video and add them to the video block.

        The work is delegated to ``replace_transcripts``, whose return value is
        passed back unchanged; ``course_key_string`` is not used in this body.
        """
        return replace_transcripts(request)

View File

@@ -464,14 +464,16 @@ class TestYoutubeTranscripts(unittest.TestCase):
setup_caption_responses(mock_get, 'en', 'test', track_status_code)
youtube_id = 'bad_youtube_id'
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation)
@patch('xmodule.video_block.transcripts_utils.requests.get')
def test_youtube_empty_text(self, mock_get):
setup_caption_responses(mock_get, 'en', '')
youtube_id = 'bad_youtube_id'
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation)
def test_youtube_good_result(self):
caption_response_string = textwrap.dedent("""<?xml version="1.0" encoding="utf-8" ?>
@@ -491,7 +493,8 @@ class TestYoutubeTranscripts(unittest.TestCase):
language_code = 'en'
with patch('xmodule.video_block.transcripts_utils.requests.get') as mock_get:
setup_caption_responses(mock_get, language_code, caption_response_string)
transcripts = transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
transcripts = transcripts_utils.get_transcript_from_youtube(link['en'], youtube_id, translation)
self.assertEqual(transcripts, expected_transcripts)
self.assertEqual(2, len(mock_get.mock_calls))

View File

@@ -628,7 +628,7 @@ class TestRenameTranscripts(BaseTranscripts):
@ddt.ddt
@patch(
'cms.djangoapps.contentstore.views.transcripts_ajax.download_youtube_subs',
Mock(return_value=SJSON_TRANSCRIPT_CONTENT)
Mock(return_value=[['en', SJSON_TRANSCRIPT_CONTENT]])
)
class TestReplaceTranscripts(BaseTranscripts):
"""

View File

@@ -39,8 +39,9 @@ from xmodule.video_block.transcripts_utils import ( # lint-amnesty, pylint: dis
get_transcript,
get_transcript_for_video,
get_transcript_from_val,
get_transcripts_from_youtube,
get_transcript_link_from_youtube
get_transcript_from_youtube,
get_transcript_link_from_youtube,
get_transcript_links_from_youtube,
)
__all__ = [
@@ -345,13 +346,17 @@ def check_transcripts(request): # lint-amnesty, pylint: disable=too-many-statem
#check youtube local and server transcripts for equality
if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:
try:
youtube_server_subs = get_transcripts_from_youtube(
transcript_links = get_transcript_links_from_youtube(
youtube_id,
settings,
item.runtime.service(item, "i18n")
)
if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality
transcripts_presence['youtube_diff'] = False
for (_, link) in transcript_links.items():
youtube_server_subs = get_transcript_from_youtube(
link, youtube_id, item.runtime.service(item, "i18n")
)
if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality
transcripts_presence['youtube_diff'] = False
except GetTranscriptsFromYouTubeException:
pass
@@ -450,7 +455,6 @@ def _validate_transcripts_data(request):
data = json.loads(request.GET.get('data', '{}'))
if not data:
raise TranscriptsRequestValidationException(_('Incoming video data is empty.'))
try:
item = _get_item(request, data)
except (InvalidKeyError, ItemNotFoundError):
@@ -512,7 +516,6 @@ def validate_transcripts_request(request, include_yt=False, include_html5=False)
for video in videos
if video['type'] != 'youtube'
}
return error, validated_data
@@ -622,8 +625,13 @@ def replace_transcripts(request):
# 2. Link a video to video component if its not already linked to one.
edx_video_id = link_video_to_component(video, request.user)
# for transcript in transcript_links:
# 3. Upload YT transcript to DS for the linked video ID.
success = save_video_transcript(edx_video_id, Transcript.SJSON, transcript_content, language_code='en')
success = True
for transcript in transcript_content:
[language_code, json_content] = transcript
success = save_video_transcript(edx_video_id, Transcript.SJSON, json_content, language_code)
if success:
response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200)
else:

View File

@@ -15,7 +15,8 @@ def cms_api_filter(endpoints):
path.startswith("/api/contentstore/v0/xblock") or
path.startswith("/api/contentstore/v0/videos") or
path.startswith("/api/contentstore/v0/video_transcripts") or
path.startswith("/api/contentstore/v0/file_assets")
path.startswith("/api/contentstore/v0/file_assets") or
path.startswith("/api/contentstore/v0/youtube_transcripts")
):
filtered.append((path, path_regex, method, callback))
return filtered

View File

@@ -110,21 +110,24 @@ class TranscriptsUtilsTest(TestCase):
"""
@mock.patch('requests.get')
@ddt.data("en", "en-US", "en-GB")
@ddt.data("en", "en-US", "en-GB", 'fr')
def test_get_transcript_link_from_youtube(self, language_code, mock_get):
"""
Happy path test: english caption link returned when video page HTML has one english caption
Happy path test: dict of caption links returned when video page HTML has at least one caption
"""
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code))
self.assertEqual(
language_specific_caption_link,
{language_code: CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code)}
)
@ mock.patch('requests.get')
@ddt.data("fr", None)
def test_get_caption_no_english_caption(self, language_code, mock_get):
@ddt.data(None)
def test_get_caption_no_caption(self, language_code, mock_get):
"""
No caption link returned when video page HTML contains no caption in English
No caption link returned when video page HTML contains no caption
"""
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)

View File

@@ -180,19 +180,22 @@ def get_transcript_link_from_youtube(youtube_id):
try:
youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX']
allowed_language_codes = settings.YOUTUBE['TRANSCRIPTS']['ALLOWED_LANGUAGE_CODES']
caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
if caption_matched:
caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
caption_links = {}
for caption in caption_tracks:
if "languageCode" in caption.keys() and caption["languageCode"] in allowed_language_codes:
return caption.get("baseUrl")
language_code = caption.get('languageCode', None)
if language_code and not language_code == 'None':
link = caption.get("baseUrl")
caption_links[language_code] = link
return None if not caption_links else caption_links
return None
except ConnectionError:
return None
def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
def get_transcript_links_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
"""
Gets transcripts from youtube for youtube_id.
@@ -202,18 +205,29 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_
Returns (status, transcripts): bool, dict.
"""
_ = i18n.gettext
transcript_links = get_transcript_link_from_youtube(youtube_id)
utf8_parser = etree.XMLParser(encoding='utf-8')
transcript_link = get_transcript_link_from_youtube(youtube_id)
if not transcript_link:
if not transcript_links:
msg = _("Can't get transcript link from Youtube for {youtube_id}.").format(
youtube_id=youtube_id,
)
raise GetTranscriptsFromYouTubeException(msg)
data = requests.get(transcript_link)
return transcript_links
def get_transcript_from_youtube(link, youtube_id, i18n):
"""
Gets transcripts from youtube for youtube_id.
Parses only utf-8 encoded transcripts.
Other encodings are not supported at the moment.
Returns (status, transcripts): bool, dict.
"""
_ = i18n.gettext
utf8_parser = etree.XMLParser(encoding='utf-8')
data = requests.get(link)
if data.status_code != 200 or not data.text:
msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format(
@@ -258,9 +272,12 @@ def download_youtube_subs(youtube_id, video_block, settings): # lint-amnesty, p
"""
i18n = video_block.runtime.service(video_block, "i18n")
_ = i18n.gettext
subs = get_transcripts_from_youtube(youtube_id, settings, i18n)
return json.dumps(subs, indent=2)
transcript_links = get_transcript_links_from_youtube(youtube_id, settings, i18n)
subs = []
for (language_code, link) in transcript_links.items():
sub = get_transcript_from_youtube(link, youtube_id, i18n)
subs.append([language_code, json.dumps(sub, indent=2)])
return subs
def remove_subs_from_store(subs_id, item, lang='en'):