feat: update youtube transcript fetch to allow all languages (#34436)
* feat: allow all languages * feat: add youtube transcript import functions as drf
This commit is contained in:
@@ -4,5 +4,5 @@ Serializers for v0 contentstore API.
|
||||
from .advanced_settings import AdvancedSettingsFieldSerializer, CourseAdvancedSettingsSerializer
|
||||
from .assets import AssetSerializer
|
||||
from .tabs import CourseTabSerializer, CourseTabUpdateSerializer, TabIDLocatorSerializer
|
||||
from .transcripts import TranscriptSerializer
|
||||
from .transcripts import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer
|
||||
from .xblock import XblockSerializer
|
||||
|
||||
@@ -13,3 +13,28 @@ class TranscriptSerializer(StrictSerializer):
|
||||
edx_video_id = serializers.CharField()
|
||||
language_code = serializers.CharField(required=False, allow_null=True)
|
||||
new_language_code = serializers.CharField(required=False, allow_null=True)
|
||||
|
||||
|
||||
class YoutubeTranscriptCheckSerializer(StrictSerializer):
|
||||
"""
|
||||
Strict Serializer for YouTube transcripts check
|
||||
"""
|
||||
html5_local = serializers.ListField(
|
||||
child=serializers.CharField()
|
||||
)
|
||||
html5_equal = serializers.BooleanField()
|
||||
is_youtube_mode = serializers.BooleanField()
|
||||
youtube_local = serializers.BooleanField()
|
||||
youtube_server = serializers.BooleanField()
|
||||
youtube_diff = serializers.BooleanField()
|
||||
current_item_subs = serializers.ListField(required=False, allow_null=True)
|
||||
status = serializers.CharField()
|
||||
command = serializers.CharField()
|
||||
|
||||
|
||||
class YoutubeTranscriptUploadSerializer(StrictSerializer):
|
||||
"""
|
||||
Strict Serializer for YouTube transcripts upload
|
||||
"""
|
||||
edx_video_id = serializers.CharField()
|
||||
status = serializers.CharField()
|
||||
|
||||
@@ -5,9 +5,16 @@ from django.urls import re_path, path
|
||||
|
||||
from openedx.core.constants import COURSE_ID_PATTERN
|
||||
|
||||
from .views import AdvancedCourseSettingsView, CourseTabSettingsView, CourseTabListView, CourseTabReorderView
|
||||
from .views import (
|
||||
AdvancedCourseSettingsView,
|
||||
CourseTabSettingsView,
|
||||
CourseTabListView,
|
||||
CourseTabReorderView,
|
||||
TranscriptView,
|
||||
YoutubeTranscriptCheckView,
|
||||
YoutubeTranscriptUploadView,
|
||||
)
|
||||
from .views import assets
|
||||
from .views import transcripts
|
||||
from .views import authoring_videos
|
||||
from .views import xblock
|
||||
|
||||
@@ -68,7 +75,7 @@ urlpatterns = [
|
||||
),
|
||||
re_path(
|
||||
fr'^video_transcripts/{settings.COURSE_ID_PATTERN}$',
|
||||
transcripts.TranscriptView.as_view(), name='cms_api_video_transcripts'
|
||||
TranscriptView.as_view(), name='cms_api_video_transcripts'
|
||||
),
|
||||
re_path(
|
||||
fr'^xblock/{settings.COURSE_ID_PATTERN}$',
|
||||
@@ -78,4 +85,12 @@ urlpatterns = [
|
||||
fr'^xblock/{settings.COURSE_ID_PATTERN}/{settings.USAGE_KEY_PATTERN}$',
|
||||
xblock.XblockView.as_view(), name='cms_api_xblock'
|
||||
),
|
||||
re_path(
|
||||
fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/check?$',
|
||||
YoutubeTranscriptCheckView.as_view(), name='cms_api_youtube_transcripts_check'
|
||||
),
|
||||
re_path(
|
||||
fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/upload?$',
|
||||
YoutubeTranscriptUploadView.as_view(), name='cms_api_youtube_transcripts_upload'
|
||||
),
|
||||
]
|
||||
|
||||
@@ -3,3 +3,4 @@ Views for v0 contentstore API.
|
||||
"""
|
||||
from .advanced_settings import AdvancedCourseSettingsView
|
||||
from .tabs import CourseTabSettingsView, CourseTabListView, CourseTabReorderView
|
||||
from .transcripts import TranscriptView, YoutubeTranscriptCheckView, YoutubeTranscriptUploadView
|
||||
|
||||
@@ -14,14 +14,14 @@ from openedx.core.lib.api.view_utils import DeveloperErrorViewMixin, view_auth_c
|
||||
from common.djangoapps.util.json_request import expect_json_in_class_view
|
||||
|
||||
from cms.djangoapps.contentstore.api import course_author_access_required
|
||||
|
||||
from cms.djangoapps.contentstore.views.transcripts_ajax import check_transcripts, replace_transcripts
|
||||
from cms.djangoapps.contentstore.transcript_storage_handlers import (
|
||||
upload_transcript,
|
||||
delete_video_transcript_or_404,
|
||||
handle_transcript_download,
|
||||
)
|
||||
import cms.djangoapps.contentstore.toggles as contentstore_toggles
|
||||
from ..serializers import TranscriptSerializer
|
||||
from ..serializers import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer
|
||||
from rest_framework.parsers import (MultiPartParser, FormParser)
|
||||
from openedx.core.lib.api.parsers import TypedFileUploadParser
|
||||
|
||||
@@ -68,3 +68,50 @@ class TranscriptView(DeveloperErrorViewMixin, CreateAPIView, RetrieveAPIView, De
|
||||
"""
|
||||
|
||||
return delete_video_transcript_or_404(request)
|
||||
|
||||
|
||||
@view_auth_classes()
|
||||
class YoutubeTranscriptCheckView(DeveloperErrorViewMixin, RetrieveAPIView):
|
||||
"""
|
||||
public rest API endpoints for the CMS API YouTube transcripts.
|
||||
youtube_id: required argument, needed to authorize course authors and identify the video.
|
||||
edx_video_id: required argument, needed to identify the transcript.
|
||||
xblock_id: required argument, needed to identify the transcript.
|
||||
"""
|
||||
serializer_class = YoutubeTranscriptCheckSerializer
|
||||
parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser)
|
||||
|
||||
def dispatch(self, request, *args, **kwargs):
|
||||
if not toggles.use_studio_content_api():
|
||||
raise Http404
|
||||
return super().dispatch(request, *args, **kwargs)
|
||||
|
||||
@course_author_access_required
|
||||
def retrieve(self, request, course_key_string): # pylint: disable=arguments-differ
|
||||
"""
|
||||
Get the status of youtube transcripts for a given youtube video
|
||||
"""
|
||||
return check_transcripts(request)
|
||||
|
||||
|
||||
@view_auth_classes()
|
||||
class YoutubeTranscriptUploadView(DeveloperErrorViewMixin, RetrieveAPIView):
|
||||
"""
|
||||
public rest API endpoints for the CMS API YouTube transcripts.
|
||||
youtube_id: required argument, needed to authorize course authors and identify the video.
|
||||
xblock_id: required argument, needed to identify the transcript.
|
||||
"""
|
||||
serializer_class = YoutubeTranscriptUploadSerializer
|
||||
parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser)
|
||||
|
||||
def dispatch(self, request, *args, **kwargs):
|
||||
if not toggles.use_studio_content_api():
|
||||
raise Http404
|
||||
return super().dispatch(request, *args, **kwargs)
|
||||
|
||||
@course_author_access_required
|
||||
def retrieve(self, request, course_key_string): # pylint: disable=arguments-differ
|
||||
"""
|
||||
Get the youtube transcripts for a give youtube video and add them to video block
|
||||
"""
|
||||
return replace_transcripts(request)
|
||||
|
||||
@@ -464,14 +464,16 @@ class TestYoutubeTranscripts(unittest.TestCase):
|
||||
setup_caption_responses(mock_get, 'en', 'test', track_status_code)
|
||||
youtube_id = 'bad_youtube_id'
|
||||
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
|
||||
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
|
||||
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
|
||||
transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation)
|
||||
|
||||
@patch('xmodule.video_block.transcripts_utils.requests.get')
|
||||
def test_youtube_empty_text(self, mock_get):
|
||||
setup_caption_responses(mock_get, 'en', '')
|
||||
youtube_id = 'bad_youtube_id'
|
||||
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
|
||||
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
|
||||
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
|
||||
transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation)
|
||||
|
||||
def test_youtube_good_result(self):
|
||||
caption_response_string = textwrap.dedent("""<?xml version="1.0" encoding="utf-8" ?>
|
||||
@@ -491,7 +493,8 @@ class TestYoutubeTranscripts(unittest.TestCase):
|
||||
language_code = 'en'
|
||||
with patch('xmodule.video_block.transcripts_utils.requests.get') as mock_get:
|
||||
setup_caption_responses(mock_get, language_code, caption_response_string)
|
||||
transcripts = transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
|
||||
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
|
||||
transcripts = transcripts_utils.get_transcript_from_youtube(link['en'], youtube_id, translation)
|
||||
|
||||
self.assertEqual(transcripts, expected_transcripts)
|
||||
self.assertEqual(2, len(mock_get.mock_calls))
|
||||
|
||||
@@ -628,7 +628,7 @@ class TestRenameTranscripts(BaseTranscripts):
|
||||
@ddt.ddt
|
||||
@patch(
|
||||
'cms.djangoapps.contentstore.views.transcripts_ajax.download_youtube_subs',
|
||||
Mock(return_value=SJSON_TRANSCRIPT_CONTENT)
|
||||
Mock(return_value=[['en', SJSON_TRANSCRIPT_CONTENT]])
|
||||
)
|
||||
class TestReplaceTranscripts(BaseTranscripts):
|
||||
"""
|
||||
|
||||
@@ -39,8 +39,9 @@ from xmodule.video_block.transcripts_utils import ( # lint-amnesty, pylint: dis
|
||||
get_transcript,
|
||||
get_transcript_for_video,
|
||||
get_transcript_from_val,
|
||||
get_transcripts_from_youtube,
|
||||
get_transcript_link_from_youtube
|
||||
get_transcript_from_youtube,
|
||||
get_transcript_link_from_youtube,
|
||||
get_transcript_links_from_youtube,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@@ -345,13 +346,17 @@ def check_transcripts(request): # lint-amnesty, pylint: disable=too-many-statem
|
||||
#check youtube local and server transcripts for equality
|
||||
if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:
|
||||
try:
|
||||
youtube_server_subs = get_transcripts_from_youtube(
|
||||
transcript_links = get_transcript_links_from_youtube(
|
||||
youtube_id,
|
||||
settings,
|
||||
item.runtime.service(item, "i18n")
|
||||
)
|
||||
if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality
|
||||
transcripts_presence['youtube_diff'] = False
|
||||
for (_, link) in transcript_links.items():
|
||||
youtube_server_subs = get_transcript_from_youtube(
|
||||
link, youtube_id, item.runtime.service(item, "i18n")
|
||||
)
|
||||
if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality
|
||||
transcripts_presence['youtube_diff'] = False
|
||||
except GetTranscriptsFromYouTubeException:
|
||||
pass
|
||||
|
||||
@@ -450,7 +455,6 @@ def _validate_transcripts_data(request):
|
||||
data = json.loads(request.GET.get('data', '{}'))
|
||||
if not data:
|
||||
raise TranscriptsRequestValidationException(_('Incoming video data is empty.'))
|
||||
|
||||
try:
|
||||
item = _get_item(request, data)
|
||||
except (InvalidKeyError, ItemNotFoundError):
|
||||
@@ -512,7 +516,6 @@ def validate_transcripts_request(request, include_yt=False, include_html5=False)
|
||||
for video in videos
|
||||
if video['type'] != 'youtube'
|
||||
}
|
||||
|
||||
return error, validated_data
|
||||
|
||||
|
||||
@@ -622,8 +625,13 @@ def replace_transcripts(request):
|
||||
# 2. Link a video to video component if its not already linked to one.
|
||||
edx_video_id = link_video_to_component(video, request.user)
|
||||
|
||||
# for transcript in transcript_links:
|
||||
|
||||
# 3. Upload YT transcript to DS for the linked video ID.
|
||||
success = save_video_transcript(edx_video_id, Transcript.SJSON, transcript_content, language_code='en')
|
||||
success = True
|
||||
for transcript in transcript_content:
|
||||
[language_code, json_content] = transcript
|
||||
success = save_video_transcript(edx_video_id, Transcript.SJSON, json_content, language_code)
|
||||
if success:
|
||||
response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200)
|
||||
else:
|
||||
|
||||
@@ -15,7 +15,8 @@ def cms_api_filter(endpoints):
|
||||
path.startswith("/api/contentstore/v0/xblock") or
|
||||
path.startswith("/api/contentstore/v0/videos") or
|
||||
path.startswith("/api/contentstore/v0/video_transcripts") or
|
||||
path.startswith("/api/contentstore/v0/file_assets")
|
||||
path.startswith("/api/contentstore/v0/file_assets") or
|
||||
path.startswith("/api/contentstore/v0/youtube_transcripts")
|
||||
):
|
||||
filtered.append((path, path_regex, method, callback))
|
||||
return filtered
|
||||
|
||||
@@ -110,21 +110,24 @@ class TranscriptsUtilsTest(TestCase):
|
||||
"""
|
||||
|
||||
@mock.patch('requests.get')
|
||||
@ddt.data("en", "en-US", "en-GB")
|
||||
@ddt.data("en", "en-US", "en-GB", 'fr')
|
||||
def test_get_transcript_link_from_youtube(self, language_code, mock_get):
|
||||
"""
|
||||
Happy path test: english caption link returned when video page HTML has one english caption
|
||||
Happy path test: dict of caption links returned when video page HTML has at least one caption
|
||||
"""
|
||||
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
|
||||
|
||||
language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
||||
self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code))
|
||||
self.assertEqual(
|
||||
language_specific_caption_link,
|
||||
{language_code: CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code)}
|
||||
)
|
||||
|
||||
@ mock.patch('requests.get')
|
||||
@ddt.data("fr", None)
|
||||
def test_get_caption_no_english_caption(self, language_code, mock_get):
|
||||
@ddt.data(None)
|
||||
def test_get_caption_no_caption(self, language_code, mock_get):
|
||||
"""
|
||||
No caption link returned when video page HTML contains no caption in English
|
||||
No caption link returned when video page HTML contains no caption
|
||||
"""
|
||||
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
|
||||
|
||||
|
||||
@@ -180,19 +180,22 @@ def get_transcript_link_from_youtube(youtube_id):
|
||||
try:
|
||||
youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
|
||||
caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX']
|
||||
allowed_language_codes = settings.YOUTUBE['TRANSCRIPTS']['ALLOWED_LANGUAGE_CODES']
|
||||
caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
|
||||
if caption_matched:
|
||||
caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
|
||||
caption_links = {}
|
||||
for caption in caption_tracks:
|
||||
if "languageCode" in caption.keys() and caption["languageCode"] in allowed_language_codes:
|
||||
return caption.get("baseUrl")
|
||||
language_code = caption.get('languageCode', None)
|
||||
if language_code and not language_code == 'None':
|
||||
link = caption.get("baseUrl")
|
||||
caption_links[language_code] = link
|
||||
return None if not caption_links else caption_links
|
||||
return None
|
||||
except ConnectionError:
|
||||
return None
|
||||
|
||||
|
||||
def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
|
||||
def get_transcript_links_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
|
||||
"""
|
||||
Gets transcripts from youtube for youtube_id.
|
||||
|
||||
@@ -202,18 +205,29 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_
|
||||
Returns (status, transcripts): bool, dict.
|
||||
"""
|
||||
_ = i18n.gettext
|
||||
transcript_links = get_transcript_link_from_youtube(youtube_id)
|
||||
|
||||
utf8_parser = etree.XMLParser(encoding='utf-8')
|
||||
|
||||
transcript_link = get_transcript_link_from_youtube(youtube_id)
|
||||
|
||||
if not transcript_link:
|
||||
if not transcript_links:
|
||||
msg = _("Can't get transcript link from Youtube for {youtube_id}.").format(
|
||||
youtube_id=youtube_id,
|
||||
)
|
||||
raise GetTranscriptsFromYouTubeException(msg)
|
||||
|
||||
data = requests.get(transcript_link)
|
||||
return transcript_links
|
||||
|
||||
|
||||
def get_transcript_from_youtube(link, youtube_id, i18n):
|
||||
"""
|
||||
Gets transcripts from youtube for youtube_id.
|
||||
|
||||
Parses only utf-8 encoded transcripts.
|
||||
Other encodings are not supported at the moment.
|
||||
|
||||
Returns (status, transcripts): bool, dict.
|
||||
"""
|
||||
_ = i18n.gettext
|
||||
utf8_parser = etree.XMLParser(encoding='utf-8')
|
||||
data = requests.get(link)
|
||||
|
||||
if data.status_code != 200 or not data.text:
|
||||
msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format(
|
||||
@@ -258,9 +272,12 @@ def download_youtube_subs(youtube_id, video_block, settings): # lint-amnesty, p
|
||||
"""
|
||||
i18n = video_block.runtime.service(video_block, "i18n")
|
||||
_ = i18n.gettext
|
||||
|
||||
subs = get_transcripts_from_youtube(youtube_id, settings, i18n)
|
||||
return json.dumps(subs, indent=2)
|
||||
transcript_links = get_transcript_links_from_youtube(youtube_id, settings, i18n)
|
||||
subs = []
|
||||
for (language_code, link) in transcript_links.items():
|
||||
sub = get_transcript_from_youtube(link, youtube_id, i18n)
|
||||
subs.append([language_code, json.dumps(sub, indent=2)])
|
||||
return subs
|
||||
|
||||
|
||||
def remove_subs_from_store(subs_id, item, lang='en'):
|
||||
|
||||
Reference in New Issue
Block a user