From 44158ec8449412f7f20008b5282ae0bfb31dab9e Mon Sep 17 00:00:00 2001
From: "iamcristye@outlook.com"
Date: Tue, 1 Nov 2022 16:06:47 +0800
Subject: [PATCH] fix: importing transcript from YouTube

TNL-9460

Co-authored-by: Crist Ye
---
 .../contentstore/views/transcripts_ajax.py    | 14 +---
 cms/envs/common.py                            |  5 ++
 lms/envs/common.py                            |  5 ++
 xmodule/video_module/transcripts_utils.py     | 81 ++++++++++++++++---
 4 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/cms/djangoapps/contentstore/views/transcripts_ajax.py b/cms/djangoapps/contentstore/views/transcripts_ajax.py
index a3b91b6101..cf322ff45b 100644
--- a/cms/djangoapps/contentstore/views/transcripts_ajax.py
+++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py
@@ -7,12 +7,10 @@ All user changes are saved immediately.
 """
 
 
-import copy
 import json
 import logging
 import os
 
-import requests
 from django.conf import settings
 from django.contrib.auth.decorators import login_required
 from django.core.exceptions import PermissionDenied
@@ -42,7 +40,7 @@ from xmodule.video_module.transcripts_utils import (  # lint-amnesty, pylint: di
     get_transcript_for_video,
     get_transcript_from_val,
     get_transcripts_from_youtube,
-    youtube_video_transcript_name
+    get_transcript_link_from_youtube
 )
 
 __all__ = [
@@ -340,15 +338,7 @@ def check_transcripts(request):  # lint-amnesty, pylint: disable=too-many-statem
             except NotFoundError:
                 log.debug("Can't find transcripts in storage for youtube id: %s", youtube_id)
 
-            # youtube server
-            youtube_text_api = copy.deepcopy(settings.YOUTUBE['TEXT_API'])
-            youtube_text_api['params']['v'] = youtube_id
-            youtube_transcript_name = youtube_video_transcript_name(youtube_text_api)
-            if youtube_transcript_name:
-                youtube_text_api['params']['name'] = youtube_transcript_name
-            youtube_response = requests.get('http://' + youtube_text_api['url'], params=youtube_text_api['params'])
-
-            if youtube_response.status_code == 200 and youtube_response.text:
+            if get_transcript_link_from_youtube(youtube_id):
                 transcripts_presence['youtube_server'] = True
             #check youtube local and server transcripts for equality
             if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:
diff --git a/cms/envs/common.py b/cms/envs/common.py
index cb5e32491f..9eaabe6c32 100644
--- a/cms/envs/common.py
+++ b/cms/envs/common.py
@@ -1526,6 +1526,11 @@ YOUTUBE = {
         },
     },
 
+    'TRANSCRIPTS': {
+        'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
+        'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
+    },
+
     'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg',  # /maxresdefault.jpg for 1920*1080
 }
 
diff --git a/lms/envs/common.py b/lms/envs/common.py
index 0b1fe87fcf..6a5cd8a5ea 100644
--- a/lms/envs/common.py
+++ b/lms/envs/common.py
@@ -2916,6 +2916,11 @@ YOUTUBE = {
         },
     },
 
+    'TRANSCRIPTS': {
+        'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
+        'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
+    },
+
     'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg',  # /maxresdefault.jpg for 1920*1080
 }
 YOUTUBE_API_KEY = 'PUT_YOUR_API_KEY_HERE'
diff --git a/xmodule/video_module/transcripts_utils.py b/xmodule/video_module/transcripts_utils.py
index c33b70a2ad..17e654ea46 100644
--- a/xmodule/video_module/transcripts_utils.py
+++ b/xmodule/video_module/transcripts_utils.py
@@ -8,6 +8,7 @@ import copy
 import html
 import logging
 import os
+import re
 from functools import wraps
 
 import requests
@@ -153,6 +154,68 @@ def youtube_video_transcript_name(youtube_text_api):
     return None
 
 
+def get_transcript_link_from_youtube(youtube_id):
+    """
+    Get the link for YouTube transcript by parsing the source of the YouTube webpage.
+    Inside the webpage, the details of the transcripts are located in a JSON object.
+    After prettifying the object, it looks like:
+
+    "captions": {
+        "playerCaptionsTracklistRenderer": {
+            "captionTracks": [
+                {
+                    "baseUrl": "...",
+                    "name": {
+                        "simpleText": "(Japanese in local language)"
+                    },
+                    "vssId": ".ja",
+                    "languageCode": "ja",
+                    "isTranslatable": true
+                },
+                {
+                    "baseUrl": "...",
+                    "name": {
+                        "simpleText": "(French in local language)"
+                    },
+                    "vssId": ".fr",
+                    "languageCode": "fr",
+                    "isTranslatable": true
+                },
+                {
+                    "baseUrl": "...",
+                    "name": {
+                        "simpleText": "(English in local language)"
+                    },
+                    "vssId": ".en",
+                    "languageCode": "en",
+                    "isTranslatable": true
+                },
+                ...
+            ],
+            "audioTracks": [...]
+            "translationLanguages": ...
+        },
+        ...
+    }
+
+    So we use a regex to find the captionTracks JavaScript array, parse it as a JSON list,
+    and return the baseUrl of the "en" track (None if absent or on a connection error).
+    """
+    youtube_url_base = settings.YOUTUBE['TRANSCRIPTS']['YOUTUBE_URL_BASE']
+    try:
+        youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
+        caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX']
+        caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
+        if caption_matched:
+            caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
+            for caption in caption_tracks:
+                if caption["languageCode"] == "en":
+                    return caption["baseUrl"]
+        return None
+    except (ConnectionError, requests.exceptions.ConnectionError):
+        return None
+
+
 def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''):  # lint-amnesty, pylint: disable=redefined-outer-name
     """
     Gets transcripts from youtube for youtube_id.
@@ -166,15 +229,15 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_
 
     utf8_parser = etree.XMLParser(encoding='utf-8')
 
-    youtube_text_api = copy.deepcopy(settings.YOUTUBE['TEXT_API'])
-    youtube_text_api['params']['v'] = youtube_id
-    # if the transcript name is not empty on youtube server we have to pass
-    # name param in url in order to get transcript
-    # example http://video.google.com/timedtext?lang=en&v={VideoId}&name={transcript_name}
-    youtube_transcript_name = youtube_video_transcript_name(youtube_text_api)
-    if youtube_transcript_name:
-        youtube_text_api['params']['name'] = youtube_transcript_name
-    data = requests.get('http://' + youtube_text_api['url'], params=youtube_text_api['params'])
+    transcript_link = get_transcript_link_from_youtube(youtube_id)
+
+    if not transcript_link:
+        msg = _("Can't get transcript link from Youtube for {youtube_id}.").format(
+            youtube_id=youtube_id,
+        )
+        raise GetTranscriptsFromYouTubeException(msg)
+
+    data = requests.get(transcript_link)
 
     if data.status_code != 200 or not data.text:
         msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format(