fix: importing transcript from YouTube TNL-9460

Co-authored-by: Crist Ye  <iamCristYe@Outlook.com>
This commit is contained in:
iamcristye@outlook.com
2022-11-01 16:06:47 +08:00
committed by Bernard Szabo
parent 5bb84e9962
commit 44158ec844
4 changed files with 84 additions and 21 deletions

View File

@@ -7,12 +7,10 @@ All user changes are saved immediately.
"""
import copy
import json
import logging
import os
import requests
from django.conf import settings
from django.contrib.auth.decorators import login_required
from django.core.exceptions import PermissionDenied
@@ -42,7 +40,7 @@ from xmodule.video_module.transcripts_utils import ( # lint-amnesty, pylint: di
get_transcript_for_video,
get_transcript_from_val,
get_transcripts_from_youtube,
youtube_video_transcript_name
get_transcript_link_from_youtube
)
__all__ = [
@@ -340,15 +338,7 @@ def check_transcripts(request): # lint-amnesty, pylint: disable=too-many-statem
except NotFoundError:
log.debug("Can't find transcripts in storage for youtube id: %s", youtube_id)
# youtube server
youtube_text_api = copy.deepcopy(settings.YOUTUBE['TEXT_API'])
youtube_text_api['params']['v'] = youtube_id
youtube_transcript_name = youtube_video_transcript_name(youtube_text_api)
if youtube_transcript_name:
youtube_text_api['params']['name'] = youtube_transcript_name
youtube_response = requests.get('http://' + youtube_text_api['url'], params=youtube_text_api['params'])
if youtube_response.status_code == 200 and youtube_response.text:
if get_transcript_link_from_youtube(youtube_id):
transcripts_presence['youtube_server'] = True
#check youtube local and server transcripts for equality
if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:

View File

@@ -1526,6 +1526,11 @@ YOUTUBE = {
},
},
'TRANSCRIPTS': {
'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
},
'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080
}

View File

@@ -2916,6 +2916,11 @@ YOUTUBE = {
},
},
'TRANSCRIPTS': {
'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
},
'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080
}
YOUTUBE_API_KEY = 'PUT_YOUR_API_KEY_HERE'

View File

@@ -8,6 +8,7 @@ import copy
import html
import logging
import os
import re
from functools import wraps
import requests
@@ -153,6 +154,68 @@ def youtube_video_transcript_name(youtube_text_api):
return None
def get_transcript_link_from_youtube(youtube_id):
    """
    Get the link for the English YouTube transcript by parsing the source of the YouTube webpage.

    Inside the webpage, the details of the transcripts are located in a JSON object.
    After prettifying the object, it looks like:

    "captions": {
        "playerCaptionsTracklistRenderer": {
            "captionTracks": [
                {
                    "baseUrl": "...",
                    "name": {
                        "simpleText": "(Japanese in local language)"
                    },
                    "vssId": ".ja",
                    "languageCode": "ja",
                    "isTranslatable": true
                },
                {
                    "baseUrl": "...",
                    "name": {
                        "simpleText": "(French in local language)"
                    },
                    "vssId": ".fr",
                    "languageCode": "fr",
                    "isTranslatable": true
                },
                {
                    "baseUrl": "...",
                    "name": {
                        "simpleText": "(English in local language)"
                    },
                    "vssId": ".en",
                    "languageCode": "en",
                    "isTranslatable": true
                },
                ...
            ],
            "audioTracks": [...],
            "translationLanguages": ...
        },
        ...
    }

    So we use a regex to find the captionTracks JavaScript array, convert it
    to a Python list of dicts, and return the link of the "en" caption track.

    Arguments:
        youtube_id (str): id of the YouTube video; it is appended to
            settings.YOUTUBE['TRANSCRIPTS']['YOUTUBE_URL_BASE'] to build the page URL.

    Returns:
        The `baseUrl` of the first caption track whose `languageCode` is "en",
        or None when the page cannot be fetched because of a connection error,
        when no captionTracks array is found in the page, or when no English
        track is listed.
    """
    transcripts_settings = settings.YOUTUBE['TRANSCRIPTS']
    youtube_url_base = transcripts_settings['YOUTUBE_URL_BASE']
    # CAPTION_TRACKS_REGEX is stored in settings as a plain pattern string, not
    # a compiled pattern, so it has to go through the `re` module functions.
    caption_re = transcripts_settings['CAPTION_TRACKS_REGEX']

    try:
        youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
    except (ConnectionError, requests.exceptions.ConnectionError):
        # requests raises its own ConnectionError, which is not a subclass of
        # the builtin one, so both are listed to keep this a best-effort lookup.
        return None

    caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
    if not caption_matched:
        return None

    # The regex captures only the array's contents (up to the first "]"), so the
    # brackets are put back before parsing.
    caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
    for caption in caption_tracks:
        # Use .get so a track entry without these keys is skipped, not a KeyError.
        if caption.get("languageCode") == "en":
            return caption.get("baseUrl")
    return None
def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
"""
Gets transcripts from youtube for youtube_id.
@@ -166,15 +229,15 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_
utf8_parser = etree.XMLParser(encoding='utf-8')
youtube_text_api = copy.deepcopy(settings.YOUTUBE['TEXT_API'])
youtube_text_api['params']['v'] = youtube_id
# if the transcript name is not empty on youtube server we have to pass
# name param in url in order to get transcript
# example http://video.google.com/timedtext?lang=en&v={VideoId}&name={transcript_name}
youtube_transcript_name = youtube_video_transcript_name(youtube_text_api)
if youtube_transcript_name:
youtube_text_api['params']['name'] = youtube_transcript_name
data = requests.get('http://' + youtube_text_api['url'], params=youtube_text_api['params'])
transcript_link = get_transcript_link_from_youtube(youtube_id)
if not transcript_link:
msg = _("Can't get transcript link from Youtube for {youtube_id}.").format(
youtube_id=youtube_id,
)
raise GetTranscriptsFromYouTubeException(msg)
data = requests.get(transcript_link)
if data.status_code != 200 or not data.text:
msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format(