feat: configure acceptable language codes for youtube transcripts
This commit is contained in:
@@ -1536,6 +1536,7 @@ YOUTUBE = {
|
||||
'TRANSCRIPTS': {
|
||||
'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
|
||||
'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
|
||||
'ALLOWED_LANGUAGE_CODES': ["en", "en-US", "en-GB"],
|
||||
},
|
||||
|
||||
'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080
|
||||
|
||||
@@ -2949,6 +2949,7 @@ YOUTUBE = {
|
||||
'TRANSCRIPTS': {
|
||||
'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
|
||||
'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
|
||||
'ALLOWED_LANGUAGE_CODES': ["en", "en-US", "en-GB"],
|
||||
},
|
||||
|
||||
'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080
|
||||
|
||||
@@ -7,10 +7,11 @@ These tests follow the following nomenclature:
|
||||
- among the fields found in a track descriptor is a caption URL (aka caption link)
|
||||
- use this link to obtain the track's caption data
|
||||
'''
|
||||
from ..video_block.transcripts_utils import get_transcript_link_from_youtube
|
||||
|
||||
from unittest import mock, TestCase
|
||||
|
||||
import ddt
|
||||
|
||||
from ..video_block.transcripts_utils import get_transcript_link_from_youtube
|
||||
|
||||
YOUTUBE_VIDEO_ID = "z-LoKnweV6w"
|
||||
|
||||
@@ -102,43 +103,34 @@ class YoutubeVideoHTMLResponse:
|
||||
self.content = bytearray(youtube_html, 'UTF-8')
|
||||
|
||||
|
||||
@ddt.ddt
|
||||
class TranscriptsUtilsTest(TestCase):
|
||||
"""
|
||||
Tests utility functions for transcripts (in video_block)
|
||||
"""
|
||||
|
||||
@mock.patch('requests.get')
|
||||
def test_get_transcript_link_from_youtube(self, mock_get):
|
||||
@ddt.data("en", "en-US", "en-GB")
|
||||
def test_get_transcript_link_from_youtube(self, language_code, mock_get):
|
||||
"""
|
||||
Happy path test: English caption link returned when video page HTML has one English caption
|
||||
"""
|
||||
language_code = 'en'
|
||||
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
|
||||
|
||||
language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
||||
self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code))
|
||||
|
||||
@ mock.patch('requests.get')
|
||||
def test_get_caption_no_english_caption(self, mock_get):
|
||||
@ddt.data("fr", None)
|
||||
def test_get_caption_no_english_caption(self, language_code, mock_get):
|
||||
"""
|
||||
No caption link returned when video page HTML contains no caption in English
|
||||
"""
|
||||
language_code = 'fr'
|
||||
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
|
||||
|
||||
english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
||||
self.assertIsNone(english_language_caption_link)
|
||||
|
||||
@ mock.patch('requests.get')
|
||||
def test_get_caption_no_captions_in_HTML(self, mock_get):
|
||||
"""
|
||||
No caption link returned when video page HTML contains no captions at all
|
||||
"""
|
||||
mock_get.return_value = YoutubeVideoHTMLResponse.with_no_caption_tracks()
|
||||
|
||||
english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
||||
self.assertEqual(english_language_caption_link, None)
|
||||
|
||||
@ mock.patch('requests.get')
|
||||
def test_get_caption_malformed_caption_locator(self, mock_get):
|
||||
"""
|
||||
|
||||
@@ -182,12 +182,13 @@ def get_transcript_link_from_youtube(youtube_id):
|
||||
try:
|
||||
youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
|
||||
caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX']
|
||||
allowed_language_codes = settings.YOUTUBE['TRANSCRIPTS']['ALLOWED_LANGUAGE_CODES']
|
||||
caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
|
||||
if caption_matched:
|
||||
caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
|
||||
for caption in caption_tracks:
|
||||
if "languageCode" in caption.keys() and caption["languageCode"] == "en":
|
||||
return caption["baseUrl"]
|
||||
if "languageCode" in caption.keys() and caption["languageCode"] in allowed_language_codes:
|
||||
return caption.get("baseUrl")
|
||||
return None
|
||||
except ConnectionError:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user