feat: configure acceptable language codes for youtube transcripts

This commit is contained in:
Navin Karkera
2023-06-24 20:17:41 +05:30
parent 437418d367
commit 93006b476d
4 changed files with 13 additions and 18 deletions

View File

@@ -1536,6 +1536,7 @@ YOUTUBE = {
'TRANSCRIPTS': {
'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
'ALLOWED_LANGUAGE_CODES': ["en", "en-US", "en-GB"],
},
'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080

View File

@@ -2949,6 +2949,7 @@ YOUTUBE = {
'TRANSCRIPTS': {
'CAPTION_TRACKS_REGEX': r"captionTracks\"\:\[(?P<caption_tracks>[^\]]+)",
'YOUTUBE_URL_BASE': 'https://www.youtube.com/watch?v=',
'ALLOWED_LANGUAGE_CODES': ["en", "en-US", "en-GB"],
},
'IMAGE_API': 'http://img.youtube.com/vi/{youtube_id}/0.jpg', # /maxresdefault.jpg for 1920*1080

View File

@@ -7,10 +7,11 @@ These tests follow the following nomenclature:
- among the fields found in a track descriptor is a caption URL (aka caption link)
- use this link to obtain the track's caption data
'''
from ..video_block.transcripts_utils import get_transcript_link_from_youtube
from unittest import mock, TestCase
import ddt
from ..video_block.transcripts_utils import get_transcript_link_from_youtube
YOUTUBE_VIDEO_ID = "z-LoKnweV6w"
@@ -102,43 +103,34 @@ class YoutubeVideoHTMLResponse:
self.content = bytearray(youtube_html, 'UTF-8')
@ddt.ddt
class TranscriptsUtilsTest(TestCase):
"""
Tests utility functions for transcripts (in video_block)
"""
@mock.patch('requests.get')
def test_get_transcript_link_from_youtube(self, mock_get):
@ddt.data("en", "en-US", "en-GB")
def test_get_transcript_link_from_youtube(self, language_code, mock_get):
"""
Happy path test: English caption link returned when video page HTML has one English caption
"""
language_code = 'en'
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code))
@ mock.patch('requests.get')
def test_get_caption_no_english_caption(self, mock_get):
@ddt.data("fr", None)
def test_get_caption_no_english_caption(self, language_code, mock_get):
"""
No caption link returned when video page HTML contains no caption in English
"""
language_code = 'fr'
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
self.assertIsNone(english_language_caption_link)
@mock.patch('requests.get')
def test_get_caption_no_captions_in_HTML(self, mock_get):
    """
    No caption link returned when video page HTML contains no captions at all
    """
    # requests.get is patched, so the function under test receives this canned response
    # instead of performing a real HTTP request.
    mock_get.return_value = YoutubeVideoHTMLResponse.with_no_caption_tracks()
    english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
    # assertIsNone is consistent with the no-English-caption test and reports a clearer
    # failure message than assertEqual(..., None).
    self.assertIsNone(english_language_caption_link)
@ mock.patch('requests.get')
def test_get_caption_malformed_caption_locator(self, mock_get):
"""

View File

@@ -182,12 +182,13 @@ def get_transcript_link_from_youtube(youtube_id):
try:
youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX']
allowed_language_codes = settings.YOUTUBE['TRANSCRIPTS']['ALLOWED_LANGUAGE_CODES']
caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
if caption_matched:
caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
for caption in caption_tracks:
if "languageCode" in caption.keys() and caption["languageCode"] == "en":
return caption["baseUrl"]
if "languageCode" in caption.keys() and caption["languageCode"] in allowed_language_codes:
return caption.get("baseUrl")
return None
except ConnectionError:
return None