CMS youtube transcript tests call GET twice & need different responses on each of the two calls Current solution (setup_caption_responses) decides what to return on basis of call number. Former solution (mock_request_get()) decided what to return on the basis of kwargs, which would differ on first vs. second call
152 lines
6.8 KiB
Python
152 lines
6.8 KiB
Python
''' Tests mechanism for obtaining language-specific transcript links from youtube video pages
|
|
Note that tests that work with these links are located elsewhere (test_video.py)
|
|
|
|
These tests follow the following nomenclature:
|
|
- a youtube video page supports one or more caption languages (aka 'tracks')
|
|
- embedded in the page's HTML are track descriptors
|
|
- among the fields found in a track descriptor is a caption URL (aka caption link)
|
|
- use this link to obtain the track's caption data
|
|
'''
|
|
from ..video_module.transcripts_utils import get_transcript_link_from_youtube
|
|
|
|
from unittest import mock, TestCase
|
|
|
|
|
|
YOUTUBE_VIDEO_ID = "z-LoKnweV6w"
|
|
|
|
# Use CAPTION_URL_TEMPLATE.format(<youtube_video_ID>, <ampersand_encoding>, <language_code>)
|
|
# to get either the HTML rendition or the json rendition of the link used to obtain a youtube video's subtitles
|
|
#
|
|
# Parameterized with
|
|
# {0} - The youtube video ID whose captions you want
|
|
# {1} - Either \u0026 for use with UTF-8 encoded HTML, or '&' for use with json
|
|
# {2} - Language code (e.g., "en")
|
|
CAPTION_URL_TEMPLATE = "https://www.youtube.com/api/timedtext?v = {0}{1}caps = asr{1}xoaf = 5{1}hl = {2}{1}\
|
|
ip = 0.0.0.0{1}ipbits = 0{1}expire = 1667281544{1}sparams = ip, ipbits, expire, v, caps, xoaf{1}\
|
|
signature = 3A2A34F0A1FB11B3825FF54D4238B6CC415877E8.058892{1}key = yt8{1}kind = asr{1}lang = {2}"
|
|
|
|
UTF8_AMPERSAND = '\\u0026'
|
|
|
|
# These caption link templates have tailored uses and parameterize on the language code only
|
|
#
|
|
# Parameterized with
|
|
# {0} - Language code (e.g., "en")
|
|
CAPTION_URL_UTF8_ENCODED_TEMPLATE = CAPTION_URL_TEMPLATE.format(YOUTUBE_VIDEO_ID, UTF8_AMPERSAND, "{0}")
|
|
CAPTION_URL_UTF8_DECODED_TEMPLATE = CAPTION_URL_TEMPLATE.format(YOUTUBE_VIDEO_ID, "&", "{0}")
|
|
|
|
# Macro providing the HTML returned by our mock GET operation on the youtube video page
|
|
#
|
|
# This template is hard-wired for a video with a single language track
|
|
#
|
|
# This is not valid HTML, but that's OK, as we'll only be using it to confirm the regex
|
|
# search on the 'playerCaptionsTrackListRenderer' subtree.
|
|
#
|
|
# Packed quality (i.e., no spaces) is essential for these tests to work! Introduction
|
|
# of spaces before or after the colon signs causes the regex matching to stop working.
|
|
#
|
|
# Parameterized with
|
|
# {0} - the URL that obtains the selected video's caption
|
|
# {1} - Language code (e.g., "en")
|
|
YOUTUBE_HTML_TEMPLATE = "<b>HTML content that comes before the caption tracks...</b>" \
|
|
"\"captions\":{{\"playerCaptionsTracklistRenderer\":" \
|
|
"{{\"captionTracks\":[{{\"baseUrl\":\"{0}\"," \
|
|
"\"name\":{{\"simpleText\":\"(Caption language name in local language)\"}}," \
|
|
"\"vssId\":\".{1}\",\"languageCode\":\"{1}\"," \
|
|
"\"isTranslatable\":true}}]}}}}<b>HTML content that comes after the caption tracks ...</b>"
|
|
|
|
|
|
class YoutubeVideoHTMLResponse:
|
|
"""
|
|
Generates substitute HTTP GET responses used when mocking the GET operation to a youtube video page
|
|
"""
|
|
|
|
@classmethod
|
|
def with_caption_track(cls, language_code):
|
|
"""
|
|
Generates a GET response of HTML with a single caption track for the specified
|
|
language code language_code = "en" for english
|
|
"""
|
|
caption_link = CAPTION_URL_UTF8_ENCODED_TEMPLATE.format(language_code)
|
|
html_with_single_caption_track = YOUTUBE_HTML_TEMPLATE.format(caption_link, language_code)
|
|
return cls.MockResponse(html_with_single_caption_track)
|
|
|
|
@classmethod
|
|
def with_no_caption_tracks(cls):
|
|
"""
|
|
Generates a GET response of (invalid) HTML lacking any caption tracks within it.
|
|
This fake HTML is nevered rendered; it's only intended as a source for a regex
|
|
search
|
|
"""
|
|
html_with_no_caption_tracks = "<b>No caption URL info for regex to find here</b>"
|
|
return cls.MockResponse(html_with_no_caption_tracks)
|
|
|
|
@classmethod
|
|
def with_malformed_caption_track(cls, language_code):
|
|
"""
|
|
Generates a GET response of HTML with a single caption of the specified
|
|
language code language_code = "en" for english
|
|
"""
|
|
caption_link = CAPTION_URL_UTF8_ENCODED_TEMPLATE.format(language_code)
|
|
html_with_single_valid_caption_track = YOUTUBE_HTML_TEMPLATE.format(caption_link, language_code)
|
|
html_with_single_malformed_caption_track = \
|
|
html_with_single_valid_caption_track.replace('languageCode', 'bogus_key')
|
|
return cls.MockResponse(html_with_single_malformed_caption_track)
|
|
|
|
class MockResponse:
|
|
"""
|
|
An object fit to be returned from a an HTTP GET operation, exposing
|
|
a UTF-8 encoded version of the youtube_html input string in its content attribute
|
|
"""
|
|
def __init__(self, youtube_html):
|
|
self.status_code = 200
|
|
self.content = bytearray(youtube_html, 'UTF-8')
|
|
|
|
|
|
class TranscriptsUtilsTest(TestCase):
|
|
"""
|
|
Tests utility fucntions for transcripts (in video_module)
|
|
"""
|
|
|
|
@mock.patch('requests.get')
|
|
def test_get_transcript_link_from_youtube(self, mock_get):
|
|
"""
|
|
Happy path test: english caption link returned when video page HTML has one english caption
|
|
"""
|
|
language_code = 'en'
|
|
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
|
|
|
|
language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
|
self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code))
|
|
|
|
@ mock.patch('requests.get')
|
|
def test_get_caption_no_english_caption(self, mock_get):
|
|
"""
|
|
No caption link returned when video page HTML contains no caption in English
|
|
"""
|
|
language_code = 'fr'
|
|
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)
|
|
|
|
english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
|
self.assertIsNone(english_language_caption_link)
|
|
|
|
@ mock.patch('requests.get')
|
|
def test_get_caption_no_captions_in_HTML(self, mock_get):
|
|
"""
|
|
No caption link returned when video page HTML contains no captions at all
|
|
"""
|
|
mock_get.return_value = YoutubeVideoHTMLResponse.with_no_caption_tracks()
|
|
|
|
english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
|
self.assertEqual(english_language_caption_link, None)
|
|
|
|
@ mock.patch('requests.get')
|
|
def test_get_caption_malformed_caption_locator(self, mock_get):
|
|
"""
|
|
Caption track provided on video page for the selected language, but with broken syntax
|
|
"""
|
|
language_code = 'en'
|
|
mock_get.return_value = YoutubeVideoHTMLResponse.with_malformed_caption_track(language_code)
|
|
|
|
english_language_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
|
|
self.assertIsNone(english_language_caption_link)
|