feat: allow transcripts to work with more than two-letter language codes (#36419)
* feat: allow transcripts to work with more than two-letter language codes * style: quality * fixup! Merge branch 'master' into jkantor/video-transcript-codes * fix: s/LANGUAGES_DICT/LANGUAGE_DICT/
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
""" Tests for transcripts_utils. """
|
||||
|
||||
from contextlib import contextmanager
|
||||
import copy
|
||||
import json
|
||||
import re
|
||||
@@ -1000,3 +1001,116 @@ class TestGetTranscript(SharedModuleStoreTestCase):
|
||||
output_format=transcripts_utils.Transcript.SRT,
|
||||
transcripts_info=transcripts_info
|
||||
)
|
||||
|
||||
|
||||
@ddt.ddt
class TestResolveLanguageCodeToTranscriptCode(unittest.TestCase):
    """ Tests for resolve_language_code_to_transcript_code """
    # Language codes of the transcripts that "exist"; only the keys are consulted.
    TEST_OTHER_LANGS = dict.fromkeys(('ab', 'ab-cd', 'ab-EF', 'cd', 'cd-jk'), 1)
    TEST_TRANSCRIPTS = {'sub': False, 'transcripts': TEST_OTHER_LANGS}

    @ddt.unpack
    @ddt.data(
        ('ab', 'ab'),        # exact match
        ('ab-CD', 'ab-cd'),  # matches once the whole code is lowercased
        ('ab-ef', 'ab-EF'),  # matches once the part after the dash is uppercased
        ('zx', None),        # unknown language
        ('cd-lmao', 'cd'),   # unknown locale falls back to the generic language
    )
    def test_resolve_lang(self, lang, expected):
        """
        Requested codes should resolve to the matching transcript code regardless
        of the case of the locale part, and to None when nothing matches.
        """
        resolved = transcripts_utils.resolve_language_code_to_transcript_code(
            self.TEST_TRANSCRIPTS,
            lang,
        )
        self.assertEqual(resolved, expected)
|
||||
|
||||
|
||||
class TestGetEndonymOrLabel(unittest.TestCase):
    """
    Tests for the get_endonym_or_label function.

    The function consults three sources in order: settings.LANGUAGE_DICT,
    django's get_language_info, and finally settings.ALL_LANGUAGES. Each test
    disables the earlier sources to exercise the next one.
    """
    LANG_CODE = 'ab-cd'
    GENERIC_CODE = 'ab'
    LANG_ENTONYM = 'ab language entonym (cd)'
    LANG_LABEL = 'ab-cd language english label'
    GENERIC_LABEL = 'ab language english label'

    TEST_LANGUAGE_DICT = {LANG_CODE: LANG_ENTONYM}
    TEST_ALL_LANGUAGES = (
        ["aa", "Afar"],
        [GENERIC_CODE, GENERIC_LABEL],
        [LANG_CODE, LANG_LABEL],
        ["ur", "Urdu"],
    )

    @contextmanager
    def mock_django_get_language_info(self, side_effect=None):
        """
        Patch get_language_info as seen by transcripts_utils, optionally making
        it raise/return `side_effect`, and yield the mock.
        """
        target = 'xmodule.video_block.transcripts_utils.get_language_info'
        with patch(target) as patched_lookup:
            if side_effect:
                patched_lookup.side_effect = side_effect
            yield patched_lookup

    @contextmanager
    def _only_all_languages(self, all_languages):
        """
        Empty LANGUAGE_DICT, make get_language_info raise KeyError, and install
        `all_languages` as settings.ALL_LANGUAGES, so that only the last lookup
        source can produce a label.
        """
        with override_settings(LANGUAGE_DICT={}), \
                self.mock_django_get_language_info(side_effect=KeyError), \
                override_settings(ALL_LANGUAGES=all_languages):
            yield

    def test_language_in_languages(self):
        """ If language is found in LANGUAGE_DICT that value should be returned """
        with override_settings(LANGUAGE_DICT=self.TEST_LANGUAGE_DICT):
            found = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(found, self.LANG_ENTONYM)

    def test_language_in_django_lang_info(self):
        """
        When LANGUAGE_DICT has no entry, the local name reported by
        get_language_info should be returned.
        """
        with override_settings(LANGUAGE_DICT={}), \
                self.mock_django_get_language_info() as mock_get_language_info:
            found = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(found, mock_get_language_info.return_value['name_local'])

    def test_language_exact_in_all_languages(self):
        """
        When neither LANGUAGE_DICT nor get_language_info knows the code, the
        English label for the exact code in ALL_LANGUAGES should be returned.
        """
        with self._only_all_languages(self.TEST_ALL_LANGUAGES):
            label = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(label, self.LANG_LABEL)

    def test_language_generic_in_all_languages(self):
        """
        When the exact code is also missing from ALL_LANGUAGES, the label of
        the generic code should be returned if ALL_LANGUAGES has it.
        """
        # Everything except the exact 'ab-cd' entry.
        all_languages = tuple(
            entry for entry in self.TEST_ALL_LANGUAGES if entry[0] != self.LANG_CODE
        )

        with self._only_all_languages(all_languages):
            label = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(label, self.GENERIC_LABEL)

    def test_language_not_found_anywhere(self):
        """
        Raise a NotFoundError if the language isn't found anywhere
        """
        # Neither the exact 'ab-cd' entry nor the generic 'ab' entry.
        excluded = (self.LANG_CODE, self.GENERIC_CODE)
        all_languages = tuple(
            entry for entry in self.TEST_ALL_LANGUAGES if entry[0] not in excluded
        )

        with self._only_all_languages(all_languages):
            with self.assertRaises(NotFoundError):
                transcripts_utils.get_endonym_or_label(self.LANG_CODE)
|
||||
|
||||
@@ -16,6 +16,7 @@ import requests
|
||||
import simplejson as json
|
||||
from django.conf import settings
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
from django.utils.translation import get_language_info
|
||||
from lxml import etree
|
||||
from opaque_keys.edx.keys import UsageKeyV2
|
||||
from pysrt import SubRipFile, SubRipItem, SubRipTime
|
||||
@@ -883,21 +884,24 @@ class VideoTranscriptsMixin:
|
||||
"""
|
||||
sub, other_lang = transcripts["sub"], transcripts["transcripts"]
|
||||
|
||||
# language in plugin selector exists as transcript
|
||||
if dest_lang and dest_lang in other_lang.keys():
|
||||
transcript_language = dest_lang
|
||||
# language in plugin selector is english and empty transcripts or transcripts and sub exists
|
||||
elif dest_lang and dest_lang == 'en' and (not other_lang or (other_lang and sub)):
|
||||
transcript_language = 'en'
|
||||
elif self.transcript_language in other_lang:
|
||||
transcript_language = self.transcript_language
|
||||
elif sub:
|
||||
transcript_language = 'en'
|
||||
elif len(other_lang) > 0:
|
||||
transcript_language = sorted(other_lang)[0]
|
||||
else:
|
||||
transcript_language = 'en'
|
||||
return transcript_language
|
||||
if dest_lang:
|
||||
resolved_transcript_dest_lang = resolve_language_code_to_transcript_code(transcripts, dest_lang)
|
||||
if resolved_transcript_dest_lang:
|
||||
return resolved_transcript_dest_lang
|
||||
# language in plugin selector is english and empty transcripts or transcripts and sub exists
|
||||
if dest_lang == 'en' and (not other_lang or (other_lang and sub)):
|
||||
return 'en'
|
||||
|
||||
if self.transcript_language in other_lang:
|
||||
return self.transcript_language
|
||||
|
||||
if sub:
|
||||
return 'en'
|
||||
|
||||
if len(other_lang) > 0:
|
||||
return sorted(other_lang)[0]
|
||||
|
||||
return 'en'
|
||||
|
||||
def get_transcripts_info(self, is_bumper=False):
|
||||
"""
|
||||
@@ -1199,3 +1203,77 @@ def get_transcript(video, lang=None, output_format=Transcript.SRT, youtube_id=No
|
||||
output_format=output_format,
|
||||
transcripts_info=transcripts_info
|
||||
)
|
||||
|
||||
|
||||
def resolve_language_code_to_transcript_code(transcripts, dest_lang):
    """
    Attempts to match the requested dest lang with the existing transcript languages

    Arguments:
        transcripts (dict): transcripts info. Only the "transcripts" entry, a
            collection keyed by language code, is consulted.
        dest_lang (str): the requested language code, e.g. `fr` or `fr-CA`.

    Returns:
        The key of the best matching transcript language, tried in this order:
        the code as-is, the fully lowercased code, the code with the part after
        the first dash uppercased, and finally the generic language before the
        dash. None when nothing matches.
    """
    other_lang = transcripts["transcripts"]

    # lang code exists in list of other transcript languages as-is
    if dest_lang in other_lang:
        return dest_lang

    # Language codes can be base languages, 2-3 characters, or they can include a
    # locale (`fr` for french, `fr-ca` for canadian french). Sometimes the part after the
    # dash is capitalized, sometimes it is not. Check both variants.
    lowercase_dest_lang = dest_lang.lower()
    generic_lang_code, dash, locale = lowercase_dest_lang.partition('-')
    if not dash:
        # A code without a locale has no other variants to try.
        return None

    # Ordered from most to least specific; the first hit wins.
    candidates = (
        (lowercase_dest_lang, "language code %s resolved to %s"),
        (f"{generic_lang_code}-{locale.upper()}", "language code %s resolved to %s"),
        (generic_lang_code, "language code %s resolved to generic %s"),
    )
    for candidate, message in candidates:
        if candidate in other_lang:
            log.debug(message, dest_lang, candidate)
            return candidate

    return None
|
||||
|
||||
|
||||
def get_endonym_or_label(language_code):
    """
    Given a language code, attempt to look up the endonym, or local name, for that language

    Lookup order:
        1. settings.LANGUAGE_DICT, keyed by the lowercased code.
        2. django's get_language_info, using its `name_local`.
        3. settings.ALL_LANGUAGES: the label for the exact code, or failing
           that the label for the generic language before the first dash.

    Arguments:
        language_code (str): a language code such as `es` or `es-419`.

    Returns:
        The endonym or label found for the code.

    Raises:
        NotFoundError: if none of the sources know the code.
    """
    lowercase_code = language_code.lower()

    # LANGUAGE_DICT is an edx-configured mapping of language codes to endonym. It's a bit more
    # specific than the django utility, so try that first. All language codes in this dict will
    # be lowercase
    if local_name := settings.LANGUAGE_DICT.get(lowercase_code):
        return local_name

    # get_language_info attempts to look up language info in a hardcoded list in
    # django.conf.locale. It will do automatic "generalizations", i.e. it doesn't
    # have `es-419` so it then tries `es`. That's why we only do this after checking
    # LANGUAGE_DICT
    try:
        return get_language_info(language_code)['name_local']
    except KeyError:
        pass

    # Last place to look is in settings.ALL_LANGUAGES. Ideally we find the actual code,
    # but also, check the 'generic' language. If even the generic language isn't found,
    # something is wrong, so log an error and throw an exception.
    # Accept the generic code both as given and lowercased, mirroring how the
    # full code is matched below.
    generic_codes = set()
    if '-' in language_code:
        generic_codes = {
            language_code.partition('-')[0],
            lowercase_code.partition('-')[0],
        }

    potential_generic_label = None
    # Scan the whole list: ALL_LANGUAGES is a setting and is not guaranteed to be
    # sorted, and a case-sensitive ordering shortcut can skip valid matches.
    for code, language_label in settings.ALL_LANGUAGES:
        if code in (language_code, lowercase_code):
            return language_label
        if code in generic_codes:
            potential_generic_label = language_label

    if potential_generic_label:
        return potential_generic_label

    log.error("A label was requested for language code `%s` but the code is completely unknown", language_code)
    raise NotFoundError(f"Unknown language `{language_code}`")
|
||||
|
||||
@@ -63,6 +63,7 @@ from .transcripts_utils import (
|
||||
Transcript,
|
||||
VideoTranscriptsMixin,
|
||||
clean_video_id,
|
||||
get_endonym_or_label,
|
||||
get_html5_ids,
|
||||
get_transcript,
|
||||
subs_filename
|
||||
@@ -182,12 +183,13 @@ class _BuiltInVideoBlock(
|
||||
track_url = self.runtime.handler_url(self, 'transcript', 'download').rstrip('/?')
|
||||
|
||||
transcript_language = self.get_default_transcript_language(transcripts, dest_lang)
|
||||
native_languages = {lang: label for lang, label in settings.LANGUAGES if len(lang) == 2}
|
||||
languages = {
|
||||
lang: native_languages.get(lang, display)
|
||||
for lang, display in settings.ALL_LANGUAGES
|
||||
if lang in other_lang
|
||||
}
|
||||
languages = {}
|
||||
for lang_code in other_lang:
|
||||
try:
|
||||
label = get_endonym_or_label(lang_code)
|
||||
languages[lang_code] = label
|
||||
except NotFoundError:
|
||||
continue
|
||||
|
||||
if not other_lang or (other_lang and sub):
|
||||
languages['en'] = 'English'
|
||||
|
||||
Reference in New Issue
Block a user