feat: allow transcripts to work with more than two-letter language codes (#36419)

* feat: allow transcripts to work with more than two-letter language codes

* style: quality

* fixup! Merge branch 'master' into jkantor/video-transcript-codes

* fix: s/LANGUAGES_DICT/LANGUAGE_DICT/
This commit is contained in:
Jansen Kantor
2025-03-27 11:30:44 -04:00
committed by GitHub
parent f99b89c8c5
commit 5e61a1d959
3 changed files with 215 additions and 21 deletions

View File

@@ -1,5 +1,6 @@
""" Tests for transcripts_utils. """
from contextlib import contextmanager
import copy
import json
import re
@@ -1000,3 +1001,116 @@ class TestGetTranscript(SharedModuleStoreTestCase):
output_format=transcripts_utils.Transcript.SRT,
transcripts_info=transcripts_info
)
@ddt.ddt
class TestResolveLanguageCodeToTranscriptCode(unittest.TestCase):
    """
    Exercises transcripts_utils.resolve_language_code_to_transcript_code against a
    fixed set of available transcript languages.
    """
    # Available transcript languages for the fixture; the values are placeholders.
    TEST_OTHER_LANGS = {'ab': 1, 'ab-cd': 1, 'ab-EF': 1, 'cd': 1, 'cd-jk': 1}
    TEST_TRANSCRIPTS = {'transcripts': TEST_OTHER_LANGS, 'sub': False}

    @ddt.unpack
    @ddt.data(
        ('ab', 'ab'),  # present as-is
        ('ab-CD', 'ab-cd'),  # present only in all-lowercase form
        ('ab-ef', 'ab-EF'),  # present only with the part after the dash uppercased
        ('zx', None),  # not present in any form
        ('cd-lmao', 'cd'),  # only the part before the dash is present
    )
    def test_resolve_lang(self, lang, expected):
        """
        The requested code should resolve to the matching transcript key regardless
        of the case of the part after the dash, and to None when nothing matches.
        """
        resolved = transcripts_utils.resolve_language_code_to_transcript_code(
            self.TEST_TRANSCRIPTS,
            lang,
        )
        self.assertEqual(resolved, expected)
class TestGetEndonymOrLabel(unittest.TestCase):
    """
    Covers each lookup stage of transcripts_utils.get_endonym_or_label, from the
    LANGUAGE_DICT setting down to the NotFoundError raised when nothing matches.
    """
    LANG_CODE = 'ab-cd'
    GENERIC_CODE = 'ab'
    LANG_ENTONYM = 'ab language entonym (cd)'
    LANG_LABEL = 'ab-cd language english label'
    GENERIC_LABEL = 'ab language english label'
    TEST_LANGUAGE_DICT = {LANG_CODE: LANG_ENTONYM}
    TEST_ALL_LANGUAGES = (
        ["aa", "Afar"],
        [GENERIC_CODE, GENERIC_LABEL],
        [LANG_CODE, LANG_LABEL],
        ["ur", "Urdu"],
    )

    @contextmanager
    def mock_django_get_language_info(self, side_effect=None):
        """
        Patch get_language_info as imported by transcripts_utils and yield the mock.
        When a side_effect is given it is installed on the mock before yielding.
        """
        target = 'xmodule.video_block.transcripts_utils.get_language_info'
        with patch(target) as mocked_lookup:
            if side_effect:
                mocked_lookup.side_effect = side_effect
            yield mocked_lookup

    @contextmanager
    def _only_all_languages(self, all_languages):
        """
        Make ALL_LANGUAGES the only possible source of a label: LANGUAGE_DICT is
        emptied and get_language_info is mocked to raise KeyError.
        """
        with override_settings(LANGUAGE_DICT={}):
            with self.mock_django_get_language_info(side_effect=KeyError):
                with override_settings(ALL_LANGUAGES=all_languages):
                    yield

    def _all_languages_without(self, *excluded_codes):
        """
        Return TEST_ALL_LANGUAGES as a tuple with the given codes filtered out.
        """
        return tuple(
            entry for entry in self.TEST_ALL_LANGUAGES
            if entry[0] not in excluded_codes
        )

    def test_language_in_languages(self):
        """ A code present in LANGUAGE_DICT resolves to the value stored there """
        with override_settings(LANGUAGE_DICT=self.TEST_LANGUAGE_DICT):
            found = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(found, self.LANG_ENTONYM)

    def test_language_in_django_lang_info(self):
        """
        With an empty LANGUAGE_DICT, the local name reported by get_language_info
        is returned
        """
        with override_settings(LANGUAGE_DICT={}), self.mock_django_get_language_info() as fake_lookup:
            found = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            expected = fake_lookup.return_value['name_local']
            self.assertEqual(found, expected)

    def test_language_exact_in_all_languages(self):
        """
        With nothing in LANGUAGE_DICT or get_language_info, the English label for
        the exact code is taken from ALL_LANGUAGES
        """
        with self._only_all_languages(self.TEST_ALL_LANGUAGES):
            found = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(found, self.LANG_LABEL)

    def test_language_generic_in_all_languages(self):
        """
        With nothing in LANGUAGE_DICT or get_language_info, and the exact code
        missing from ALL_LANGUAGES, the label of the generic code is used.
        """
        without_exact_code = self._all_languages_without(self.LANG_CODE)
        with self._only_all_languages(without_exact_code):
            found = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
            self.assertEqual(found, self.GENERIC_LABEL)

    def test_language_not_found_anywhere(self):
        """
        A NotFoundError is raised when no source knows the exact or generic code
        """
        without_any_match = self._all_languages_without(self.LANG_CODE, self.GENERIC_CODE)
        with self._only_all_languages(without_any_match):
            with self.assertRaises(NotFoundError):
                transcripts_utils.get_endonym_or_label(self.LANG_CODE)

View File

@@ -16,6 +16,7 @@ import requests
import simplejson as json
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.utils.translation import get_language_info
from lxml import etree
from opaque_keys.edx.keys import UsageKeyV2
from pysrt import SubRipFile, SubRipItem, SubRipTime
@@ -883,21 +884,24 @@ class VideoTranscriptsMixin:
"""
sub, other_lang = transcripts["sub"], transcripts["transcripts"]
# language in plugin selector exists as transcript
if dest_lang and dest_lang in other_lang.keys():
transcript_language = dest_lang
# language in plugin selector is english and empty transcripts or transcripts and sub exists
elif dest_lang and dest_lang == 'en' and (not other_lang or (other_lang and sub)):
transcript_language = 'en'
elif self.transcript_language in other_lang:
transcript_language = self.transcript_language
elif sub:
transcript_language = 'en'
elif len(other_lang) > 0:
transcript_language = sorted(other_lang)[0]
else:
transcript_language = 'en'
return transcript_language
if dest_lang:
resolved_transcript_dest_lang = resolve_language_code_to_transcript_code(transcripts, dest_lang)
if resolved_transcript_dest_lang:
return resolved_transcript_dest_lang
# language in plugin selector is english and empty transcripts or transcripts and sub exists
if dest_lang == 'en' and (not other_lang or (other_lang and sub)):
return 'en'
if self.transcript_language in other_lang:
return self.transcript_language
if sub:
return 'en'
if len(other_lang) > 0:
return sorted(other_lang)[0]
return 'en'
def get_transcripts_info(self, is_bumper=False):
"""
@@ -1199,3 +1203,77 @@ def get_transcript(video, lang=None, output_format=Transcript.SRT, youtube_id=No
output_format=output_format,
transcripts_info=transcripts_info
)
def resolve_language_code_to_transcript_code(transcripts, dest_lang):
    """
    Attempts to match the requested dest lang with the existing transcript languages.

    Candidates are tried in this order, and the first one present wins:
      1. `dest_lang` exactly as given
      2. `dest_lang` fully lowercased (`fr-CA` -> `fr-ca`)
      3. lowercase base language with everything after the first dash uppercased
         (`fr-ca` -> `fr-CA`)
      4. the lowercase base language alone (`fr-ca` -> `fr`)
    Steps 2-4 only apply when `dest_lang` contains a dash.

    Arguments:
        transcripts (dict): transcripts info. Only its "transcripts" entry, a
            container of available transcript language codes, is consulted.
        dest_lang (str): the requested language code. May be None or empty.

    Returns:
        The matching transcript language code, or None if nothing matched.
    """
    other_lang = transcripts["transcripts"]

    # lang code exists in list of other transcript languages as-is
    if dest_lang in other_lang:
        return dest_lang

    # Nothing further to derive from a missing code or one without a locale part.
    if not dest_lang or '-' not in dest_lang:
        return None

    # Language codes can be base languages, 2-3 characters, or they can include a
    # locale (`fr` for french, `fr-ca` for canadian french). Sometimes the part after the
    # dash is capitalized, sometimes it is not. Check both variants.
    lowercase_dest_lang = dest_lang.lower()
    if lowercase_dest_lang in other_lang:
        log.debug("language code %s resolved to %s", dest_lang, lowercase_dest_lang)
        return lowercase_dest_lang

    # Split the lowercased string itself so the pieces always line up with it.
    generic_lang_code, dash, locale_part = lowercase_dest_lang.partition('-')
    uppercase_dest_lang = generic_lang_code + dash + locale_part.upper()
    if uppercase_dest_lang in other_lang:
        log.debug("language code %s resolved to %s", dest_lang, uppercase_dest_lang)
        return uppercase_dest_lang

    if generic_lang_code in other_lang:
        log.debug("language code %s resolved to generic %s", dest_lang, generic_lang_code)
        return generic_lang_code

    return None
def get_endonym_or_label(language_code):
    """
    Given a language code, attempt to look up the endonym, or local name, for that language.

    Sources are consulted in this order, and the first hit is returned:
      1. `settings.LANGUAGE_DICT`, keyed by the lowercased code
      2. the `name_local` entry of Django's `get_language_info`
      3. `settings.ALL_LANGUAGES`, matching the code as given or lowercased
      4. `settings.ALL_LANGUAGES`, matching the lowercased part before the first dash

    Arguments:
        language_code (str): the language code to look up, e.g. `es` or `es-419`.

    Returns:
        The endonym if one is found, otherwise the label from `settings.ALL_LANGUAGES`.

    Raises:
        NotFoundError: if none of the sources know the code or its generic form.
    """
    lowercase_code = language_code.lower()

    # LANGUAGE_DICT is an edx-configured mapping of language codes to endonym. It's a bit more
    # specific than the django utility, so try that first. All language codes in this dict
    # are expected to be lowercase
    if local_name := settings.LANGUAGE_DICT.get(lowercase_code):
        return local_name

    # get_language_info attempts to look up language info in a hardcoded table in
    # django.conf.locale. It will do automatic "generalizations", i.e. it doesn't
    # have `es-419` so it then tries `es`. That's why we only do this after checking
    # LANGUAGE_DICT
    try:
        lang_info = get_language_info(language_code)
        return lang_info['name_local']
    except KeyError:
        pass

    # Last place to look is in settings.ALL_LANGUAGES. Ideally we find the actual code,
    # but also, check the 'generic' language. If even the generic language isn't found,
    # something is wrong, so log an error and throw an exception.
    generic_code, dash, _ = lowercase_code.partition('-')
    if not dash:
        # No locale part, so there is no separate generic code to fall back to.
        generic_code = None

    potential_generic_label = None
    # Scan every entry rather than stopping early: the result must not depend on
    # ALL_LANGUAGES being sorted or on the case of the requested code.
    for code, language_label in settings.ALL_LANGUAGES:
        if code in (language_code, lowercase_code):
            return language_label
        if generic_code and code == generic_code:
            potential_generic_label = language_label

    if potential_generic_label:
        return potential_generic_label

    log.error("A label was requested for language code `%s` but the code is completely unknown", language_code)
    raise NotFoundError(f"Unknown language `{language_code}`")

View File

@@ -63,6 +63,7 @@ from .transcripts_utils import (
Transcript,
VideoTranscriptsMixin,
clean_video_id,
get_endonym_or_label,
get_html5_ids,
get_transcript,
subs_filename
@@ -182,12 +183,13 @@ class _BuiltInVideoBlock(
track_url = self.runtime.handler_url(self, 'transcript', 'download').rstrip('/?')
transcript_language = self.get_default_transcript_language(transcripts, dest_lang)
native_languages = {lang: label for lang, label in settings.LANGUAGES if len(lang) == 2}
languages = {
lang: native_languages.get(lang, display)
for lang, display in settings.ALL_LANGUAGES
if lang in other_lang
}
languages = {}
for lang_code in other_lang:
try:
label = get_endonym_or_label(lang_code)
languages[lang_code] = label
except NotFoundError:
continue
if not other_lang or (other_lang and sub):
languages['en'] = 'English'