diff --git a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
index 15a17faa0f..ea56394bd6 100644
--- a/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
+++ b/cms/djangoapps/contentstore/tests/test_transcripts_utils.py
@@ -1,5 +1,6 @@
 """ Tests for transcripts_utils. """
 
+from contextlib import contextmanager
 import copy
 import json
 import re
@@ -1000,3 +1001,132 @@ class TestGetTranscript(SharedModuleStoreTestCase):
                 output_format=transcripts_utils.Transcript.SRT,
                 transcripts_info=transcripts_info
             )
+
+
+@ddt.ddt
+class TestResolveLanguageCodeToTranscriptCode(unittest.TestCase):
+    """ Tests for resolve_language_code_to_transcript_code """
+    TEST_OTHER_LANGS = {'ab': 1, 'ab-cd': 1, 'ab-EF': 1, 'cd': 1, 'cd-jk': 1}
+    TEST_TRANSCRIPTS = {'transcripts': TEST_OTHER_LANGS, 'sub': False}
+
+    @ddt.unpack
+    @ddt.data(
+        ('ab', 'ab'),
+        ('ab-CD', 'ab-cd'),
+        ('ab-ef', 'ab-EF'),
+        ('zx', None),
+        ('cd-lmao', 'cd'),
+    )
+    def test_resolve_lang(self, lang, expected):
+        """
+        Test that resolve_language_code_to_transcript_code will successfully match
+        language codes of different cases, and return None if it isn't found
+        """
+        self.assertEqual(
+            transcripts_utils.resolve_language_code_to_transcript_code(self.TEST_TRANSCRIPTS, lang),
+            expected
+        )
+
+
+class TestGetEndonymOrLabel(unittest.TestCase):
+    """
+    tests for the get_endonym_or_label function
+    """
+    LANG_CODE = 'ab-cd'
+    GENERIC_CODE = 'ab'
+    LANG_ENTONYM = 'ab language entonym (cd)'
+    LANG_LABEL = 'ab-cd language english label'
+    GENERIC_LABEL = 'ab language english label'
+
+    TEST_LANGUAGE_DICT = {LANG_CODE: LANG_ENTONYM}
+    TEST_ALL_LANGUAGES = (
+        ["aa", "Afar"],
+        [GENERIC_CODE, GENERIC_LABEL],
+        [LANG_CODE, LANG_LABEL],
+        ["ur", "Urdu"],
+    )
+
+    @contextmanager
+    def mock_django_get_language_info(self, side_effect=None):
+        """
+        Helper for cleaner mocking
+        """
+        with patch('xmodule.video_block.transcripts_utils.get_language_info') as mock_get:
+            if side_effect:
+                mock_get.side_effect = side_effect
+            yield mock_get
+
+    def test_language_in_languages(self):
+        """ If language is found in LANGUAGE_DICT that value should be returned """
+        with override_settings(LANGUAGE_DICT=self.TEST_LANGUAGE_DICT):
+            self.assertEqual(
+                transcripts_utils.get_endonym_or_label(self.LANG_CODE),
+                self.LANG_ENTONYM
+            )
+
+    def test_language_in_django_lang_info(self):
+        """
+        If language is not found in LANGUAGE_DICT, check get_language_info and return that
+        local name if found
+        """
+        with override_settings(LANGUAGE_DICT={}):
+            with self.mock_django_get_language_info() as mock_get_language_info:
+                self.assertEqual(
+                    transcripts_utils.get_endonym_or_label(self.LANG_CODE),
+                    mock_get_language_info.return_value['name_local']
+                )
+
+    def test_language_exact_in_all_languages(self):
+        """
+        If not found in LANGUAGE_DICT or get_language_info, check in
+        ALL_LANGUAGES for the English language name
+        """
+        with override_settings(LANGUAGE_DICT={}):
+            with self.mock_django_get_language_info(side_effect=KeyError):
+                with override_settings(ALL_LANGUAGES=self.TEST_ALL_LANGUAGES):
+                    label = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
+                    self.assertEqual(label, self.LANG_LABEL)
+
+    def test_language_generic_in_all_languages(self):
+        """
+        If not found in LANGUAGE_DICT or get_language_info, and the exact code
+        wasn't found in ALL_LANGUAGES, use the generic code if it is found in ALL_LANGUAGES.
+        """
+        all_languages = (
+            self.TEST_ALL_LANGUAGES[0],
+            self.TEST_ALL_LANGUAGES[1],
+            self.TEST_ALL_LANGUAGES[3]
+        )
+
+        with override_settings(LANGUAGE_DICT={}):
+            with self.mock_django_get_language_info(side_effect=KeyError):
+                with override_settings(ALL_LANGUAGES=all_languages):
+                    label = transcripts_utils.get_endonym_or_label(self.LANG_CODE)
+                    self.assertEqual(label, self.GENERIC_LABEL)
+
+    def test_language_mixed_case_in_all_languages(self):
+        """
+        A code whose casing differs from ALL_LANGUAGES must still resolve to its exact
+        label, even when another code sits between the generic and the exact entries
+        """
+        all_languages = (
+            [self.GENERIC_CODE, self.GENERIC_LABEL],
+            ["ab-aa", "other ab language english label"],
+            [self.LANG_CODE, self.LANG_LABEL],
+        )
+        with override_settings(LANGUAGE_DICT={}):
+            with self.mock_django_get_language_info(side_effect=KeyError):
+                with override_settings(ALL_LANGUAGES=all_languages):
+                    label = transcripts_utils.get_endonym_or_label('ab-CD')
+                    self.assertEqual(label, self.LANG_LABEL)
+
+    def test_language_not_found_anywhere(self):
+        """
+        Raise a NotFoundError if the language isn't found anywhere
+        """
+        all_languages = (self.TEST_ALL_LANGUAGES[0], self.TEST_ALL_LANGUAGES[3])
+        with override_settings(LANGUAGE_DICT={}):
+            with self.mock_django_get_language_info(side_effect=KeyError):
+                with override_settings(ALL_LANGUAGES=all_languages):
+                    with self.assertRaises(NotFoundError):
+                        transcripts_utils.get_endonym_or_label(self.LANG_CODE)
diff --git a/xmodule/video_block/transcripts_utils.py b/xmodule/video_block/transcripts_utils.py
index c08f9e4696..1e2204d1da 100644
--- a/xmodule/video_block/transcripts_utils.py
+++ b/xmodule/video_block/transcripts_utils.py
@@ -16,6 +16,7 @@ import requests
 import simplejson as json
 from django.conf import settings
 from django.core.exceptions import ObjectDoesNotExist
+from django.utils.translation import get_language_info
 from lxml import etree
 from opaque_keys.edx.keys import UsageKeyV2
 from pysrt import SubRipFile, SubRipItem, SubRipTime
@@ -883,21 +884,24 @@ class VideoTranscriptsMixin:
         """
         sub, other_lang = transcripts["sub"], transcripts["transcripts"]
 
-        # language in plugin selector exists as transcript
-        if dest_lang and dest_lang in other_lang.keys():
-            transcript_language = dest_lang
-        # language in plugin selector is english and empty transcripts or transcripts and sub exists
-        elif dest_lang and dest_lang == 'en' and (not other_lang or (other_lang and sub)):
-            transcript_language = 'en'
-        elif self.transcript_language in other_lang:
-            transcript_language = self.transcript_language
-        elif sub:
-            transcript_language = 'en'
-        elif len(other_lang) > 0:
-            transcript_language = sorted(other_lang)[0]
-        else:
-            transcript_language = 'en'
-        return transcript_language
+        if dest_lang:
+            resolved_transcript_dest_lang = resolve_language_code_to_transcript_code(transcripts, dest_lang)
+            if resolved_transcript_dest_lang:
+                return resolved_transcript_dest_lang
+            # language in plugin selector is english and empty transcripts or transcripts and sub exists
+            if dest_lang == 'en' and (not other_lang or (other_lang and sub)):
+                return 'en'
+
+        if self.transcript_language in other_lang:
+            return self.transcript_language
+
+        if sub:
+            return 'en'
+
+        if other_lang:
+            return sorted(other_lang)[0]
+
+        return 'en'
 
     def get_transcripts_info(self, is_bumper=False):
         """
@@ -1199,3 +1203,101 @@ def get_transcript(video, lang=None, output_format=Transcript.SRT, youtube_id=No
             output_format=output_format,
             transcripts_info=transcripts_info
         )
+
+
+def resolve_language_code_to_transcript_code(transcripts, dest_lang):
+    """
+    Attempts to match the requested dest lang with the existing transcript languages.
+
+    Candidates are tried in order: the code as given and then, only when the code
+    contains a dash, the lowercased code, the code with its locale part uppercased
+    (`ab-EF`) and finally the generic language (`ab`).
+
+    Arguments:
+        transcripts (dict): transcripts info; only its "transcripts" entry (the
+            available transcript language codes) is consulted
+        dest_lang (str): the requested language code
+
+    Returns:
+        str or None: the matching transcript language code, or None if nothing matches
+    """
+    other_lang = transcripts["transcripts"]
+    # lang code exists in list of other transcript languages as-is
+    if dest_lang in other_lang:
+        return dest_lang
+
+    # Language codes can be base languages, 2-3 characters, or they can include a
+    # locale (`fr` for french, `fr-ca` for canadian french). Sometimes the part after the
+    # dash is capitalized, sometimes it is not. Check both variants.
+    dash_index = dest_lang.find('-')
+    if dash_index < 0:
+        return None
+
+    lowercase_dest_lang = dest_lang.lower()
+    if lowercase_dest_lang in other_lang:
+        log.debug("language code %s resolved to %s", dest_lang, lowercase_dest_lang)
+        return lowercase_dest_lang
+
+    generic_lang_code = lowercase_dest_lang[:dash_index]
+    uppercase_dest_lang = generic_lang_code + lowercase_dest_lang[dash_index:].upper()
+    if uppercase_dest_lang in other_lang:
+        log.debug("language code %s resolved to %s", dest_lang, uppercase_dest_lang)
+        return uppercase_dest_lang
+
+    if generic_lang_code in other_lang:
+        log.debug("language code %s resolved to generic %s", dest_lang, generic_lang_code)
+        return generic_lang_code
+
+    return None
+
+
+def get_endonym_or_label(language_code):
+    """
+    Given a language code, attempt to look up the endonym, or local name, for that language.
+
+    Sources are tried in order: settings.LANGUAGE_DICT, django's get_language_info and
+    finally the label in settings.ALL_LANGUAGES (exact code first, then generic code).
+
+    Arguments:
+        language_code (str): the language code, e.g. `es` or `es-419`
+
+    Returns:
+        str: the endonym if one is known, otherwise the ALL_LANGUAGES label
+
+    Raises:
+        NotFoundError: neither the code nor its generic language is known anywhere
+    """
+
+    lowercase_code = language_code.lower()
+    # LANGUAGE_DICT is an edx-configured mapping of language codes to endonym. It's a bit more
+    # specific than the django utility, so try that first. All language codes in this dict will
+    # be lowercase
+    if local_name := settings.LANGUAGE_DICT.get(lowercase_code):
+        return local_name
+
+    # get_language_info attempts to look up language info in a hardcoded list in
+    # django.conf.locale (LANG_INFO). It will do automatic "generalizations", i.e. it
+    # doesn't have `es-419` so it then tries `es`. That's why we only do this after
+    # checking LANGUAGE_DICT
+    try:
+        lang_info = get_language_info(language_code)
+        return lang_info['name_local']
+    except KeyError:
+        pass
+
+    # Last place to look is in settings.ALL_LANGUAGES. Ideally we find the actual code,
+    # but also, check the 'generic' language. If even the generic language isn't found,
+    # something is wrong, so log an error and throw an exception.
+    #
+    # Look codes up by key instead of scanning with an early exit: ALL_LANGUAGES is not
+    # guaranteed to be sorted, and uppercase letters sort before lowercase ones, so an
+    # ordering comparison against a mixed-case code can stop before the real match.
+    all_languages = dict(settings.ALL_LANGUAGES)
+    # generic codes are expected to be lowercase; without a dash this equals lowercase_code
+    generic_code = lowercase_code.partition('-')[0]
+    for candidate in (language_code, lowercase_code, generic_code):
+        if candidate in all_languages:
+            return all_languages[candidate]
+
+    log.error("A label was requested for language code `%s` but the code is completely unknown", language_code)
+    raise NotFoundError(f"Unknown language `{language_code}`")
diff --git a/xmodule/video_block/video_block.py b/xmodule/video_block/video_block.py
index c8807affa6..b0e01068fb 100644
--- a/xmodule/video_block/video_block.py
+++ b/xmodule/video_block/video_block.py
@@ -63,6 +63,7 @@ from .transcripts_utils import (
     Transcript,
     VideoTranscriptsMixin,
     clean_video_id,
+    get_endonym_or_label,
     get_html5_ids,
     get_transcript,
     subs_filename
@@ -182,12 +183,13 @@ class _BuiltInVideoBlock(
         track_url = self.runtime.handler_url(self, 'transcript', 'download').rstrip('/?')
 
         transcript_language = self.get_default_transcript_language(transcripts, dest_lang)
-        native_languages = {lang: label for lang, label in settings.LANGUAGES if len(lang) == 2}
-        languages = {
-            lang: native_languages.get(lang, display)
-            for lang, display in settings.ALL_LANGUAGES
-            if lang in other_lang
-        }
+        languages = {}
+        for lang_code in other_lang:
+            try:
+                label = get_endonym_or_label(lang_code)
+                languages[lang_code] = label
+            except NotFoundError:
+                continue
 
         if not other_lang or (other_lang and sub):
             languages['en'] = 'English'