From 27b8c83b346d87ca69c16f131068a864191135c3 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Mon, 23 Dec 2013 15:26:44 -0500 Subject: [PATCH 1/8] Mark XModule strings for i18n --- common/lib/xmodule/xmodule/js/src/capa/display.coffee | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/lib/xmodule/xmodule/js/src/capa/display.coffee b/common/lib/xmodule/xmodule/js/src/capa/display.coffee index 91b83e21f3..f65860b7ab 100644 --- a/common/lib/xmodule/xmodule/js/src/capa/display.coffee +++ b/common/lib/xmodule/xmodule/js/src/capa/display.coffee @@ -405,7 +405,7 @@ class @Problem formulaequationinput: (element) -> $(element).find('input').on 'input', -> $p = $(element).find('p.status') - `// Translators: the word Answer here is about answering a problem the student must solve.` + `// Translators: the word unanswered here is about answering a problem the student must solve.` $p.text gettext("unanswered") $p.parent().removeClass().addClass "unanswered" @@ -434,7 +434,7 @@ class @Problem textline: (element) -> $(element).find('input').on 'input', -> $p = $(element).find('p.status') - `// Translators: the word Answer here is about answering a problem the student must solve.` + `// Translators: the word unanswered here is about answering a problem the student must solve.` $p.text gettext("unanswered") $p.parent().removeClass().addClass "unanswered" From 6d5e13e24548d3ab29b8f350b7ceabe006c0ac2f Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Mon, 23 Dec 2013 15:27:02 -0500 Subject: [PATCH 2/8] Use the same marker everywhere for translator comments. --- i18n/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/i18n/extract.py b/i18n/extract.py index 2bb1baf60d..694f1740e4 100755 --- a/i18n/extract.py +++ b/i18n/extract.py @@ -45,7 +45,7 @@ def main(): remove_file(source_msgs_dir.joinpath(filename)) # Extract strings from mako templates. - babel_mako_cmd = 'pybabel extract -F %s -c "TRANSLATORS:" . -o %s' % (BABEL_CONFIG, BABEL_OUT) + babel_mako_cmd = 'pybabel extract -F %s -c "Translators:" . -o %s' % (BABEL_CONFIG, BABEL_OUT) # Extract strings from django source files. make_django_cmd = ( From 209f8cc3dbc8c227f0e37cc4e9ab3f1cc6063e27 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 27 Dec 2013 16:34:43 -0500 Subject: [PATCH 3/8] Lorem is now fancy, and HTML tags are recognized with single-quote attributes. --- i18n/converter.py | 6 +- i18n/dummy.py | 106 ++++++++++++++++++++--------------- i18n/tests/test_converter.py | 3 + i18n/tests/test_dummy.py | 40 ++++++++----- lms/templates/login.html | 2 +- 5 files changed, 94 insertions(+), 63 deletions(-) diff --git a/i18n/converter.py b/i18n/converter.py index d3987bebe2..9a982347ee 100644 --- a/i18n/converter.py +++ b/i18n/converter.py @@ -21,9 +21,9 @@ class Converter(object): # HTML: , ,
, # Python: %(date)s, %(name)s tag_pattern = re.compile(r''' - (<[-\w" .:?=/]*>) | # - ({[^}]*}) | # {tag} - (%\([^)]*\)\w) | # %(tag)s + (<[^>]+>) | # + ({[^}]+}) | # {tag} + (%\([\w]+\)\w) | # %(tag)s (&\w+;) | # &entity; (&\#\d+;) | # Ӓ (&\#x[0-9a-f]+;) # ꯍ diff --git a/i18n/dummy.py b/i18n/dummy.py index e82429dcbd..987c971447 100644 --- a/i18n/dummy.py +++ b/i18n/dummy.py @@ -1,56 +1,70 @@ +# -*- coding: utf-8 -*- +r""" +Creates new localization properties files in a dummy language. + +Each property file is derived from the equivalent en_US file, with these +transformations applied: + +1. Every vowel is replaced with an equivalent with extra accent marks. + +2. Every string is padded out to +30% length to simulate verbose languages + (such as German) to see if layout and flows work properly. + +3. Every string is terminated with a '#' character to make it easier to detect + truncation. + +Example use:: + + >>> from dummy import Dummy + >>> c = Dummy() + >>> c.convert("My name is Bond, James Bond") + u'M\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd \u2360\u03c3\u044f\u0454\u043c \u03b9\u03c1#' + >>> print c.convert("My name is Bond, James Bond") + Mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ιρ# + >>> print c.convert("don't convert tag ids") + døn't çønvért täg ïds Ⱡσяєм ιρѕυ# + >>> print c.convert("don't convert %(name)s tags on %(date)s") + døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ# + +""" + from converter import Converter -# Creates new localization properties files in a dummy language -# Each property file is derived from the equivalent en_US file, except -# 1. Every vowel is replaced with an equivalent with extra accent marks -# 2. Every string is padded out to +30% length to simulate verbose languages (e.g. German) -# to see if layout and flows work properly -# 3. Every string is terminated with a '#' character to make it easier to detect truncation - - -# -------------------------------- -# Example use: -# >>> from dummy import Dummy -# >>> c = Dummy() -# >>> c.convert("hello my name is Bond, James Bond") -# u'h\xe9ll\xf6 my n\xe4m\xe9 \xefs B\xf6nd, J\xe4m\xe9s B\xf6nd Lorem i#' -# -# >>> c.convert('don\'t convert tag ids') -# u'd\xf6n\'t \xe7\xf6nv\xe9rt t\xe4g \xefds Lorem ipsu#' -# -# >>> c.convert('don\'t convert %(name)s tags on %(date)s') -# u"d\xf6n't \xe7\xf6nv\xe9rt %(name)s t\xe4gs \xf6n %(date)s Lorem ips#" - - # Substitute plain characters with accented lookalikes. # http://tlt.its.psu.edu/suggestions/international/web/codehtml.html#accent -TABLE = {'A': u'\xC0', - 'a': u'\xE4', - 'b': u'\xDF', - 'C': u'\xc7', - 'c': u'\xE7', - 'E': u'\xC9', - 'e': u'\xE9', - 'I': U'\xCC', - 'i': u'\xEF', - 'O': u'\xD8', - 'o': u'\xF8', - 'U': u'\xDB', - 'u': u'\xFC', - 'Y': u'\xDD', - 'y': u'\xFD', - } - +TABLE = { + 'A': u'À', + 'a': u'ä', + 'b': u'ß', + 'C': u'Ç', + 'c': u'ç', + 'E': u'É', + 'e': u'é', + 'I': u'Ì', + 'i': u'ï', + 'O': u'Ø', + 'o': u'ø', + 'U': u'Û', + 'u': u'ü', + 'Y': u'Ý', + 'y': u'ý', +} # The print industry's standard dummy text, in use since the 1500s -# see http://www.lipsum.com/ -LOREM = ' Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed ' \ - 'do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad ' \ - 'minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ' \ - 'ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate ' \ - 'velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat ' \ - 'cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ' +# see http://www.lipsum.com/, then fed through a "fancy-text" converter. +# The string should start with a space. +LOREM = " " + " ".join( # join and split just make the string easier here. + u""" + Ⱡσяєм ιρѕυм ∂σłσя ѕιт αмєт, ¢σηѕє¢тєтυя α∂ιριѕι¢ιηg єłιт, ѕє∂ ∂σ єιυѕмσ∂ + тємρσя ιη¢ι∂ι∂υηт υт łαвσяє єт ∂σłσяє мαgηα αłιqυα. υт єηιм α∂ мιηιм + νєηιαм, qυιѕ ησѕтяυ∂ єχєя¢ιтαтιση υłłαм¢σ łαвσяιѕ ηιѕι υт αłιqυιρ єχ єα + ¢σммσ∂σ ¢σηѕєqυαт. ∂υιѕ αυтє ιяυяє ∂σłσя ιη яєρяєнєη∂єяιт ιη νσłυρтαтє + νєłιт єѕѕє ¢ιłłυм ∂σłσяє єυ ƒυgιαт ηυłłα ραяιαтυя. єχ¢єρтєυя ѕιηт σ¢¢αє¢αт + ¢υρι∂αтαт ηση ρяσι∂єηт, ѕυηт ιη ¢υłρα qυι σƒƒι¢ια ∂єѕєяυηт мσłłιт αηιм ι∂ + єѕт łαвσяυм. + """.split() +) # To simulate more verbose languages (like German), pad the length of a string # by a multiple of PAD_FACTOR diff --git a/i18n/tests/test_converter.py b/i18n/tests/test_converter.py index b1989ede94..f2fec593d4 100644 --- a/i18n/tests/test_converter.py +++ b/i18n/tests/test_converter.py @@ -29,6 +29,9 @@ class TestConverter(TestCase): ('big bad wolf', 'BIG BAD WOLF'), # two html tags ('big bad wolf', 'BIG BAD WOLF'), + # html tags with attributes + ('bar baz', 'BAR BAZ'), + ("bar baz", "BAR BAZ"), # one python tag ('big %(adjective)s wolf', 'BIG %(adjective)s WOLF'), # two python tags diff --git a/i18n/tests/test_dummy.py b/i18n/tests/test_dummy.py index 4670fe5635..fbef3910ab 100644 --- a/i18n/tests/test_dummy.py +++ b/i18n/tests/test_dummy.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import os, string, random from unittest import TestCase from polib import POEntry @@ -13,39 +14,52 @@ class TestDummy(TestCase): def setUp(self): self.converter = dummy.Dummy() + def assertUnicodeEquals(self, str1, str2): + """Just like assertEquals, but doesn't put Unicode into the fail message. + + Either nose, or rake, or something, deals very badly with unusual + Unicode characters in the assertions, so we use repr here to keep + things safe. + + """ + self.assertEquals( + str1, str2, + "Mismatch: %r != %r" % (str1, str2), + ) + def test_dummy(self): """ Tests with a dummy converter (adds spurious accents to strings). Assert that embedded HTML and python tags are not converted. """ test_cases = [ - ("hello my name is Bond, James Bond", - u'h\xe9ll\xf8 m\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd Lorem i#'), + (u"hello my name is Bond, James Bond", + u"héllø mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ι#"), - ('don\'t convert tag ids', - u'd\xf8n\'t \xe7\xf8nv\xe9rt t\xe4g \xefds Lorem ipsu#'), + (u"don't convert tag ids", + u"døn't çønvért täg ïds Ⱡσяєм ιρѕυ#"), - ('don\'t convert %(name)s tags on %(date)s', - u"d\xf8n't \xe7\xf8nv\xe9rt %(name)s t\xe4gs \xf8n %(date)s Lorem ips#") + (u"don't convert %(name)s tags on %(date)s", + u"døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#"), ] for source, expected in test_cases: result = self.converter.convert(source) - self.assertEquals(result, expected) + self.assertUnicodeEquals(result, expected) def test_singular(self): entry = POEntry() entry.msgid = 'A lovely day for a cup of tea.' - expected = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#' + expected = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#' self.converter.convert_msg(entry) - self.assertEquals(entry.msgstr, expected) + self.assertUnicodeEquals(entry.msgstr, expected) def test_plural(self): entry = POEntry() entry.msgid = 'A lovely day for a cup of tea.' entry.msgid_plural = 'A lovely day for some cups of tea.' - expected_s = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#' - expected_p = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r s\xf8m\xe9 \xe7\xfcps \xf8f t\xe9\xe4. Lorem ip#' + expected_s = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#' + expected_p = u'À løvélý däý før sømé çüps øf téä. Ⱡσяєм ιρ#' self.converter.convert_msg(entry) result = entry.msgstr_plural - self.assertEquals(result['0'], expected_s) - self.assertEquals(result['1'], expected_p) + self.assertUnicodeEquals(result['0'], expected_s) + self.assertUnicodeEquals(result['1'], expected_p) diff --git a/lms/templates/login.html b/lms/templates/login.html index 72d903eed2..877751495b 100644 --- a/lms/templates/login.html +++ b/lms/templates/login.html @@ -89,7 +89,7 @@ $submitButton. addClass('is-disabled'). prop('disabled', true). - html(gettext('Processing your account information …')); + html("${_(u'Processing your account information…')}"); } } From d8df97aa502e50f20ab9dbba087191e2b8b0cf13 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Sat, 28 Dec 2013 21:21:25 -0500 Subject: [PATCH 4/8] Use ddt to separate test cases into tests. --- i18n/tests/test_converter.py | 66 +++++++++++++++++++++--------------- i18n/tests/test_dummy.py | 33 ++++++++++-------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/i18n/tests/test_converter.py b/i18n/tests/test_converter.py index f2fec593d4..e893f7c258 100644 --- a/i18n/tests/test_converter.py +++ b/i18n/tests/test_converter.py @@ -1,5 +1,8 @@ +"""Tests of i18n/converter.py""" + import os from unittest import TestCase +import ddt import converter @@ -11,39 +14,48 @@ class UpcaseConverter(converter.Converter): return string.upper() +@ddt.ddt class TestConverter(TestCase): """ Tests functionality of i18n/converter.py """ - def test_converter(self): + @ddt.data( + # no tags + ('big bad wolf', + 'BIG BAD WOLF'), + # one html tag + ('big bad wolf', + 'BIG BAD WOLF'), + # two html tags + ('big bad gray wolf', + 'BIG BAD GRAY WOLF'), + # html tags with attributes + ('bar baz', + 'BAR BAZ'), + ("bar baz", + "BAR BAZ"), + # one python tag + ('big %(adjective)s wolf', + 'BIG %(adjective)s WOLF'), + # two python tags + ('big %(adjective)s gray %(noun)s', + 'BIG %(adjective)s GRAY %(noun)s'), + # both kinds of tags + ('big %(adjective)s %(noun)s', + 'BIG %(adjective)s %(noun)s'), + # .format-style tags + ('The {0} barn is {1!r}.', + 'THE {0} BARN IS {1!r}.'), + # HTML entities + ('© 2013 edX,  ', + '© 2013 EDX,  '), + ) + def test_converter(self, data): """ Tests with a simple converter (converts strings to uppercase). Assert that embedded HTML and python tags are not converted. """ - c = UpcaseConverter() - test_cases = [ - # no tags - ('big bad wolf', 'BIG BAD WOLF'), - # one html tag - ('big bad wolf', 'BIG BAD WOLF'), - # two html tags - ('big bad wolf', 'BIG BAD WOLF'), - # html tags with attributes - ('bar baz', 'BAR BAZ'), - ("bar baz", "BAR BAZ"), - # one python tag - ('big %(adjective)s wolf', 'BIG %(adjective)s WOLF'), - # two python tags - ('big %(adjective)s %(noun)s', 'BIG %(adjective)s %(noun)s'), - # both kinds of tags - ('big %(adjective)s %(noun)s', - 'BIG %(adjective)s %(noun)s'), - # .format-style tags - ('The {0} barn is {1!r}.', 'THE {0} BARN IS {1!r}.'), - # HTML entities - ('© 2013 edX,  ', '© 2013 EDX,  '), - ] - for source, expected in test_cases: - result = c.convert(source) - self.assertEquals(result, expected) + source, expected = data + result = UpcaseConverter().convert(source) + self.assertEquals(result, expected) diff --git a/i18n/tests/test_dummy.py b/i18n/tests/test_dummy.py index fbef3910ab..2d1b1b71c3 100644 --- a/i18n/tests/test_dummy.py +++ b/i18n/tests/test_dummy.py @@ -1,11 +1,16 @@ # -*- coding: utf-8 -*- +"""Tests of i18n/dummy.py""" + import os, string, random from unittest import TestCase + +import ddt from polib import POEntry import dummy +@ddt.ddt class TestDummy(TestCase): """ Tests functionality of i18n/dummy.py @@ -27,24 +32,24 @@ class TestDummy(TestCase): "Mismatch: %r != %r" % (str1, str2), ) - def test_dummy(self): + @ddt.data( + (u"hello my name is Bond, James Bond", + u"héllø mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ι#"), + + (u"don't convert tag ids", + u"døn't çønvért täg ïds Ⱡσяєм ιρѕυ#"), + + (u"don't convert %(name)s tags on %(date)s", + u"døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#"), + ) + def test_dummy(self, data): """ Tests with a dummy converter (adds spurious accents to strings). Assert that embedded HTML and python tags are not converted. """ - test_cases = [ - (u"hello my name is Bond, James Bond", - u"héllø mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ι#"), - - (u"don't convert tag ids", - u"døn't çønvért täg ïds Ⱡσяєм ιρѕυ#"), - - (u"don't convert %(name)s tags on %(date)s", - u"døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#"), - ] - for source, expected in test_cases: - result = self.converter.convert(source) - self.assertUnicodeEquals(result, expected) + source, expected = data + result = self.converter.convert(source) + self.assertUnicodeEquals(result, expected) def test_singular(self): entry = POEntry() From 9011e6e17f711c015a99d468474cd5e50e31c079 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 3 Jan 2014 17:41:41 -0500 Subject: [PATCH 5/8] Fix plural handling, put teeth in msgfmt -c test. --- i18n/dummy.py | 20 ++++---------------- i18n/generate.py | 7 +++++-- i18n/make_dummy.py | 8 +++++++- i18n/tests/test_validate.py | 10 ++++++---- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/i18n/dummy.py b/i18n/dummy.py index 987c971447..b192069329 100644 --- a/i18n/dummy.py +++ b/i18n/dummy.py @@ -99,20 +99,6 @@ class Dummy(Converter): """replaces the final char of string with #""" return string[:-1] + '#' - def init_msgs(self, msgs): - """ - Make sure the first msg in msgs has a plural property. - msgs is list of instances of polib.POEntry - """ - if not msgs: - return - headers = msgs[0].get_property('msgstr') - has_plural = any(header.startswith('Plural-Forms:') for header in headers) - if not has_plural: - # Apply declaration for English pluralization rules - plural = "Plural-Forms: nplurals=2; plural=(n != 1);\\n" - headers.append(plural) - def convert_msg(self, msg): """ Takes one POEntry object and converts it (adds a dummy translation to it) @@ -128,8 +114,10 @@ class Dummy(Converter): # translate singular and plural foreign_single = self.convert(source) foreign_plural = self.convert(plural) - plural = {'0': self.final_newline(source, foreign_single), - '1': self.final_newline(plural, foreign_plural)} + plural = { + '0': self.final_newline(source, foreign_single), + '1': self.final_newline(plural, foreign_plural), + } msg.msgstr_plural = plural else: foreign = self.convert(source) diff --git a/i18n/generate.py b/i18n/generate.py index 3d565ba091..8afa93c655 100755 --- a/i18n/generate.py +++ b/i18n/generate.py @@ -60,9 +60,12 @@ def merge(locale, target='django.po', fail_if_missing=True): def clean_metadata(file): """ Clean up redundancies in the metadata caused by merging. - This reads in a PO file and simply saves it back out again. """ - pofile(file).save() + # Reading in the .po file and saving it again fixes redundancies. + pomsgs = pofile(file) + # The msgcat tool marks the metadata as fuzzy, but it's ok as it is. + pomsgs.metadata_is_fuzzy = False + pomsgs.save() def validate_files(dir, files_to_merge): diff --git a/i18n/make_dummy.py b/i18n/make_dummy.py index 1d9be34b10..11021d4036 100755 --- a/i18n/make_dummy.py +++ b/i18n/make_dummy.py @@ -38,9 +38,15 @@ def main(file, locale): raise IOError('File does not exist: %s' % file) pofile = polib.pofile(file) converter = Dummy() - converter.init_msgs(pofile.translated_entries()) for msg in pofile: converter.convert_msg(msg) + + # If any message has a plural, then the file needs plural information. + # Apply declaration for English pluralization rules so that ngettext will + # do something reasonable. + if any(m.msgid_plural for m in pofile): + pofile.metadata['Plural-Forms'] = 'nplurals=2; plural=(n != 1);' + new_file = new_filename(file, locale) create_dir_if_necessary(new_file) pofile.save(new_file) diff --git a/i18n/tests/test_validate.py b/i18n/tests/test_validate.py index 2876f1c2f8..68f69d2b46 100644 --- a/i18n/tests/test_validate.py +++ b/i18n/tests/test_validate.py @@ -12,9 +12,9 @@ def test_po_files(root=LOCALE_DIR): log = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO) - for (dirpath, dirnames, filenames) in os.walk(root): + for dirpath, __, filenames in os.walk(root): for name in filenames: - (base, ext) = os.path.splitext(name) + __, ext = os.path.splitext(name) if ext.lower() == '.po': yield validate_po_file, os.path.join(dirpath, name), log @@ -26,6 +26,8 @@ def validate_po_file(filename, log): """ # Use relative paths to make output less noisy. rfile = os.path.relpath(filename, LOCALE_DIR) - (out, err) = call(['msgfmt','-c', rfile], working_directory=LOCALE_DIR) + out, err = call(['msgfmt', '-c', rfile], working_directory=LOCALE_DIR) if err != '': - log.warn('\n'+err) + log.info('\n' + out) + log.warn('\n' + err) + assert not err From dd94af4bdb6b9d8e131777b0143242b7a34aecfc Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 3 Jan 2014 17:42:50 -0500 Subject: [PATCH 6/8] More validation of .po files: check message structure The tests in test_validate.py now examine the messages to see that they have the right structure. The English and the translation should have the same tags, the translation can't be blank, and it can't have any Astral plane characters. --- i18n/tests/test_validate.py | 108 ++++++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 5 deletions(-) diff --git a/i18n/tests/test_validate.py b/i18n/tests/test_validate.py index 68f69d2b46..54ef1abe2c 100644 --- a/i18n/tests/test_validate.py +++ b/i18n/tests/test_validate.py @@ -1,9 +1,17 @@ -import os, sys, logging -from unittest import TestCase -from nose.plugins.skip import SkipTest +"""Tests that validate .po files.""" + +import codecs +import logging +import os +import sys +import textwrap + +import polib from config import LOCALE_DIR from execute import call +from converter import Converter + def test_po_files(root=LOCALE_DIR): """ @@ -16,10 +24,12 @@ def test_po_files(root=LOCALE_DIR): for name in filenames: __, ext = os.path.splitext(name) if ext.lower() == '.po': - yield validate_po_file, os.path.join(dirpath, name), log + filename = os.path.join(dirpath, name) + yield msgfmt_check_po_file, filename, log + yield check_messages, filename -def validate_po_file(filename, log): +def msgfmt_check_po_file(filename, log): """ Call GNU msgfmt -c on each .po file to validate its format. Any errors caught by msgfmt are logged to log. @@ -31,3 +41,91 @@ def validate_po_file(filename, log): log.info('\n' + out) log.warn('\n' + err) assert not err + + +def tags_in_string(msg): + """ + Return the set of tags in a message string. + + Tags includes HTML tags, data placeholders, etc. + + Skips tags that might change due to translations: HTML entities, , + and so on. + + """ + def is_linguistic_tag(tag): + """Is this tag one that can change with the language?""" + if tag.startswith("&"): + return True + if any(x in tag for x in ["", ""]): + return True + return False + + __, tags = Converter().detag_string(msg) + return set(t for t in tags if not is_linguistic_tag(t)) + + +def astral(msg): + """Does `msg` have characters outside the Basic Multilingual Plane?""" + return any(ord(c) > 0xFFFF for c in msg) + + +def check_messages(filename): + """ + Checks messages in various ways: + + Translations must have the same slots as the English. The translation + must not be empty. Messages can't have astral characters in them. + + """ + # Don't check English files. + if "/locale/en/" in filename: + return + + # problems will be a list of tuples. Each is a description, and a msgid, + # and then zero or more translations. + problems = [] + pomsgs = polib.pofile(filename) + for msg in pomsgs: + # Check for characters Javascript can't support. + # https://code.djangoproject.com/ticket/21725 + if astral(msg.msgstr): + problems.append(("Non-BMP char", msg.msgid, msg.msgstr)) + + if msg.msgid_plural: + # Skip plurals, I don't know how the tags relate. + continue + if not msg.msgstr: + problems.append(("Empty translation", msg.msgid)) + else: + id_tags = tags_in_string(msg.msgid) + tx_tags = tags_in_string(msg.msgstr) + if id_tags != tx_tags: + id_has = u", ".join(u'"{}"'.format(t) for t in id_tags - tx_tags) + tx_has = u", ".join(u'"{}"'.format(t) for t in tx_tags - id_tags) + if id_has and tx_has: + diff = u"{} vs {}".format(id_has, tx_has) + elif id_has: + diff = u"{} missing".format(id_has) + else: + diff = u"{} added".format(tx_has) + problems.append(( + "Different tags in source and translation", + msg.msgid, + msg.msgstr, + diff + )) + + if problems: + problem_file = filename.replace(".po", ".prob") + id_filler = textwrap.TextWrapper(width=79, initial_indent=" msgid: ", subsequent_indent=" " * 9) + tx_filler = textwrap.TextWrapper(width=79, initial_indent=" -----> ", subsequent_indent=" " * 9) + with codecs.open(problem_file, "w", encoding="utf8") as prob_file: + for problem in problems: + desc, msgid = problem[:2] + prob_file.write(u"{}\n{}\n".format(desc, id_filler.fill(msgid))) + for translation in problem[2:]: + prob_file.write(u"{}\n".format(tx_filler.fill(translation))) + prob_file.write(u"\n") + + assert not problems, "Found %d problems in %s, details in .prob file" % (len(problems), filename) From c459633534627200e5149cb7ff99004381d30699 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 3 Jan 2014 17:52:40 -0500 Subject: [PATCH 7/8] Say a little about what's happening when pulling from transifex --- i18n/transifex.py | 1 + 1 file changed, 1 insertion(+) diff --git a/i18n/transifex.py b/i18n/transifex.py index d8fdd2c4bf..8653c901f9 100755 --- a/i18n/transifex.py +++ b/i18n/transifex.py @@ -15,6 +15,7 @@ def push(): def pull(): for locale in CONFIGURATION.locales: if locale != CONFIGURATION.source_locale: + print "Pulling %s from transifex..." % locale execute('tx pull -l %s' % locale) clean_translated_locales() From af120fdf531f4ef004223311c0e7ff509b837dea Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Mon, 6 Jan 2014 15:52:15 -0500 Subject: [PATCH 8/8] Better handling of plurals during .po validation Now any of the plurals being missing will count as an Empty translation, and for looking at tags, the two English strings and all the translated strings are mooshed together. --- i18n/tests/test_validate.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/i18n/tests/test_validate.py b/i18n/tests/test_validate.py index 54ef1abe2c..a7c400da0f 100644 --- a/i18n/tests/test_validate.py +++ b/i18n/tests/test_validate.py @@ -93,13 +93,21 @@ def check_messages(filename): problems.append(("Non-BMP char", msg.msgid, msg.msgstr)) if msg.msgid_plural: - # Skip plurals, I don't know how the tags relate. - continue - if not msg.msgstr: - problems.append(("Empty translation", msg.msgid)) + # Plurals: two strings in, N strings out. + source = msg.msgid + " | " + msg.msgid_plural + translation = " | ".join(v for k,v in sorted(msg.msgstr_plural.items())) + empty = any(not t.strip() for t in msg.msgstr_plural.values()) else: - id_tags = tags_in_string(msg.msgid) - tx_tags = tags_in_string(msg.msgstr) + # Singular: just one string in and one string out. + source = msg.msgid + translation = msg.msgstr + empty = not msg.msgstr.strip() + + if empty: + problems.append(("Empty translation", source)) + else: + id_tags = tags_in_string(source) + tx_tags = tags_in_string(translation) if id_tags != tx_tags: id_has = u", ".join(u'"{}"'.format(t) for t in id_tags - tx_tags) tx_has = u", ".join(u'"{}"'.format(t) for t in tx_tags - id_tags) @@ -111,8 +119,8 @@ def check_messages(filename): diff = u"{} added".format(tx_has) problems.append(( "Different tags in source and translation", - msg.msgid, - msg.msgstr, + source, + translation, diff ))