From f1f76a9ad102580b5f0c4c111d544c4eb2c20eed Mon Sep 17 00:00:00 2001
From: Ned Batchelder <ned@edx.org>
Date: Fri, 20 Dec 2013 10:25:32 -0500
Subject: [PATCH] Dummy text has more accents, and properly ignores more
 non-text things.

---
 i18n/converter.py            | 11 ++++++++++-
 i18n/dummy.py                |  7 +++++--
 i18n/tests/test_converter.py | 10 +++++++---
 i18n/tests/test_dummy.py     | 25 +++++++++++++------------
 4 files changed, 35 insertions(+), 18 deletions(-)
diff --git a/i18n/converter.py b/i18n/converter.py
index e873dcb2a4..d3987bebe2 100644
--- a/i18n/converter.py
+++ b/i18n/converter.py
@@ -20,7 +20,16 @@ class Converter(object):
     # matches tags like these:
     #   HTML:   <B>, </B>, <BR/>, <textformat leading="10">
     #   Python: %(date)s, %(name)s
-    tag_pattern = re.compile(r'(<[-\w" .:?=/]*>)|({[^}]*})|(%\([^)]*\)\w)', re.I)
+    tag_pattern = re.compile(r'''
+        (<[-\w" .:?=/]*>)   |       # <tag>
+        ({[^}]*})           |       # {tag}
+        (%\([^)]*\)\w)      |       # %(tag)s
+        (&\w+;)             |       # &entity;
+        (&\#\d+;)           |       # &#1234;
+        (&\#x[0-9a-f]+;)            # &#xABCD;
+        ''',
+        re.IGNORECASE|re.VERBOSE
+    )
 
     def convert(self, string):
         """Returns: a converted tagged string
diff --git a/i18n/dummy.py b/i18n/dummy.py
index 76283d8704..e82429dcbd 100644
--- a/i18n/dummy.py
+++ b/i18n/dummy.py
@@ -34,8 +34,11 @@ TABLE = {'A': u'\xC0',
          'I': U'\xCC',
          'i': u'\xEF',
          'O': u'\xD8',
-         'o': u'\xF6',
-         'u': u'\xFC'
+         'o': u'\xF8',
+         'U': u'\xDB',
+         'u': u'\xFC',
+         'Y': u'\xDD',
+         'y': u'\xFD',
          }
 
 
diff --git a/i18n/tests/test_converter.py b/i18n/tests/test_converter.py
index 69cfc40260..b1989ede94 100644
--- a/i18n/tests/test_converter.py
+++ b/i18n/tests/test_converter.py
@@ -22,7 +22,7 @@ class TestConverter(TestCase):
         Assert that embedded HTML and python tags are not converted.
         """
         c = UpcaseConverter()
-        test_cases = (
+        test_cases = [
             # no tags
             ('big bad wolf', 'BIG BAD WOLF'),
             # one html tag
@@ -36,7 +36,11 @@ class TestConverter(TestCase):
             # both kinds of tags
             ('<strong>big</strong> %(adjective)s %(noun)s',
              '<strong>BIG</strong> %(adjective)s %(noun)s'),
-            )
-        for (source, expected) in test_cases:
+            # .format-style tags
+            ('The {0} barn is {1!r}.', 'THE {0} BARN IS {1!r}.'),
+            # HTML entities
+            ('<b>&copy; 2013 edX, &#xa0;</b>', '<b>&copy; 2013 EDX, &#xa0;</b>'),
+        ]
+        for source, expected in test_cases:
             result = c.convert(source)
             self.assertEquals(result, expected)
diff --git a/i18n/tests/test_dummy.py b/i18n/tests/test_dummy.py
index 88addb5a95..4670fe5635 100644
--- a/i18n/tests/test_dummy.py
+++ b/i18n/tests/test_dummy.py
@@ -18,23 +18,24 @@ class TestDummy(TestCase):
         Tests with a dummy converter (adds spurious accents to strings).
         Assert that embedded HTML and python tags are not converted.
         """
-        test_cases = (("hello my name is Bond, James Bond",
-                       u'h\xe9ll\xf6 my n\xe4m\xe9 \xefs B\xf6nd, J\xe4m\xe9s B\xf6nd Lorem i#'),
+        test_cases = [
+            ("hello my name is Bond, James Bond",
+             u'h\xe9ll\xf8 m\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd Lorem i#'),
 
-                      ('don\'t convert <a href="href">tag ids</a>',
-                        u'd\xf6n\'t \xe7\xf6nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'),
-                      
-                      ('don\'t convert %(name)s tags on %(date)s',
-                        u"d\xf6n't \xe7\xf6nv\xe9rt %(name)s t\xe4gs \xf6n %(date)s Lorem ips#")
-                      )
-        for (source, expected) in test_cases:
+            ('don\'t convert <a href="href">tag ids</a>',
+             u'd\xf8n\'t \xe7\xf8nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'),
+
+            ('don\'t convert %(name)s tags on %(date)s',
+             u"d\xf8n't \xe7\xf8nv\xe9rt %(name)s t\xe4gs \xf8n %(date)s Lorem ips#")
+        ]
+        for source, expected in test_cases:
             result = self.converter.convert(source)
             self.assertEquals(result, expected)
 
     def test_singular(self):
         entry = POEntry()
         entry.msgid = 'A lovely day for a cup of tea.'
-        expected = u'\xc0 l\xf6v\xe9ly d\xe4y f\xf6r \xe4 \xe7\xfcp \xf6f t\xe9\xe4. Lorem i#'
+        expected = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
         self.converter.convert_msg(entry)
         self.assertEquals(entry.msgstr, expected)
 
@@ -42,8 +43,8 @@ class TestDummy(TestCase):
         entry = POEntry()
         entry.msgid = 'A lovely day for a cup of tea.'
         entry.msgid_plural = 'A lovely day for some cups of tea.'
-        expected_s = u'\xc0 l\xf6v\xe9ly d\xe4y f\xf6r \xe4 \xe7\xfcp \xf6f t\xe9\xe4. Lorem i#'
-        expected_p = u'\xc0 l\xf6v\xe9ly d\xe4y f\xf6r s\xf6m\xe9 \xe7\xfcps \xf6f t\xe9\xe4. Lorem ip#'
+        expected_s = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
+        expected_p = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r s\xf8m\xe9 \xe7\xfcps \xf8f t\xe9\xe4. Lorem ip#'
         self.converter.convert_msg(entry)
         result = entry.msgstr_plural
         self.assertEquals(result['0'], expected_s)