From 209f8cc3dbc8c227f0e37cc4e9ab3f1cc6063e27 Mon Sep 17 00:00:00 2001
From: Ned Batchelder <ned@edx.org>
Date: Fri, 27 Dec 2013 16:34:43 -0500
Subject: [PATCH] Lorem is now fancy, and HTML tags are recognized with
 single-quote attributes.

---
 i18n/converter.py            |   6 +-
 i18n/dummy.py                | 106 ++++++++++++++++++++---------------
 i18n/tests/test_converter.py |   3 +
 i18n/tests/test_dummy.py     |  40 ++++++++-----
 lms/templates/login.html     |   2 +-
 5 files changed, 94 insertions(+), 63 deletions(-)
diff --git a/i18n/converter.py b/i18n/converter.py
index d3987bebe2..9a982347ee 100644
--- a/i18n/converter.py
+++ b/i18n/converter.py
@@ -21,9 +21,9 @@ class Converter(object):
     #   HTML:   <B>, </B>, <BR/>, <textformat leading="10">
     #   Python: %(date)s, %(name)s
     tag_pattern = re.compile(r'''
-        (<[-\w" .:?=/]*>)   |       # <tag>
-        ({[^}]*})           |       # {tag}
-        (%\([^)]*\)\w)      |       # %(tag)s
+        (<[^>]+>)           |       # <tag>
+        ({[^}]+})           |       # {tag}
+        (%\([\w]+\)\w)      |       # %(tag)s
         (&\w+;)             |       # &entity;
         (&\#\d+;)           |       # &#1234;
         (&\#x[0-9a-f]+;)            # &#xABCD;
diff --git a/i18n/dummy.py b/i18n/dummy.py
index e82429dcbd..987c971447 100644
--- a/i18n/dummy.py
+++ b/i18n/dummy.py
@@ -1,56 +1,70 @@
+# -*- coding: utf-8 -*-
+r"""
+Creates new localization properties files in a dummy language.
+
+Each property file is derived from the equivalent en_US file, with these
+transformations applied:
+
+1. Vowels (and a few consonants) are replaced with accented lookalikes.
+
+2. Every string is padded out to +30% length to simulate verbose languages
+   (such as German) to see if layout and flows work properly.
+
+3. Every string is terminated with a '#' character to make it easier to detect
+   truncation.
+
+Example use::
+
+    >>> from dummy import Dummy
+    >>> c = Dummy()
+    >>> c.convert("My name is Bond, James Bond")
+    u'M\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd \u2c60\u03c3\u044f\u0454\u043c \u03b9\u03c1#'
+    >>> print c.convert("My name is Bond, James Bond")
+    Mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ιρ#
+    >>> print c.convert("don't convert <a href='href'>tag ids</a>")
+    døn't çønvért <a href='href'>täg ïds</a> Ⱡσяєм ιρѕυ#
+    >>> print c.convert("don't convert %(name)s tags on %(date)s")
+    døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#
+
+"""
+
 from converter import Converter
 
-# Creates new localization properties files in a dummy language
-# Each property file is derived from the equivalent en_US file, except
-# 1. Every vowel is replaced with an equivalent with extra accent marks
-# 2. Every string is padded out to +30% length to simulate verbose languages (e.g. German)
-#    to see if layout and flows work properly
-# 3. Every string is terminated with a '#' character to make it easier to detect truncation
-
-
-# --------------------------------
-# Example use:
-# >>> from dummy import Dummy
-# >>> c = Dummy()
-# >>> c.convert("hello my name is Bond, James Bond")
-# u'h\xe9ll\xf6 my n\xe4m\xe9 \xefs B\xf6nd, J\xe4m\xe9s B\xf6nd Lorem i#'
-#
-# >>> c.convert('don\'t convert <a href="href">tag ids</a>')
-# u'd\xf6n\'t \xe7\xf6nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'
-#
-# >>> c.convert('don\'t convert %(name)s tags on %(date)s')
-# u"d\xf6n't \xe7\xf6nv\xe9rt %(name)s t\xe4gs \xf6n %(date)s Lorem ips#"
-
-
 # Substitute plain characters with accented lookalikes.
 # http://tlt.its.psu.edu/suggestions/international/web/codehtml.html#accent
-TABLE = {'A': u'\xC0',
-         'a': u'\xE4',
-         'b': u'\xDF',
-         'C': u'\xc7',
-         'c': u'\xE7',
-         'E': u'\xC9',
-         'e': u'\xE9',
-         'I': U'\xCC',
-         'i': u'\xEF',
-         'O': u'\xD8',
-         'o': u'\xF8',
-         'U': u'\xDB',
-         'u': u'\xFC',
-         'Y': u'\xDD',
-         'y': u'\xFD',
-         }
-
+TABLE = {
+    'A': u'À',
+    'a': u'ä',
+    'b': u'ß',
+    'C': u'Ç',
+    'c': u'ç',
+    'E': u'É',
+    'e': u'é',
+    'I': u'Ì',
+    'i': u'ï',
+    'O': u'Ø',
+    'o': u'ø',
+    'U': u'Û',
+    'u': u'ü',
+    'Y': u'Ý',
+    'y': u'ý',
+}
 
 
 # The print industry's standard dummy text, in use since the 1500s
-# see http://www.lipsum.com/
-LOREM = ' Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed ' \
-        'do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad ' \
-        'minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ' \
-        'ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate ' \
-        'velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat ' \
-        'cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. '
+# see http://www.lipsum.com/, then fed through a "fancy-text" converter.
+# The string should start with a space.
+LOREM = " " + " ".join(     # split + join flattens the multi-line literal into one line.
+    u"""
+    Ⱡσяєм ιρѕυм ∂σłσя ѕιт αмєт, ¢σηѕє¢тєтυя α∂ιριѕι¢ιηg єłιт, ѕє∂ ∂σ єιυѕмσ∂
+    тємρσя ιη¢ι∂ι∂υηт υт łαвσяє єт ∂σłσяє мαgηα αłιqυα. υт єηιм α∂ мιηιм
+    νєηιαм, qυιѕ ησѕтяυ∂ єχєя¢ιтαтιση υłłαм¢σ łαвσяιѕ ηιѕι υт αłιqυιρ єχ єα
+    ¢σммσ∂σ ¢σηѕєqυαт.  ∂υιѕ αυтє ιяυяє ∂σłσя ιη яєρяєнєη∂єяιт ιη νσłυρтαтє
+    νєłιт єѕѕє ¢ιłłυм ∂σłσяє єυ ƒυgιαт ηυłłα ραяιαтυя. єχ¢єρтєυя ѕιηт σ¢¢αє¢αт
+    ¢υρι∂αтαт ηση ρяσι∂єηт, ѕυηт ιη ¢υłρα qυι σƒƒι¢ια ∂єѕєяυηт мσłłιт αηιм ι∂
+    єѕт łαвσяυм.
+    """.split()
+)
 
 # To simulate more verbose languages (like German), pad the length of a string
 # by a multiple of PAD_FACTOR
diff --git a/i18n/tests/test_converter.py b/i18n/tests/test_converter.py
index b1989ede94..f2fec593d4 100644
--- a/i18n/tests/test_converter.py
+++ b/i18n/tests/test_converter.py
@@ -29,6 +29,9 @@ class TestConverter(TestCase):
             ('big <strong>bad</strong> wolf', 'BIG <strong>BAD</strong> WOLF'),
             # two html tags
             ('big <b>bad</b> <i>wolf</i>', 'BIG <b>BAD</b> <i>WOLF</i>'),
+            # html tags with attributes
+            ('<a href="foo">bar</a> baz', '<a href="foo">BAR</a> BAZ'),
+            ("<a href='foo'>bar</a> baz", "<a href='foo'>BAR</a> BAZ"),
             # one python tag
             ('big %(adjective)s wolf', 'BIG %(adjective)s WOLF'),
             # two python tags
diff --git a/i18n/tests/test_dummy.py b/i18n/tests/test_dummy.py
index 4670fe5635..fbef3910ab 100644
--- a/i18n/tests/test_dummy.py
+++ b/i18n/tests/test_dummy.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import os, string, random
 from unittest import TestCase
 from polib import POEntry
@@ -13,39 +14,52 @@ class TestDummy(TestCase):
     def setUp(self):
         self.converter = dummy.Dummy()
 
+    def assertUnicodeEquals(self, str1, str2):
+        """Just like assertEquals, but doesn't put Unicode into the fail message.
+
+        Either nose, or rake, or something, deals very badly with unusual
+        Unicode characters in the assertions, so we use repr here to keep
+        things safe.
+
+        """
+        self.assertEquals(
+            str1, str2,
+            "Mismatch: %r != %r" % (str1, str2),
+        )
+
     def test_dummy(self):
         """
         Tests with a dummy converter (adds spurious accents to strings).
         Assert that embedded HTML and python tags are not converted.
         """
         test_cases = [
-            ("hello my name is Bond, James Bond",
-             u'h\xe9ll\xf8 m\xfd n\xe4m\xe9 \xefs B\xf8nd, J\xe4m\xe9s B\xf8nd Lorem i#'),
+            (u"hello my name is Bond, James Bond",
+             u"héllø mý nämé ïs Bønd, Jämés Bønd Ⱡσяєм ι#"),
 
-            ('don\'t convert <a href="href">tag ids</a>',
-             u'd\xf8n\'t \xe7\xf8nv\xe9rt <a href="href">t\xe4g \xefds</a> Lorem ipsu#'),
+            (u"don't convert <a href='href'>tag ids</a>",
+             u"døn't çønvért <a href='href'>täg ïds</a> Ⱡσяєм ιρѕυ#"),
 
-            ('don\'t convert %(name)s tags on %(date)s',
-             u"d\xf8n't \xe7\xf8nv\xe9rt %(name)s t\xe4gs \xf8n %(date)s Lorem ips#")
+            (u"don't convert %(name)s tags on %(date)s",
+             u"døn't çønvért %(name)s tägs øn %(date)s Ⱡσяєм ιρѕ#"),
         ]
         for source, expected in test_cases:
             result = self.converter.convert(source)
-            self.assertEquals(result, expected)
+            self.assertUnicodeEquals(result, expected)
 
     def test_singular(self):
         entry = POEntry()
         entry.msgid = 'A lovely day for a cup of tea.'
-        expected = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
+        expected = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#'
         self.converter.convert_msg(entry)
-        self.assertEquals(entry.msgstr, expected)
+        self.assertUnicodeEquals(entry.msgstr, expected)
 
     def test_plural(self):
         entry = POEntry()
         entry.msgid = 'A lovely day for a cup of tea.'
         entry.msgid_plural = 'A lovely day for some cups of tea.'
-        expected_s = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r \xe4 \xe7\xfcp \xf8f t\xe9\xe4. Lorem i#'
-        expected_p = u'\xc0 l\xf8v\xe9l\xfd d\xe4\xfd f\xf8r s\xf8m\xe9 \xe7\xfcps \xf8f t\xe9\xe4. Lorem ip#'
+        expected_s = u'À løvélý däý før ä çüp øf téä. Ⱡσяєм ι#'
+        expected_p = u'À løvélý däý før sømé çüps øf téä. Ⱡσяєм ιρ#'
         self.converter.convert_msg(entry)
         result = entry.msgstr_plural
-        self.assertEquals(result['0'], expected_s)
-        self.assertEquals(result['1'], expected_p)
+        self.assertUnicodeEquals(result['0'], expected_s)
+        self.assertUnicodeEquals(result['1'], expected_p)
diff --git a/lms/templates/login.html b/lms/templates/login.html
index 72d903eed2..877751495b 100644
--- a/lms/templates/login.html
+++ b/lms/templates/login.html
@@ -89,7 +89,7 @@
         $submitButton.
           addClass('is-disabled').
           prop('disabled', true).
-          html(gettext('Processing your account information &hellip;'));
+          html("${_(u'Processing your account information…')}");
       }
     }
   </script>