From d4db4c6dea97a675ec423a01d80507de3e12922e Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Thu, 6 Jun 2013 15:46:18 -0400 Subject: [PATCH 1/9] Fix encoding errors when hashing XML We occasionally get UnicodeEncodeErrors from these two places, because the XML is a Unicode string, and is implicitly encoded to ascii. --- common/lib/xmodule/xmodule/error_module.py | 2 +- common/lib/xmodule/xmodule/modulestore/xml.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/common/lib/xmodule/xmodule/error_module.py b/common/lib/xmodule/xmodule/error_module.py index 7f15370176..e998ceb58e 100644 --- a/common/lib/xmodule/xmodule/error_module.py +++ b/common/lib/xmodule/xmodule/error_module.py @@ -87,7 +87,7 @@ class ErrorDescriptor(ErrorFields, JSONEditingDescriptor): # but url_names aren't guaranteed to be unique between descriptor types, # and ErrorDescriptor can wrap any type. When the wrapped module is fixed, # it will be written out with the original url_name. - name=hashlib.sha1(contents).hexdigest() + name=hashlib.sha1(contents.encode('utf8')).hexdigest() ) # real metadata stays in the content, but add a display name diff --git a/common/lib/xmodule/xmodule/modulestore/xml.py b/common/lib/xmodule/xmodule/modulestore/xml.py index 7128c04a88..1618fa1f7f 100644 --- a/common/lib/xmodule/xmodule/modulestore/xml.py +++ b/common/lib/xmodule/xmodule/modulestore/xml.py @@ -108,7 +108,8 @@ class ImportSystem(XMLParsingSystem, MakoDescriptorSystem): orig_name = orig_name[len(tag) + 1:-12] # append the hash of the content--the first 12 bytes should be plenty. orig_name = "_" + orig_name if orig_name not in (None, "") else "" - return tag + orig_name + "_" + hashlib.sha1(xml).hexdigest()[:12] + xml_bytes = xml.encode('utf8') + return tag + orig_name + "_" + hashlib.sha1(xml_bytes).hexdigest()[:12] # Fallback if there was nothing we could use: if url_name is None or url_name == "": From ccaf8178fdd4f01e4b55bc191fb8c04185575b26 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 7 Jun 2013 09:57:51 -0400 Subject: [PATCH 2/9] Change a test to get at some Unicode issues. --- common/lib/xmodule/xmodule/tests/test_import.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/lib/xmodule/xmodule/tests/test_import.py b/common/lib/xmodule/xmodule/tests/test_import.py index a75dfc8d20..fda2b47e6a 100644 --- a/common/lib/xmodule/xmodule/tests/test_import.py +++ b/common/lib/xmodule/xmodule/tests/test_import.py @@ -66,7 +66,8 @@ class ImportTestCase(BaseCourseTestCase): def test_fallback(self): '''Check that malformed xml loads as an ErrorDescriptor.''' - bad_xml = '''''' + # Use an exotic character to also flush out Unicode issues. + bad_xml = u'''''' system = self.get_system() descriptor = system.process_xml(bad_xml) From 69f8bf7afa9e0e8678ce8e1566479268cbc0ec74 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 7 Jun 2013 09:58:12 -0400 Subject: [PATCH 3/9] Fix weird formatting. --- common/lib/xmodule/xmodule/tests/test_import.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/lib/xmodule/xmodule/tests/test_import.py b/common/lib/xmodule/xmodule/tests/test_import.py index fda2b47e6a..bb0d200bb6 100644 --- a/common/lib/xmodule/xmodule/tests/test_import.py +++ b/common/lib/xmodule/xmodule/tests/test_import.py @@ -41,7 +41,7 @@ class DummySystem(ImportSystem): ) def render_template(self, template, context): - raise Exception("Shouldn't be called") + raise Exception("Shouldn't be called") class BaseCourseTestCase(unittest.TestCase): @@ -72,8 +72,7 @@ class ImportTestCase(BaseCourseTestCase): descriptor = system.process_xml(bad_xml) - self.assertEqual(descriptor.__class__.__name__, - 'ErrorDescriptor') + self.assertEqual(descriptor.__class__.__name__, 'ErrorDescriptor') def test_unique_url_names(self): '''Check that each error gets its very own url_name''' From aac7e364d371c867f5164b4edd23ba43e91c0f91 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 7 Jun 2013 17:05:58 -0400 Subject: [PATCH 4/9] Make the repr right. --- common/lib/xmodule/xmodule/modulestore/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/lib/xmodule/xmodule/modulestore/xml.py b/common/lib/xmodule/xmodule/modulestore/xml.py index 1618fa1f7f..4ea83d7e11 100644 --- a/common/lib/xmodule/xmodule/modulestore/xml.py +++ b/common/lib/xmodule/xmodule/modulestore/xml.py @@ -323,7 +323,7 @@ class XMLModuleStore(ModuleStoreBase): ''' String representation - for debugging ''' - return 'data_dir=%s, %d courses, %d modules' % ( + return '' % ( self.data_dir, len(self.courses), len(self.modules)) def load_policy(self, policy_path, tracker): From 4896e445890021517e28c6b54a9c7856581fd5b7 Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 7 Jun 2013 17:07:33 -0400 Subject: [PATCH 5/9] Write a test for the XML unicode conversion fix. --- .../xmodule/modulestore/tests/test_xml.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py b/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py index 321d98967b..3859049838 100644 --- a/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py +++ b/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py @@ -1,6 +1,5 @@ -from xmodule.modulestore import Location +from xmodule.course_module import CourseDescriptor from xmodule.modulestore.xml import XMLModuleStore -from xmodule.modulestore.xml_importer import import_from_xml from .test_modulestore import check_path_to_location from . import DATA_DIR @@ -15,3 +14,16 @@ class TestXMLModuleStore(object): print "finished import" check_path_to_location(modulestore) + + def test_unicode_chars_in_xml_content(self): + # edX/full/6.002_Spring_2012 has non-ASCII chars, and during + # uniquification of names, would raise a UnicodeError. It no longer does. + + # Load the course, but don't make error modules. This will succeed, + # but will record the errors. + modulestore = XMLModuleStore(DATA_DIR, course_dirs=['full'], load_error_modules=False) + + # Look up the errors during load. There should be none. + location = CourseDescriptor.id_to_location("edX/full/6.002_Spring_2012") + errors = modulestore.get_item_errors(location) + assert errors == [] From 979ee6844e655f13d7f7ce0b03461603f6f423fc Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 7 Jun 2013 19:05:01 -0400 Subject: [PATCH 6/9] Add to the test, a check that the XML file truly has a non-ASCII character in it. --- .../lib/xmodule/xmodule/modulestore/tests/test_xml.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py b/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py index 3859049838..375fb32bf0 100644 --- a/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py +++ b/common/lib/xmodule/xmodule/modulestore/tests/test_xml.py @@ -1,6 +1,10 @@ +import os.path + from xmodule.course_module import CourseDescriptor from xmodule.modulestore.xml import XMLModuleStore +from nose.tools import assert_raises + from .test_modulestore import check_path_to_location from . import DATA_DIR @@ -19,6 +23,12 @@ class TestXMLModuleStore(object): # edX/full/6.002_Spring_2012 has non-ASCII chars, and during # uniquification of names, would raise a UnicodeError. It no longer does. + # Ensure that there really is a non-ASCII character in the course. + with open(os.path.join(DATA_DIR, "full/sequential/Administrivia_and_Circuit_Elements.xml")) as xmlf: + xml = xmlf.read() + with assert_raises(UnicodeDecodeError): + xml.decode('ascii') + # Load the course, but don't make error modules. This will succeed, # but will record the errors. modulestore = XMLModuleStore(DATA_DIR, course_dirs=['full'], load_error_modules=False) From a841bf7d2eae03d6870a45153cc7d50279ab5b5c Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Mon, 10 Jun 2013 12:57:39 -0400 Subject: [PATCH 7/9] Use repr instead of str for getting context data via the system render function, so it works better for non-ASCII data. --- common/lib/xmodule/xmodule/tests/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/lib/xmodule/xmodule/tests/__init__.py b/common/lib/xmodule/xmodule/tests/__init__.py index 6af11a3ac8..4bcf2caa6d 100644 --- a/common/lib/xmodule/xmodule/tests/__init__.py +++ b/common/lib/xmodule/xmodule/tests/__init__.py @@ -33,8 +33,8 @@ def test_system(): """ Construct a test ModuleSystem instance. - By default, the render_template() method simply returns the context it is - passed as a string. You can override this behavior by monkey patching:: + By default, the render_template() method simply returns the repr of the + context it is passed. You can override this behavior by monkey patching:: system = test_system() system.render_template = my_render_func @@ -46,7 +46,7 @@ def test_system(): ajax_url='courses/course_id/modx/a_location', track_function=Mock(), get_module=Mock(), - render_template=lambda template, context: str(context), + render_template=lambda template, context: repr(context), replace_urls=lambda html: str(html), user=Mock(is_staff=False), filestore=Mock(), From 5bc7333ca803b620e880bdc41f36e3cc433029fc Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Mon, 10 Jun 2013 12:58:10 -0400 Subject: [PATCH 8/9] Test the Unicode handling during construction of an ErrorModule directly. --- common/lib/xmodule/xmodule/tests/test_error_module.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/common/lib/xmodule/xmodule/tests/test_error_module.py b/common/lib/xmodule/xmodule/tests/test_error_module.py index 78e5b46a96..bee906afd6 100644 --- a/common/lib/xmodule/xmodule/tests/test_error_module.py +++ b/common/lib/xmodule/xmodule/tests/test_error_module.py @@ -18,8 +18,7 @@ class TestErrorModule(unittest.TestCase): self.org = "org" self.course = "course" self.location = Location(['i4x', self.org, self.course, None, None]) - self.valid_xml = "" - self.broken_xml = "" + self.valid_xml = u"ABC \N{SNOWMAN}" self.error_msg = "Error" def test_error_module_xml_rendering(self): @@ -29,7 +28,7 @@ class TestErrorModule(unittest.TestCase): module = descriptor.xmodule(self.system) rendered_html = module.get_html() self.assertIn(self.error_msg, rendered_html) - self.assertIn(self.valid_xml, rendered_html) + self.assertIn(repr(self.valid_xml), rendered_html) def test_error_module_from_descriptor(self): descriptor = MagicMock([XModuleDescriptor], @@ -43,7 +42,7 @@ class TestErrorModule(unittest.TestCase): module = error_descriptor.xmodule(self.system) rendered_html = module.get_html() self.assertIn(self.error_msg, rendered_html) - self.assertIn(str(descriptor), rendered_html) + self.assertIn(repr(descriptor), rendered_html) class TestNonStaffErrorModule(TestErrorModule): @@ -62,7 +61,7 @@ class TestNonStaffErrorModule(TestErrorModule): module = descriptor.xmodule(self.system) rendered_html = module.get_html() self.assertNotIn(self.error_msg, rendered_html) - self.assertNotIn(self.valid_xml, rendered_html) + self.assertNotIn(repr(self.valid_xml), rendered_html) def test_error_module_from_descriptor(self): descriptor = MagicMock([XModuleDescriptor], From fea679263733c80998d171e8aeecd0d6c748003b Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Mon, 10 Jun 2013 13:11:10 -0400 Subject: [PATCH 9/9] In these tests, 'rendered_html' is neither rendered, nor HTML. Name it context_repr, since that's what it is. --- .../xmodule/tests/test_error_module.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/common/lib/xmodule/xmodule/tests/test_error_module.py b/common/lib/xmodule/xmodule/tests/test_error_module.py index bee906afd6..b980107dd9 100644 --- a/common/lib/xmodule/xmodule/tests/test_error_module.py +++ b/common/lib/xmodule/xmodule/tests/test_error_module.py @@ -26,9 +26,9 @@ class TestErrorModule(unittest.TestCase): self.valid_xml, self.system, self.org, self.course, self.error_msg) self.assertTrue(isinstance(descriptor, error_module.ErrorDescriptor)) module = descriptor.xmodule(self.system) - rendered_html = module.get_html() - self.assertIn(self.error_msg, rendered_html) - self.assertIn(repr(self.valid_xml), rendered_html) + context_repr = module.get_html() + self.assertIn(self.error_msg, context_repr) + self.assertIn(repr(self.valid_xml), context_repr) def test_error_module_from_descriptor(self): descriptor = MagicMock([XModuleDescriptor], @@ -40,9 +40,9 @@ class TestErrorModule(unittest.TestCase): descriptor, self.error_msg) self.assertTrue(isinstance(error_descriptor, error_module.ErrorDescriptor)) module = error_descriptor.xmodule(self.system) - rendered_html = module.get_html() - self.assertIn(self.error_msg, rendered_html) - self.assertIn(repr(descriptor), rendered_html) + context_repr = module.get_html() + self.assertIn(self.error_msg, context_repr) + self.assertIn(repr(descriptor), context_repr) class TestNonStaffErrorModule(TestErrorModule): @@ -59,9 +59,9 @@ class TestNonStaffErrorModule(TestErrorModule): descriptor = error_module.NonStaffErrorDescriptor.from_xml( self.valid_xml, self.system, self.org, self.course) module = descriptor.xmodule(self.system) - rendered_html = module.get_html() - self.assertNotIn(self.error_msg, rendered_html) - self.assertNotIn(repr(self.valid_xml), rendered_html) + context_repr = module.get_html() + self.assertNotIn(self.error_msg, context_repr) + self.assertNotIn(repr(self.valid_xml), context_repr) def test_error_module_from_descriptor(self): descriptor = MagicMock([XModuleDescriptor], @@ -73,6 +73,6 @@ class TestNonStaffErrorModule(TestErrorModule): descriptor, self.error_msg) self.assertTrue(isinstance(error_descriptor, error_module.ErrorDescriptor)) module = error_descriptor.xmodule(self.system) - rendered_html = module.get_html() - self.assertNotIn(self.error_msg, rendered_html) - self.assertNotIn(str(descriptor), rendered_html) + context_repr = module.get_html() + self.assertNotIn(self.error_msg, context_repr) + self.assertNotIn(str(descriptor), context_repr)