Fix html file handling.

* html files are now stored as follows: If the html file is valid xml, store as html/stuff.xml If it's not, store as html/stuff.xml, which contains <html meta1="..." filename="stuff.html">, and html/stuff.html, which actually contains the contents. Warn if the contents are not parseable with lxml's html parser, but don't error. * for parseable html, strip out the html tag when storing, so that it isn't rendered into the middle of a page * lots of backcompat to deal with paths. Can go away soon. * fix output ordering in clean_xml
2012-07-30 18:36:02 -04:00
parent 46775386d3
commit ed35cefa29
6 changed files with 184 additions and 51 deletions
--- a/common/lib/xmodule/html_checker.py
+++ b/common/lib/xmodule/html_checker.py
@@ -0,0 +1,14 @@
+from lxml import etree
+
+def check_html(html):
+    '''
+    Check whether the passed in html string can be parsed by lxml.
+    Return bool success.
+    '''
+    parser = etree.HTMLParser()
+    try:
+        etree.fromstring(html, parser)
+        return True
+    except Exception as err:
+        pass
+    return False
--- a/common/lib/xmodule/stringify.py
+++ b/common/lib/xmodule/stringify.py
@@ -0,0 +1,20 @@
+from itertools import chain
+from lxml import etree
+
+def stringify_children(node):
+    '''
+    Return all contents of an xml tree, without the outside tags.
+    e.g. if node is parse of
+        "<html a="b" foo="bar">Hi <div>there <span>Bruce</span><b>!</b></div><html>"
+    should return
+        "Hi <div>there <span>Bruce</span><b>!</b></div>"
+
+    fixed from
+    http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
+    '''
+    parts = ([node.text] +
+            list(chain(*([etree.tostring(c), c.tail]
+                         for c in node.getchildren())
+                         )))
+    # filter removes possible Nones in texts and tails
+    return ''.join(filter(None, parts))
--- a/common/lib/xmodule/tests/test_stringify.py
+++ b/common/lib/xmodule/tests/test_stringify.py
@@ -0,0 +1,9 @@
+from nose.tools import assert_equals
+from lxml import etree
+from stringify import stringify_children
+
+def test_stringify():
+    html = '''<html a="b" foo="bar">Hi <div x="foo">there <span>Bruce</span><b>!</b></div></html>'''
+    xml = etree.fromstring(html)
+    out = stringify_children(xml)
+    assert_equals(out, '''Hi <div x="foo">there <span>Bruce</span><b>!</b></div>''')
--- a/common/lib/xmodule/xmodule/html_module.py
+++ b/common/lib/xmodule/xmodule/html_module.py
@@ -1,13 +1,17 @@
+import copy
+from fs.errors import ResourceNotFoundError
 import logging
 import os
 from lxml import etree

 from xmodule.x_module import XModule
-from xmodule.raw_module import RawDescriptor
+from xmodule.xml_module import XmlDescriptor
+from xmodule.editing_module import EditingDescriptor
+from stringify import stringify_children
+from html_checker import check_html

 log = logging.getLogger("mitx.courseware")

-
 class HtmlModule(XModule):
    def get_html(self):
        return self.html
@@ -19,33 +23,108 @@ class HtmlModule(XModule):
        self.html = self.definition['data']


-class HtmlDescriptor(RawDescriptor):
+class HtmlDescriptor(XmlDescriptor, EditingDescriptor):
    """
    Module for putting raw html in a course
    """
    mako_template = "widgets/html-edit.html"
    module_class = HtmlModule
-    filename_extension = "html"
+    filename_extension = "xml"

-    # TODO (cpennington): Delete this method once all fall 2012 course are being
-    # edited in the cms
+    # VS[compat] TODO (cpennington): Delete this method once all fall 2012 course
+    # are being edited in the cms
    @classmethod
    def backcompat_paths(cls, path):
-        if path.endswith('.html.html'):
-            path = path[:-5]
+        origpath = path
+        if path.endswith('.html.xml'):
+            path = path[:-9] + '.html'  #backcompat--look for html instead of xml
        candidates = []
        while os.sep in path:
            candidates.append(path)
            _, _, path = path.partition(os.sep)

+        # also look for .html versions instead of .xml
+        if origpath.endswith('.xml'):
+            candidates.append(origpath[:-4] + '.html')
        return candidates

+    # NOTE: html descriptors are special.  We do not want to parse and
+    # export them ourselves, because that can break things (e.g. lxml
+    # adds body tags when it exports, but they should just be html
+    # snippets that will be included in the middle of pages.
+
    @classmethod
-    def file_to_xml(cls, file_object):
-        parser = etree.HTMLParser()
-        return etree.parse(file_object, parser).getroot()
+    def definition_loader(cls, xml_object, system):
+        '''Load a descriptor from the specified xml_object:
+
+        If there is a filename attribute, load it as a string, and
+        log a warning if it is not parseable by etree.HTMLParser.
+
+        If there is not a filename attribute, the definition is the body
+        of the xml_object, without the root tag (do not want <html> in the
+        middle of a page)
+        '''
+        filename = xml_object.get('filename')
+        if filename is None:
+            definition_xml = copy.deepcopy(xml_object)
+            cls.clean_metadata_from_xml(definition_xml)
+            return {'data' : stringify_children(definition_xml)}
+        else:
+            filepath = cls._format_filepath(xml_object.tag, filename)
+
+            # VS[compat]
+            # TODO (cpennington): If the file doesn't exist at the right path,
+            # give the class a chance to fix it up. The file will be written out
+            # again in the correct format.  This should go away once the CMS is
+            # online and has imported all current (fall 2012) courses from xml
+            if not system.resources_fs.exists(filepath):
+                candidates = cls.backcompat_paths(filepath)
+                #log.debug("candidates = {0}".format(candidates))
+                for candidate in candidates:
+                    if system.resources_fs.exists(candidate):
+                        filepath = candidate
+                        break
+
+            try:
+                with system.resources_fs.open(filepath) as file:
+                    html = file.read()
+                    # Log a warning if we can't parse the file, but don't error
+                    if not check_html(html):
+                        log.warning("Couldn't parse html in {0}.".format(filepath))
+                    return {'data' : html}
+            except (ResourceNotFoundError) as err:
+                msg = 'Unable to load file contents at path {0}: {1} '.format(
+                    filepath, err)
+                log.error(msg)
+                raise

    @classmethod
    def split_to_file(cls, xml_object):
-        # never include inline html
+        '''Never include inline html'''
        return True
+
+
+    # TODO (vshnayder): make export put things in the right places.
+
+    def definition_to_xml(self, resource_fs):
+        '''If the contents are valid xml, write them to filename.xml.  Otherwise,
+        write just the <html filename=""> tag to filename.xml, and the html
+        string to filename.html.
+        '''
+        try:
+            return etree.fromstring(self.definition['data'])
+        except etree.XMLSyntaxError:
+            pass
+
+        # Not proper format.  Write html to file, return an empty tag
+        filepath = u'{category}/{name}.html'.format(category=self.category,
+                                                    name=self.name)
+
+        resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
+        with resource_fs.open(filepath, 'w') as file:
+            file.write(self.definition['data'])
+
+        elt = etree.Element('html')
+        elt.set("filename", self.name)
+        return elt
+
--- a/common/lib/xmodule/xmodule/xml_module.py
+++ b/common/lib/xmodule/xmodule/xml_module.py
@@ -162,6 +162,52 @@ class XmlDescriptor(XModuleDescriptor):
        """
        return etree.parse(file_object).getroot()

+    @classmethod
+    def definition_loader(cls, xml_object, system, location):
+        '''Load a descriptor definition from the specified xml_object.
+        Subclasses should not need to override this except in special
+        cases (e.g. html module)'''
+
+        filename = xml_object.get('filename')
+        if filename is None:
+            definition_xml = copy.deepcopy(xml_object)
+        else:
+            filepath = cls._format_filepath(xml_object.tag, filename)
+
+            # VS[compat]
+            # TODO (cpennington): If the file doesn't exist at the right path,
+            # give the class a chance to fix it up. The file will be written out again
+            # in the correct format.
+            # This should go away once the CMS is online and has imported all current (fall 2012)
+            # courses from xml
+            if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
+                candidates = cls.backcompat_paths(filepath)
+                for candidate in candidates:
+                    if system.resources_fs.exists(candidate):
+                        filepath = candidate
+                        break
+
+            try:
+                with system.resources_fs.open(filepath) as file:
+                    definition_xml = cls.file_to_xml(file)
+            except (ResourceNotFoundError, etree.XMLSyntaxError):
+                msg = 'Unable to load file contents at path %s for item %s' % (
+                    filepath, location.url())
+
+                log.exception(msg)
+                system.error_handler(msg)
+                # if error_handler didn't reraise, work around problem.
+                error_elem = etree.Element('error')
+                message_elem = etree.SubElement(error_elem, 'error_message')
+                message_elem.text = msg
+                stack_elem = etree.SubElement(error_elem, 'stack_trace')
+                stack_elem.text = traceback.format_exc()
+                return {'data': etree.tostring(error_elem)}
+
+        cls.clean_metadata_from_xml(definition_xml)
+        return cls.definition_from_xml(definition_xml, system)
+
+
    @classmethod
    def from_xml(cls, xml_data, system, org=None, course=None):
        """
@@ -191,44 +237,8 @@ class XmlDescriptor(XModuleDescriptor):
                    metadata[attr_map.metadata_key] = attr_map.to_metadata(val)
            return metadata

-        def definition_loader():
-            filename = xml_object.get('filename')
-            if filename is None:
-                definition_xml = copy.deepcopy(xml_object)
-            else:
-                filepath = cls._format_filepath(xml_object.tag, filename)
-
-                # VS[compat]
-                # TODO (cpennington): If the file doesn't exist at the right path,
-                # give the class a chance to fix it up. The file will be written out again
-                # in the correct format.
-                # This should go away once the CMS is online and has imported all current (fall 2012)
-                # courses from xml
-                if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
-                    candidates = cls.backcompat_paths(filepath)
-                    for candidate in candidates:
-                        if system.resources_fs.exists(candidate):
-                            filepath = candidate
-                            break
-
-                try:
-                    with system.resources_fs.open(filepath) as file:
-                        definition_xml = cls.file_to_xml(file)
-                except (ResourceNotFoundError, etree.XMLSyntaxError):
-                    msg = 'Unable to load file contents at path %s for item %s' % (filepath, location.url())
-                    log.exception(msg)
-                    system.error_handler(msg)
-                    # if error_handler didn't reraise, work around problem.
-                    error_elem = etree.Element('error')
-                    message_elem = etree.SubElement(error_elem, 'error_message')
-                    message_elem.text = msg
-                    stack_elem = etree.SubElement(error_elem, 'stack_trace')
-                    stack_elem.text = traceback.format_exc()
-                    return {'data': etree.tostring(error_elem)}
-
-            cls.clean_metadata_from_xml(definition_xml)
-            return cls.definition_from_xml(definition_xml, system)
-
+        definition = cls.definition_loader(xml_object, system, location)
+        metadata = metadata_loader()
        # VS[compat] -- just have the url_name lookup once translation is done
        slug = xml_object.get('url_name', xml_object.get('slug'))
        return cls(
@@ -283,7 +293,7 @@ class XmlDescriptor(XModuleDescriptor):

        # Write it to a file if necessary
        if self.split_to_file(xml_object):
-            # Put this object in it's own file
+            # Put this object in its own file
            filepath = self.__class__._format_filepath(self.category, self.name)
            resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
            with resource_fs.open(filepath, 'w') as file:
--- a/lms/djangoapps/courseware/management/commands/clean_xml.py
+++ b/lms/djangoapps/courseware/management/commands/clean_xml.py
@@ -143,6 +143,7 @@ def check_roundtrip(course_dir):
    # dircmp doesn't do recursive diffs.
    # diff = dircmp(course_dir, export_dir, ignore=[], hide=[])
    print "======== Roundtrip diff: ========="
+    sys.stdout.flush()  # needed to make diff appear in the right place
    os.system("diff -r {0} {1}".format(course_dir, export_dir))
    print "======== ideally there is no diff above this ======="