diff --git a/common/lib/xmodule/html_checker.py b/common/lib/xmodule/html_checker.py
new file mode 100644
index 0000000000..5e6b417d28
--- /dev/null
+++ b/common/lib/xmodule/html_checker.py
@@ -0,0 +1,14 @@
+from lxml import etree
+
+def check_html(html):
+ '''
+ Check whether the passed in html string can be parsed by lxml.
+ Return bool success.
+ '''
+ parser = etree.HTMLParser()
+ try:
+ etree.fromstring(html, parser)
+ return True
+ except Exception as err:
+ pass
+ return False
diff --git a/common/lib/xmodule/stringify.py b/common/lib/xmodule/stringify.py
new file mode 100644
index 0000000000..dad964140f
--- /dev/null
+++ b/common/lib/xmodule/stringify.py
@@ -0,0 +1,20 @@
+from itertools import chain
+from lxml import etree
+
+def stringify_children(node):
+ '''
+ Return all contents of an xml tree, without the outside tags.
+ e.g. if node is parse of
+ "Hi
there Bruce!
"
+ should return
+ "Hi there Bruce!
"
+
+ fixed from
+ http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
+ '''
+ parts = ([node.text] +
+ list(chain(*([etree.tostring(c), c.tail]
+ for c in node.getchildren())
+ )))
+ # filter removes possible Nones in texts and tails
+ return ''.join(filter(None, parts))
diff --git a/common/lib/xmodule/tests/test_stringify.py b/common/lib/xmodule/tests/test_stringify.py
new file mode 100644
index 0000000000..62d7683886
--- /dev/null
+++ b/common/lib/xmodule/tests/test_stringify.py
@@ -0,0 +1,9 @@
+from nose.tools import assert_equals
+from lxml import etree
+from stringify import stringify_children
+
+def test_stringify():
+ html = '''Hi there Bruce!
'''
+ xml = etree.fromstring(html)
+ out = stringify_children(xml)
+ assert_equals(out, '''Hi there Bruce!
''')
diff --git a/common/lib/xmodule/xmodule/html_module.py b/common/lib/xmodule/xmodule/html_module.py
index b9bc34aed6..22d3bf163f 100644
--- a/common/lib/xmodule/xmodule/html_module.py
+++ b/common/lib/xmodule/xmodule/html_module.py
@@ -1,13 +1,17 @@
+import copy
+from fs.errors import ResourceNotFoundError
import logging
import os
from lxml import etree
from xmodule.x_module import XModule
-from xmodule.raw_module import RawDescriptor
+from xmodule.xml_module import XmlDescriptor
+from xmodule.editing_module import EditingDescriptor
+from stringify import stringify_children
+from html_checker import check_html
log = logging.getLogger("mitx.courseware")
-
class HtmlModule(XModule):
def get_html(self):
return self.html
@@ -19,33 +23,108 @@ class HtmlModule(XModule):
self.html = self.definition['data']
-class HtmlDescriptor(RawDescriptor):
+class HtmlDescriptor(XmlDescriptor, EditingDescriptor):
"""
Module for putting raw html in a course
"""
mako_template = "widgets/html-edit.html"
module_class = HtmlModule
- filename_extension = "html"
+ filename_extension = "xml"
- # TODO (cpennington): Delete this method once all fall 2012 course are being
- # edited in the cms
+ # VS[compat] TODO (cpennington): Delete this method once all fall 2012 course
+ # are being edited in the cms
@classmethod
def backcompat_paths(cls, path):
- if path.endswith('.html.html'):
- path = path[:-5]
+ origpath = path
+ if path.endswith('.html.xml'):
+ path = path[:-9] + '.html' #backcompat--look for html instead of xml
candidates = []
while os.sep in path:
candidates.append(path)
_, _, path = path.partition(os.sep)
+ # also look for .html versions instead of .xml
+ if origpath.endswith('.xml'):
+ candidates.append(origpath[:-4] + '.html')
return candidates
+ # NOTE: html descriptors are special. We do not want to parse and
+ # export them ourselves, because that can break things (e.g. lxml
+ # adds body tags when it exports, but they should just be html
+ # snippets that will be included in the middle of pages.
+
@classmethod
- def file_to_xml(cls, file_object):
- parser = etree.HTMLParser()
- return etree.parse(file_object, parser).getroot()
+ def definition_loader(cls, xml_object, system):
+ '''Load a descriptor from the specified xml_object:
+
+ If there is a filename attribute, load it as a string, and
+ log a warning if it is not parseable by etree.HTMLParser.
+
+ If there is not a filename attribute, the definition is the body
+ of the xml_object, without the root tag (do not want in the
+ middle of a page)
+ '''
+ filename = xml_object.get('filename')
+ if filename is None:
+ definition_xml = copy.deepcopy(xml_object)
+ cls.clean_metadata_from_xml(definition_xml)
+ return {'data' : stringify_children(definition_xml)}
+ else:
+ filepath = cls._format_filepath(xml_object.tag, filename)
+
+ # VS[compat]
+ # TODO (cpennington): If the file doesn't exist at the right path,
+ # give the class a chance to fix it up. The file will be written out
+ # again in the correct format. This should go away once the CMS is
+ # online and has imported all current (fall 2012) courses from xml
+ if not system.resources_fs.exists(filepath):
+ candidates = cls.backcompat_paths(filepath)
+ #log.debug("candidates = {0}".format(candidates))
+ for candidate in candidates:
+ if system.resources_fs.exists(candidate):
+ filepath = candidate
+ break
+
+ try:
+ with system.resources_fs.open(filepath) as file:
+ html = file.read()
+ # Log a warning if we can't parse the file, but don't error
+ if not check_html(html):
+ log.warning("Couldn't parse html in {0}.".format(filepath))
+ return {'data' : html}
+ except (ResourceNotFoundError) as err:
+ msg = 'Unable to load file contents at path {0}: {1} '.format(
+ filepath, err)
+ log.error(msg)
+ raise
@classmethod
def split_to_file(cls, xml_object):
- # never include inline html
+ '''Never include inline html'''
return True
+
+
+ # TODO (vshnayder): make export put things in the right places.
+
+ def definition_to_xml(self, resource_fs):
+ '''If the contents are valid xml, write them to filename.xml. Otherwise,
+ write just the tag to filename.xml, and the html
+ string to filename.html.
+ '''
+ try:
+ return etree.fromstring(self.definition['data'])
+ except etree.XMLSyntaxError:
+ pass
+
+ # Not proper format. Write html to file, return an empty tag
+ filepath = u'{category}/{name}.html'.format(category=self.category,
+ name=self.name)
+
+ resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
+ with resource_fs.open(filepath, 'w') as file:
+ file.write(self.definition['data'])
+
+ elt = etree.Element('html')
+ elt.set("filename", self.name)
+ return elt
+
diff --git a/common/lib/xmodule/xmodule/xml_module.py b/common/lib/xmodule/xmodule/xml_module.py
index 5786dbd227..188df21130 100644
--- a/common/lib/xmodule/xmodule/xml_module.py
+++ b/common/lib/xmodule/xmodule/xml_module.py
@@ -162,6 +162,52 @@ class XmlDescriptor(XModuleDescriptor):
"""
return etree.parse(file_object).getroot()
+ @classmethod
+ def definition_loader(cls, xml_object, system, location):
+ '''Load a descriptor definition from the specified xml_object.
+ Subclasses should not need to override this except in special
+ cases (e.g. html module)'''
+
+ filename = xml_object.get('filename')
+ if filename is None:
+ definition_xml = copy.deepcopy(xml_object)
+ else:
+ filepath = cls._format_filepath(xml_object.tag, filename)
+
+ # VS[compat]
+ # TODO (cpennington): If the file doesn't exist at the right path,
+ # give the class a chance to fix it up. The file will be written out again
+ # in the correct format.
+ # This should go away once the CMS is online and has imported all current (fall 2012)
+ # courses from xml
+ if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
+ candidates = cls.backcompat_paths(filepath)
+ for candidate in candidates:
+ if system.resources_fs.exists(candidate):
+ filepath = candidate
+ break
+
+ try:
+ with system.resources_fs.open(filepath) as file:
+ definition_xml = cls.file_to_xml(file)
+ except (ResourceNotFoundError, etree.XMLSyntaxError):
+ msg = 'Unable to load file contents at path %s for item %s' % (
+ filepath, location.url())
+
+ log.exception(msg)
+ system.error_handler(msg)
+ # if error_handler didn't reraise, work around problem.
+ error_elem = etree.Element('error')
+ message_elem = etree.SubElement(error_elem, 'error_message')
+ message_elem.text = msg
+ stack_elem = etree.SubElement(error_elem, 'stack_trace')
+ stack_elem.text = traceback.format_exc()
+ return {'data': etree.tostring(error_elem)}
+
+ cls.clean_metadata_from_xml(definition_xml)
+ return cls.definition_from_xml(definition_xml, system)
+
+
@classmethod
def from_xml(cls, xml_data, system, org=None, course=None):
"""
@@ -191,44 +237,8 @@ class XmlDescriptor(XModuleDescriptor):
metadata[attr_map.metadata_key] = attr_map.to_metadata(val)
return metadata
- def definition_loader():
- filename = xml_object.get('filename')
- if filename is None:
- definition_xml = copy.deepcopy(xml_object)
- else:
- filepath = cls._format_filepath(xml_object.tag, filename)
-
- # VS[compat]
- # TODO (cpennington): If the file doesn't exist at the right path,
- # give the class a chance to fix it up. The file will be written out again
- # in the correct format.
- # This should go away once the CMS is online and has imported all current (fall 2012)
- # courses from xml
- if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
- candidates = cls.backcompat_paths(filepath)
- for candidate in candidates:
- if system.resources_fs.exists(candidate):
- filepath = candidate
- break
-
- try:
- with system.resources_fs.open(filepath) as file:
- definition_xml = cls.file_to_xml(file)
- except (ResourceNotFoundError, etree.XMLSyntaxError):
- msg = 'Unable to load file contents at path %s for item %s' % (filepath, location.url())
- log.exception(msg)
- system.error_handler(msg)
- # if error_handler didn't reraise, work around problem.
- error_elem = etree.Element('error')
- message_elem = etree.SubElement(error_elem, 'error_message')
- message_elem.text = msg
- stack_elem = etree.SubElement(error_elem, 'stack_trace')
- stack_elem.text = traceback.format_exc()
- return {'data': etree.tostring(error_elem)}
-
- cls.clean_metadata_from_xml(definition_xml)
- return cls.definition_from_xml(definition_xml, system)
-
+ definition = cls.definition_loader(xml_object, system, location)
+ metadata = metadata_loader()
# VS[compat] -- just have the url_name lookup once translation is done
slug = xml_object.get('url_name', xml_object.get('slug'))
return cls(
@@ -283,7 +293,7 @@ class XmlDescriptor(XModuleDescriptor):
# Write it to a file if necessary
if self.split_to_file(xml_object):
- # Put this object in it's own file
+ # Put this object in its own file
filepath = self.__class__._format_filepath(self.category, self.name)
resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
with resource_fs.open(filepath, 'w') as file:
diff --git a/lms/djangoapps/courseware/management/commands/clean_xml.py b/lms/djangoapps/courseware/management/commands/clean_xml.py
index 7523fd8373..41f0c39cde 100644
--- a/lms/djangoapps/courseware/management/commands/clean_xml.py
+++ b/lms/djangoapps/courseware/management/commands/clean_xml.py
@@ -143,6 +143,7 @@ def check_roundtrip(course_dir):
# dircmp doesn't do recursive diffs.
# diff = dircmp(course_dir, export_dir, ignore=[], hide=[])
print "======== Roundtrip diff: ========="
+ sys.stdout.flush() # needed to make diff appear in the right place
os.system("diff -r {0} {1}".format(course_dir, export_dir))
print "======== ideally there is no diff above this ======="