diff --git a/common/lib/xmodule/html_checker.py b/common/lib/xmodule/html_checker.py new file mode 100644 index 0000000000..5e6b417d28 --- /dev/null +++ b/common/lib/xmodule/html_checker.py @@ -0,0 +1,14 @@ +from lxml import etree + +def check_html(html): + ''' + Check whether the passed in html string can be parsed by lxml. + Return bool success. + ''' + parser = etree.HTMLParser() + try: + etree.fromstring(html, parser) + return True + except Exception as err: + pass + return False diff --git a/common/lib/xmodule/stringify.py b/common/lib/xmodule/stringify.py new file mode 100644 index 0000000000..dad964140f --- /dev/null +++ b/common/lib/xmodule/stringify.py @@ -0,0 +1,20 @@ +from itertools import chain +from lxml import etree + +def stringify_children(node): + ''' + Return all contents of an xml tree, without the outside tags. + e.g. if node is parse of + "Hi
there Bruce!
" + should return + "Hi
there Bruce!
" + + fixed from + http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml + ''' + parts = ([node.text] + + list(chain(*([etree.tostring(c), c.tail] + for c in node.getchildren()) + ))) + # filter removes possible Nones in texts and tails + return ''.join(filter(None, parts)) diff --git a/common/lib/xmodule/tests/test_stringify.py b/common/lib/xmodule/tests/test_stringify.py new file mode 100644 index 0000000000..62d7683886 --- /dev/null +++ b/common/lib/xmodule/tests/test_stringify.py @@ -0,0 +1,9 @@ +from nose.tools import assert_equals +from lxml import etree +from stringify import stringify_children + +def test_stringify(): + html = '''Hi
there Bruce!
''' + xml = etree.fromstring(html) + out = stringify_children(xml) + assert_equals(out, '''Hi
there Bruce!
''') diff --git a/common/lib/xmodule/xmodule/html_module.py b/common/lib/xmodule/xmodule/html_module.py index b9bc34aed6..22d3bf163f 100644 --- a/common/lib/xmodule/xmodule/html_module.py +++ b/common/lib/xmodule/xmodule/html_module.py @@ -1,13 +1,17 @@ +import copy +from fs.errors import ResourceNotFoundError import logging import os from lxml import etree from xmodule.x_module import XModule -from xmodule.raw_module import RawDescriptor +from xmodule.xml_module import XmlDescriptor +from xmodule.editing_module import EditingDescriptor +from stringify import stringify_children +from html_checker import check_html log = logging.getLogger("mitx.courseware") - class HtmlModule(XModule): def get_html(self): return self.html @@ -19,33 +23,108 @@ class HtmlModule(XModule): self.html = self.definition['data'] -class HtmlDescriptor(RawDescriptor): +class HtmlDescriptor(XmlDescriptor, EditingDescriptor): """ Module for putting raw html in a course """ mako_template = "widgets/html-edit.html" module_class = HtmlModule - filename_extension = "html" + filename_extension = "xml" - # TODO (cpennington): Delete this method once all fall 2012 course are being - # edited in the cms + # VS[compat] TODO (cpennington): Delete this method once all fall 2012 course + # are being edited in the cms @classmethod def backcompat_paths(cls, path): - if path.endswith('.html.html'): - path = path[:-5] + origpath = path + if path.endswith('.html.xml'): + path = path[:-9] + '.html' #backcompat--look for html instead of xml candidates = [] while os.sep in path: candidates.append(path) _, _, path = path.partition(os.sep) + # also look for .html versions instead of .xml + if origpath.endswith('.xml'): + candidates.append(origpath[:-4] + '.html') return candidates + # NOTE: html descriptors are special. We do not want to parse and + # export them ourselves, because that can break things (e.g. lxml + # adds body tags when it exports, but they should just be html + # snippets that will be included in the middle of pages. + @classmethod - def file_to_xml(cls, file_object): - parser = etree.HTMLParser() - return etree.parse(file_object, parser).getroot() + def definition_loader(cls, xml_object, system): + '''Load a descriptor from the specified xml_object: + + If there is a filename attribute, load it as a string, and + log a warning if it is not parseable by etree.HTMLParser. + + If there is not a filename attribute, the definition is the body + of the xml_object, without the root tag (do not want in the + middle of a page) + ''' + filename = xml_object.get('filename') + if filename is None: + definition_xml = copy.deepcopy(xml_object) + cls.clean_metadata_from_xml(definition_xml) + return {'data' : stringify_children(definition_xml)} + else: + filepath = cls._format_filepath(xml_object.tag, filename) + + # VS[compat] + # TODO (cpennington): If the file doesn't exist at the right path, + # give the class a chance to fix it up. The file will be written out + # again in the correct format. This should go away once the CMS is + # online and has imported all current (fall 2012) courses from xml + if not system.resources_fs.exists(filepath): + candidates = cls.backcompat_paths(filepath) + #log.debug("candidates = {0}".format(candidates)) + for candidate in candidates: + if system.resources_fs.exists(candidate): + filepath = candidate + break + + try: + with system.resources_fs.open(filepath) as file: + html = file.read() + # Log a warning if we can't parse the file, but don't error + if not check_html(html): + log.warning("Couldn't parse html in {0}.".format(filepath)) + return {'data' : html} + except (ResourceNotFoundError) as err: + msg = 'Unable to load file contents at path {0}: {1} '.format( + filepath, err) + log.error(msg) + raise @classmethod def split_to_file(cls, xml_object): - # never include inline html + '''Never include inline html''' return True + + + # TODO (vshnayder): make export put things in the right places. + + def definition_to_xml(self, resource_fs): + '''If the contents are valid xml, write them to filename.xml. Otherwise, + write just the tag to filename.xml, and the html + string to filename.html. + ''' + try: + return etree.fromstring(self.definition['data']) + except etree.XMLSyntaxError: + pass + + # Not proper format. Write html to file, return an empty tag + filepath = u'{category}/{name}.html'.format(category=self.category, + name=self.name) + + resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True) + with resource_fs.open(filepath, 'w') as file: + file.write(self.definition['data']) + + elt = etree.Element('html') + elt.set("filename", self.name) + return elt + diff --git a/common/lib/xmodule/xmodule/xml_module.py b/common/lib/xmodule/xmodule/xml_module.py index 5786dbd227..188df21130 100644 --- a/common/lib/xmodule/xmodule/xml_module.py +++ b/common/lib/xmodule/xmodule/xml_module.py @@ -162,6 +162,52 @@ class XmlDescriptor(XModuleDescriptor): """ return etree.parse(file_object).getroot() + @classmethod + def definition_loader(cls, xml_object, system, location): + '''Load a descriptor definition from the specified xml_object. + Subclasses should not need to override this except in special + cases (e.g. html module)''' + + filename = xml_object.get('filename') + if filename is None: + definition_xml = copy.deepcopy(xml_object) + else: + filepath = cls._format_filepath(xml_object.tag, filename) + + # VS[compat] + # TODO (cpennington): If the file doesn't exist at the right path, + # give the class a chance to fix it up. The file will be written out again + # in the correct format. + # This should go away once the CMS is online and has imported all current (fall 2012) + # courses from xml + if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'): + candidates = cls.backcompat_paths(filepath) + for candidate in candidates: + if system.resources_fs.exists(candidate): + filepath = candidate + break + + try: + with system.resources_fs.open(filepath) as file: + definition_xml = cls.file_to_xml(file) + except (ResourceNotFoundError, etree.XMLSyntaxError): + msg = 'Unable to load file contents at path %s for item %s' % ( + filepath, location.url()) + + log.exception(msg) + system.error_handler(msg) + # if error_handler didn't reraise, work around problem. + error_elem = etree.Element('error') + message_elem = etree.SubElement(error_elem, 'error_message') + message_elem.text = msg + stack_elem = etree.SubElement(error_elem, 'stack_trace') + stack_elem.text = traceback.format_exc() + return {'data': etree.tostring(error_elem)} + + cls.clean_metadata_from_xml(definition_xml) + return cls.definition_from_xml(definition_xml, system) + + @classmethod def from_xml(cls, xml_data, system, org=None, course=None): """ @@ -191,44 +237,8 @@ class XmlDescriptor(XModuleDescriptor): metadata[attr_map.metadata_key] = attr_map.to_metadata(val) return metadata - def definition_loader(): - filename = xml_object.get('filename') - if filename is None: - definition_xml = copy.deepcopy(xml_object) - else: - filepath = cls._format_filepath(xml_object.tag, filename) - - # VS[compat] - # TODO (cpennington): If the file doesn't exist at the right path, - # give the class a chance to fix it up. The file will be written out again - # in the correct format. - # This should go away once the CMS is online and has imported all current (fall 2012) - # courses from xml - if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'): - candidates = cls.backcompat_paths(filepath) - for candidate in candidates: - if system.resources_fs.exists(candidate): - filepath = candidate - break - - try: - with system.resources_fs.open(filepath) as file: - definition_xml = cls.file_to_xml(file) - except (ResourceNotFoundError, etree.XMLSyntaxError): - msg = 'Unable to load file contents at path %s for item %s' % (filepath, location.url()) - log.exception(msg) - system.error_handler(msg) - # if error_handler didn't reraise, work around problem. - error_elem = etree.Element('error') - message_elem = etree.SubElement(error_elem, 'error_message') - message_elem.text = msg - stack_elem = etree.SubElement(error_elem, 'stack_trace') - stack_elem.text = traceback.format_exc() - return {'data': etree.tostring(error_elem)} - - cls.clean_metadata_from_xml(definition_xml) - return cls.definition_from_xml(definition_xml, system) - + definition = cls.definition_loader(xml_object, system, location) + metadata = metadata_loader() # VS[compat] -- just have the url_name lookup once translation is done slug = xml_object.get('url_name', xml_object.get('slug')) return cls( @@ -283,7 +293,7 @@ class XmlDescriptor(XModuleDescriptor): # Write it to a file if necessary if self.split_to_file(xml_object): - # Put this object in it's own file + # Put this object in its own file filepath = self.__class__._format_filepath(self.category, self.name) resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True) with resource_fs.open(filepath, 'w') as file: diff --git a/lms/djangoapps/courseware/management/commands/clean_xml.py b/lms/djangoapps/courseware/management/commands/clean_xml.py index 7523fd8373..41f0c39cde 100644 --- a/lms/djangoapps/courseware/management/commands/clean_xml.py +++ b/lms/djangoapps/courseware/management/commands/clean_xml.py @@ -143,6 +143,7 @@ def check_roundtrip(course_dir): # dircmp doesn't do recursive diffs. # diff = dircmp(course_dir, export_dir, ignore=[], hide=[]) print "======== Roundtrip diff: =========" + sys.stdout.flush() # needed to make diff appear in the right place os.system("diff -r {0} {1}".format(course_dir, export_dir)) print "======== ideally there is no diff above this ======="