Fix html file handling.
* html files are now stored as follows: If the html file is valid xml, store as html/stuff.xml If it's not, store as html/stuff.xml, which contains <html meta1="..." filename="stuff.html">, and html/stuff.html, which actually contains the contents. Warn if the contents are not parseable with lxml's html parser, but don't error. * for parseable html, strip out the html tag when storing, so that it isn't rendered into the middle of a page * lots of backcompat to deal with paths. Can go away soon. * fix output ordering in clean_xml
This commit is contained in:
14
common/lib/xmodule/html_checker.py
Normal file
14
common/lib/xmodule/html_checker.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from lxml import etree
|
||||
|
||||
def check_html(html):
|
||||
'''
|
||||
Check whether the passed in html string can be parsed by lxml.
|
||||
Return bool success.
|
||||
'''
|
||||
parser = etree.HTMLParser()
|
||||
try:
|
||||
etree.fromstring(html, parser)
|
||||
return True
|
||||
except Exception as err:
|
||||
pass
|
||||
return False
|
||||
20
common/lib/xmodule/stringify.py
Normal file
20
common/lib/xmodule/stringify.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from itertools import chain
|
||||
from lxml import etree
|
||||
|
||||
def stringify_children(node):
|
||||
'''
|
||||
Return all contents of an xml tree, without the outside tags.
|
||||
e.g. if node is parse of
|
||||
"<html a="b" foo="bar">Hi <div>there <span>Bruce</span><b>!</b></div><html>"
|
||||
should return
|
||||
"Hi <div>there <span>Bruce</span><b>!</b></div>"
|
||||
|
||||
fixed from
|
||||
http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
|
||||
'''
|
||||
parts = ([node.text] +
|
||||
list(chain(*([etree.tostring(c), c.tail]
|
||||
for c in node.getchildren())
|
||||
)))
|
||||
# filter removes possible Nones in texts and tails
|
||||
return ''.join(filter(None, parts))
|
||||
9
common/lib/xmodule/tests/test_stringify.py
Normal file
9
common/lib/xmodule/tests/test_stringify.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from nose.tools import assert_equals
|
||||
from lxml import etree
|
||||
from stringify import stringify_children
|
||||
|
||||
def test_stringify():
|
||||
html = '''<html a="b" foo="bar">Hi <div x="foo">there <span>Bruce</span><b>!</b></div></html>'''
|
||||
xml = etree.fromstring(html)
|
||||
out = stringify_children(xml)
|
||||
assert_equals(out, '''Hi <div x="foo">there <span>Bruce</span><b>!</b></div>''')
|
||||
@@ -1,13 +1,17 @@
|
||||
import copy
|
||||
from fs.errors import ResourceNotFoundError
|
||||
import logging
|
||||
import os
|
||||
from lxml import etree
|
||||
|
||||
from xmodule.x_module import XModule
|
||||
from xmodule.raw_module import RawDescriptor
|
||||
from xmodule.xml_module import XmlDescriptor
|
||||
from xmodule.editing_module import EditingDescriptor
|
||||
from stringify import stringify_children
|
||||
from html_checker import check_html
|
||||
|
||||
log = logging.getLogger("mitx.courseware")
|
||||
|
||||
|
||||
class HtmlModule(XModule):
|
||||
def get_html(self):
|
||||
return self.html
|
||||
@@ -19,33 +23,108 @@ class HtmlModule(XModule):
|
||||
self.html = self.definition['data']
|
||||
|
||||
|
||||
class HtmlDescriptor(RawDescriptor):
|
||||
class HtmlDescriptor(XmlDescriptor, EditingDescriptor):
|
||||
"""
|
||||
Module for putting raw html in a course
|
||||
"""
|
||||
mako_template = "widgets/html-edit.html"
|
||||
module_class = HtmlModule
|
||||
filename_extension = "html"
|
||||
filename_extension = "xml"
|
||||
|
||||
# TODO (cpennington): Delete this method once all fall 2012 course are being
|
||||
# edited in the cms
|
||||
# VS[compat] TODO (cpennington): Delete this method once all fall 2012 course
|
||||
# are being edited in the cms
|
||||
@classmethod
|
||||
def backcompat_paths(cls, path):
|
||||
if path.endswith('.html.html'):
|
||||
path = path[:-5]
|
||||
origpath = path
|
||||
if path.endswith('.html.xml'):
|
||||
path = path[:-9] + '.html' #backcompat--look for html instead of xml
|
||||
candidates = []
|
||||
while os.sep in path:
|
||||
candidates.append(path)
|
||||
_, _, path = path.partition(os.sep)
|
||||
|
||||
# also look for .html versions instead of .xml
|
||||
if origpath.endswith('.xml'):
|
||||
candidates.append(origpath[:-4] + '.html')
|
||||
return candidates
|
||||
|
||||
# NOTE: html descriptors are special. We do not want to parse and
|
||||
# export them ourselves, because that can break things (e.g. lxml
|
||||
# adds body tags when it exports, but they should just be html
|
||||
# snippets that will be included in the middle of pages.
|
||||
|
||||
@classmethod
|
||||
def file_to_xml(cls, file_object):
|
||||
parser = etree.HTMLParser()
|
||||
return etree.parse(file_object, parser).getroot()
|
||||
def definition_loader(cls, xml_object, system):
|
||||
'''Load a descriptor from the specified xml_object:
|
||||
|
||||
If there is a filename attribute, load it as a string, and
|
||||
log a warning if it is not parseable by etree.HTMLParser.
|
||||
|
||||
If there is not a filename attribute, the definition is the body
|
||||
of the xml_object, without the root tag (do not want <html> in the
|
||||
middle of a page)
|
||||
'''
|
||||
filename = xml_object.get('filename')
|
||||
if filename is None:
|
||||
definition_xml = copy.deepcopy(xml_object)
|
||||
cls.clean_metadata_from_xml(definition_xml)
|
||||
return {'data' : stringify_children(definition_xml)}
|
||||
else:
|
||||
filepath = cls._format_filepath(xml_object.tag, filename)
|
||||
|
||||
# VS[compat]
|
||||
# TODO (cpennington): If the file doesn't exist at the right path,
|
||||
# give the class a chance to fix it up. The file will be written out
|
||||
# again in the correct format. This should go away once the CMS is
|
||||
# online and has imported all current (fall 2012) courses from xml
|
||||
if not system.resources_fs.exists(filepath):
|
||||
candidates = cls.backcompat_paths(filepath)
|
||||
#log.debug("candidates = {0}".format(candidates))
|
||||
for candidate in candidates:
|
||||
if system.resources_fs.exists(candidate):
|
||||
filepath = candidate
|
||||
break
|
||||
|
||||
try:
|
||||
with system.resources_fs.open(filepath) as file:
|
||||
html = file.read()
|
||||
# Log a warning if we can't parse the file, but don't error
|
||||
if not check_html(html):
|
||||
log.warning("Couldn't parse html in {0}.".format(filepath))
|
||||
return {'data' : html}
|
||||
except (ResourceNotFoundError) as err:
|
||||
msg = 'Unable to load file contents at path {0}: {1} '.format(
|
||||
filepath, err)
|
||||
log.error(msg)
|
||||
raise
|
||||
|
||||
@classmethod
|
||||
def split_to_file(cls, xml_object):
|
||||
# never include inline html
|
||||
'''Never include inline html'''
|
||||
return True
|
||||
|
||||
|
||||
# TODO (vshnayder): make export put things in the right places.
|
||||
|
||||
def definition_to_xml(self, resource_fs):
|
||||
'''If the contents are valid xml, write them to filename.xml. Otherwise,
|
||||
write just the <html filename=""> tag to filename.xml, and the html
|
||||
string to filename.html.
|
||||
'''
|
||||
try:
|
||||
return etree.fromstring(self.definition['data'])
|
||||
except etree.XMLSyntaxError:
|
||||
pass
|
||||
|
||||
# Not proper format. Write html to file, return an empty tag
|
||||
filepath = u'{category}/{name}.html'.format(category=self.category,
|
||||
name=self.name)
|
||||
|
||||
resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
|
||||
with resource_fs.open(filepath, 'w') as file:
|
||||
file.write(self.definition['data'])
|
||||
|
||||
elt = etree.Element('html')
|
||||
elt.set("filename", self.name)
|
||||
return elt
|
||||
|
||||
|
||||
@@ -162,6 +162,52 @@ class XmlDescriptor(XModuleDescriptor):
|
||||
"""
|
||||
return etree.parse(file_object).getroot()
|
||||
|
||||
@classmethod
|
||||
def definition_loader(cls, xml_object, system, location):
|
||||
'''Load a descriptor definition from the specified xml_object.
|
||||
Subclasses should not need to override this except in special
|
||||
cases (e.g. html module)'''
|
||||
|
||||
filename = xml_object.get('filename')
|
||||
if filename is None:
|
||||
definition_xml = copy.deepcopy(xml_object)
|
||||
else:
|
||||
filepath = cls._format_filepath(xml_object.tag, filename)
|
||||
|
||||
# VS[compat]
|
||||
# TODO (cpennington): If the file doesn't exist at the right path,
|
||||
# give the class a chance to fix it up. The file will be written out again
|
||||
# in the correct format.
|
||||
# This should go away once the CMS is online and has imported all current (fall 2012)
|
||||
# courses from xml
|
||||
if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
|
||||
candidates = cls.backcompat_paths(filepath)
|
||||
for candidate in candidates:
|
||||
if system.resources_fs.exists(candidate):
|
||||
filepath = candidate
|
||||
break
|
||||
|
||||
try:
|
||||
with system.resources_fs.open(filepath) as file:
|
||||
definition_xml = cls.file_to_xml(file)
|
||||
except (ResourceNotFoundError, etree.XMLSyntaxError):
|
||||
msg = 'Unable to load file contents at path %s for item %s' % (
|
||||
filepath, location.url())
|
||||
|
||||
log.exception(msg)
|
||||
system.error_handler(msg)
|
||||
# if error_handler didn't reraise, work around problem.
|
||||
error_elem = etree.Element('error')
|
||||
message_elem = etree.SubElement(error_elem, 'error_message')
|
||||
message_elem.text = msg
|
||||
stack_elem = etree.SubElement(error_elem, 'stack_trace')
|
||||
stack_elem.text = traceback.format_exc()
|
||||
return {'data': etree.tostring(error_elem)}
|
||||
|
||||
cls.clean_metadata_from_xml(definition_xml)
|
||||
return cls.definition_from_xml(definition_xml, system)
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_xml(cls, xml_data, system, org=None, course=None):
|
||||
"""
|
||||
@@ -191,44 +237,8 @@ class XmlDescriptor(XModuleDescriptor):
|
||||
metadata[attr_map.metadata_key] = attr_map.to_metadata(val)
|
||||
return metadata
|
||||
|
||||
def definition_loader():
|
||||
filename = xml_object.get('filename')
|
||||
if filename is None:
|
||||
definition_xml = copy.deepcopy(xml_object)
|
||||
else:
|
||||
filepath = cls._format_filepath(xml_object.tag, filename)
|
||||
|
||||
# VS[compat]
|
||||
# TODO (cpennington): If the file doesn't exist at the right path,
|
||||
# give the class a chance to fix it up. The file will be written out again
|
||||
# in the correct format.
|
||||
# This should go away once the CMS is online and has imported all current (fall 2012)
|
||||
# courses from xml
|
||||
if not system.resources_fs.exists(filepath) and hasattr(cls, 'backcompat_paths'):
|
||||
candidates = cls.backcompat_paths(filepath)
|
||||
for candidate in candidates:
|
||||
if system.resources_fs.exists(candidate):
|
||||
filepath = candidate
|
||||
break
|
||||
|
||||
try:
|
||||
with system.resources_fs.open(filepath) as file:
|
||||
definition_xml = cls.file_to_xml(file)
|
||||
except (ResourceNotFoundError, etree.XMLSyntaxError):
|
||||
msg = 'Unable to load file contents at path %s for item %s' % (filepath, location.url())
|
||||
log.exception(msg)
|
||||
system.error_handler(msg)
|
||||
# if error_handler didn't reraise, work around problem.
|
||||
error_elem = etree.Element('error')
|
||||
message_elem = etree.SubElement(error_elem, 'error_message')
|
||||
message_elem.text = msg
|
||||
stack_elem = etree.SubElement(error_elem, 'stack_trace')
|
||||
stack_elem.text = traceback.format_exc()
|
||||
return {'data': etree.tostring(error_elem)}
|
||||
|
||||
cls.clean_metadata_from_xml(definition_xml)
|
||||
return cls.definition_from_xml(definition_xml, system)
|
||||
|
||||
definition = cls.definition_loader(xml_object, system, location)
|
||||
metadata = metadata_loader()
|
||||
# VS[compat] -- just have the url_name lookup once translation is done
|
||||
slug = xml_object.get('url_name', xml_object.get('slug'))
|
||||
return cls(
|
||||
@@ -283,7 +293,7 @@ class XmlDescriptor(XModuleDescriptor):
|
||||
|
||||
# Write it to a file if necessary
|
||||
if self.split_to_file(xml_object):
|
||||
# Put this object in it's own file
|
||||
# Put this object in its own file
|
||||
filepath = self.__class__._format_filepath(self.category, self.name)
|
||||
resource_fs.makedir(os.path.dirname(filepath), allow_recreate=True)
|
||||
with resource_fs.open(filepath, 'w') as file:
|
||||
|
||||
@@ -143,6 +143,7 @@ def check_roundtrip(course_dir):
|
||||
# dircmp doesn't do recursive diffs.
|
||||
# diff = dircmp(course_dir, export_dir, ignore=[], hide=[])
|
||||
print "======== Roundtrip diff: ========="
|
||||
sys.stdout.flush() # needed to make diff appear in the right place
|
||||
os.system("diff -r {0} {1}".format(course_dir, export_dir))
|
||||
print "======== ideally there is no diff above this ======="
|
||||
|
||||
|
||||
Reference in New Issue
Block a user