* HTML files are now stored as follows: if the HTML file is valid XML, store it as html/stuff.xml. If it's not, store it as html/stuff.xml, which contains <html meta1="..." filename="stuff.html">, and html/stuff.html, which actually contains the contents. Warn if the contents are not parseable with lxml's HTML parser, but don't error. * For parseable HTML, strip out the html tag when storing, so that it isn't rendered into the middle of a page. * Lots of backcompat to deal with paths; it can go away soon. * Fix output ordering in clean_xml.
10 lines
377 B
Python
10 lines
377 B
Python
from nose.tools import assert_equals
|
|
from lxml import etree
|
|
from stringify import stringify_children
|
|
|
|
def test_stringify():
    # Verify that stringify_children, given a parsed root element, yields the
    # root's leading text plus the serialized child markup, with the enclosing
    # <html ...> tag and its attributes left out.
    expected = '''Hi <div x="foo">there <span>Bruce</span><b>!</b></div>'''

    # Same content as `expected`, wrapped in a root element carrying attributes.
    markup = '''<html a="b" foo="bar">Hi <div x="foo">there <span>Bruce</span><b>!</b></div></html>'''
    root = etree.fromstring(markup)

    assert_equals(stringify_children(root), expected)
|