116 lines
3.3 KiB
Python
Executable File
116 lines
3.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
"""
|
|
Victor's xml cleanup script. A big pile of useful hacks. Do not use
|
|
without carefully reading the code and deciding that this is what you want.
|
|
|
|
In particular, the remove-meta option is only intended to be used after pulling out a policy
|
|
using the metadata_to_json management command.
|
|
"""
|
|
|
|
import os
|
|
import fnmatch
|
|
import re
|
|
import sys
|
|
from lxml import etree
|
|
from collections import defaultdict
|
|
|
|
# Matches every character that is not a word character, a dot or a hyphen.
INVALID_CHARS = re.compile(r"[^\w.-]")


def clean(value):
    """
    Return value, made into a form legal for locations.

    Each character matched by INVALID_CHARS is turned into an underscore,
    then every run of consecutive underscores (including ones that were
    already in `value`) is squeezed down to a single underscore.
    """
    underscored = INVALID_CHARS.sub('_', value)
    return re.sub('_+', '_', underscored)
|
|
|
|
|
|
# Maps each category to the set of url_names already handed out (or seen)
# for that category.
used_names = defaultdict(set)


def clean_unique(category, name):
    """
    Return a cleaned form of `name` that has not been used yet in `category`.

    The cleaned name is returned as-is when it is still free. Otherwise the
    smallest positive integer suffix (1, 2, 3, ...) that yields an unused
    name is appended. Whatever is returned is also recorded in `used_names`
    so later calls will not produce it again.
    """
    base = clean(name)
    taken = used_names[category]

    candidate = base
    suffix = 0
    # Probe base, base1, base2, ... until a free name turns up.
    while candidate in taken:
        suffix += 1
        candidate = base + str(suffix)

    taken.add(candidate)
    return candidate
|
|
|
|
|
|
def cleanup(filepath, remove_meta):
    """
    Normalize the attributes of every element in one XML file, in place.

    For each element of the document at `filepath`:

    * an existing `url_name` is recorded in `used_names` under the element's tag;
    * a `name` attribute is replaced by an identical `display_name` plus a
      `url_name` generated by `clean_unique` (this overwrites any `url_name`
      the element already had);
    * a warning is printed if the element carries both `slug` and `url_name`;
    * if `url_name` and `filename` are the element's only attributes and they
      are equal, `filename` is dropped;
    * if `remove_meta` is true, every attribute named in `to_remove` is deleted.

    The serialized tree is then written back over `filepath`. If the file
    cannot be read or parsed, an error line is printed and the file is left
    untouched. Nothing is returned.
    """
    # Keys that are exported to the policy file, and so
    # can be removed from the xml afterward
    to_remove = ('format', 'display_name',
                 'graceperiod', 'showanswer', 'rerandomize',
                 'start', 'due', 'graded', 'hide_from_toc',
                 'ispublic', 'xqa_key')

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("Cleaning {0}".format(filepath))
    try:
        # etree.parse reads the file by path itself, so no separate open()
        # is needed. Comments are kept so they survive the rewrite.
        parser = etree.XMLParser(remove_comments=False)
        xml = etree.parse(filepath, parser=parser)
    except Exception:
        # Best effort: skip files that cannot be read or parsed, but unlike a
        # bare `except:` let KeyboardInterrupt / SystemExit propagate.
        print("Error parsing file {0}".format(filepath))
        return

    # tag=etree.Element restricts the walk to real elements, skipping the
    # comment nodes that the parser was told to keep.
    for node in xml.iter(tag=etree.Element):
        attrs = node.attrib
        if 'url_name' in attrs:
            used_names[node.tag].add(attrs['url_name'])
        if 'name' in attrs:
            # Replace name with an identical display_name, and a unique url_name
            name = attrs['name']
            attrs['display_name'] = name
            attrs['url_name'] = clean_unique(node.tag, name)
            del attrs['name']

        if 'url_name' in attrs and 'slug' in attrs:
            print("WARNING: {0} has both slug and url_name".format(node))

        if ('url_name' in attrs and 'filename' in attrs and
                len(attrs) == 2 and attrs['url_name'] == attrs['filename']):
            # This is a pointer tag in disguise. Get rid of the filename.
            print('turning {0}.{1} into a pointer tag'.format(node.tag, attrs['url_name']))
            del attrs['filename']

        if remove_meta:
            for attr in to_remove:
                if attr in attrs:
                    del attrs[attr]

    # etree.tostring returns a byte string, so the file must be opened in
    # binary mode: text mode fails on Python 3 and rewrites newlines on Windows.
    with open(filepath, "wb") as f:
        f.write(etree.tostring(xml))
|
|
|
|
|
|
def find_replace(directory, filePattern, remove_meta):
    """
    Run cleanup on every file under `directory` whose name matches `filePattern`.

    The tree rooted at the absolute form of `directory` is walked with
    os.walk; in each folder the file names are filtered with fnmatch against
    `filePattern`, and each match is handed to cleanup together with
    `remove_meta`.
    """
    root = os.path.abspath(directory)
    for parent, _subdirs, names in os.walk(root):
        matching = fnmatch.filter(names, filePattern)
        for name in matching:
            cleanup(os.path.join(parent, name), remove_meta)
|
|
|
|
|
|
def main(args):
    """
    Command-line entry point.

    `args` is the argument list without the program name: a directory,
    optionally followed by the literal word 'remove-meta'. Any other shape
    prints the usage string and returns without processing anything.
    Otherwise every '*.xml' file under the directory is passed through
    find_replace, with metadata removal enabled only when 'remove-meta'
    was given.
    """
    usage = "xml_cleanup [dir] [remove-meta]"
    n = len(args)
    if n < 1 or n > 2 or (n == 2 and args[1] != 'remove-meta'):
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print(usage)
        return

    # A second argument can only be 'remove-meta' once the check above passed.
    remove_meta = (n == 2)

    find_replace(args[0], '*.xml', remove_meta)


if __name__ == '__main__':
    main(sys.argv[1:])
|