From 63b3999ef1281d4e9c9ee705756797a6703ab8bf Mon Sep 17 00:00:00 2001
From: Victor Shnayder <victor@mitx.mit.edu>
Date: Wed, 20 Jun 2012 19:16:55 -0400
Subject: [PATCH] Big refactor of content_parser

* Separate out all xml processing code from django code
* Preparing to move the content parsing parts into common/lib/xmodule (?)
---
 lms/djangoapps/courseware/content_parser.py | 371 ++++++++++++--------
 1 file changed, 220 insertions(+), 151 deletions(-)
diff --git a/lms/djangoapps/courseware/content_parser.py b/lms/djangoapps/courseware/content_parser.py
index 4f96db5284..5ac9f87856 100644
--- a/lms/djangoapps/courseware/content_parser.py
+++ b/lms/djangoapps/courseware/content_parser.py
@@ -38,29 +38,6 @@ log = logging.getLogger("mitx.courseware")
 def format_url_params(params):
     return [ urllib.quote(string.replace(' ','_')) for string in params ]
 
-def xpath(xml, query_string, **args):
-    ''' Safe xpath query into an xml tree:
-        * xml is the tree.
-        * query_string is the query
-        * args are the parameters. Substitute for {params}. 
-        We should remove this with the move to lxml. 
-        We should also use lxml argument passing. '''
-    doc = etree.fromstring(xml)
-    #print type(doc)
-    def escape(x):
-        # TODO: This should escape the string. For now, we just assume it's made of valid characters. 
-        # Couldn't figure out how to escape for lxml in a few quick Googles
-        valid_chars="".join(map(chr, range(ord('a'),ord('z')+1)+range(ord('A'),ord('Z')+1)+range(ord('0'), ord('9')+1)))+"_ "
-        for e in x:
-            if e not in valid_chars:
-                raise Exception("Invalid char in xpath expression. TODO: Escape")
-        return x
-
-    args=dict( ((k, escape(args[k])) for k in args) )
-    #print args
-    results = doc.xpath(query_string.format(**args))
-    return results
-
 def xpath_remove(tree, path):
     ''' Remove all items matching path from lxml tree.  Works in
         place.'''
@@ -69,35 +46,34 @@ def xpath_remove(tree, path):
         item.getparent().remove(item)
     return tree
 
-if __name__=='__main__':
-    print xpath('<html><problem name="Bob"></problem></html>', '/{search}/problem[@name="{name}"]', 
-                search='html', name="Bob")
-
 def id_tag(course):
     ''' Tag all course elements with unique IDs '''
     default_ids = xmodule.get_default_ids()
 
     # Tag elements with unique IDs
-    elements = course.xpath("|".join(['//'+c for c in default_ids]))
+    elements = course.xpath("|".join('//' + c for c in default_ids))
     for elem in elements:
         if elem.get('id'):
             pass
         elif elem.get(default_ids[elem.tag]):
-            new_id = elem.get(default_ids[elem.tag]) 
-            new_id = "".join([a for a in new_id if a.isalnum()]) # Convert to alphanumeric
-            # Without this, a conflict may occur between an hmtl or youtube id
+            new_id = elem.get(default_ids[elem.tag])
+            # Convert to alphanumeric
+            new_id = "".join(a for a in new_id if a.isalnum()) 
+
+            # Without this, a conflict may occur between an html or youtube id
             new_id = default_ids[elem.tag] + new_id
             elem.set('id', new_id)
         else:
-            elem.set('id', "id"+fasthash(etree.tostring(elem)))
+            elem.set('id', "id" + fasthash(etree.tostring(elem)))
             
 def propogate_downward_tag(element, attribute_name, parent_attribute = None):
     ''' This call is to pass down an attribute to all children. If an element
     has this attribute, it will be "inherited" by all of its children. If a
     child (A) already has that attribute, A will keep the same attribute and
     all of A's children will inherit A's attribute. This is a recursive call.'''
-    
-    if (parent_attribute is None): #This is the entry call. Select all elements with this attribute
+
+    if (parent_attribute is None):
+        #This is the entry call. Select all elements with this attribute
         all_attributed_elements = element.xpath("//*[@" + attribute_name +"]")
         for attributed_element in all_attributed_elements:
             attribute_value = attributed_element.get(attribute_name)
@@ -118,6 +94,159 @@ def propogate_downward_tag(element, attribute_name, parent_attribute = None):
             #to its children later.
             return
 
+
+def course_xml_process(tree):
+    ''' Pre-process a course XML tree and return it.
+
+    Replaces custom tags, tags elements with ids, then propagates each
+    of the attributes named below from elements to their children.
+    '''
+    inherited = ("due", "graded", "graceperiod", "showanswer", "rerandomize")
+
+    replace_custom_tags(tree)
+    id_tag(tree)
+    for attribute_name in inherited:
+        propogate_downward_tag(tree, attribute_name)
+    return tree
+
+
+def toc_from_xml(dom, active_chapter, active_section):
+    '''
+    Build the table of contents for the first course name found in dom.
+
+    Returns a list with one dict per chapter:
+    [ {'name': name, 'sections': SECTIONS, 'active': bool}, ... ]
+
+    and SECTIONS is itself a list of dicts:
+    [ {'name': name, 'format': format, 'due': due, 'active' : bool}, ...]
+
+    A chapter is active when its name equals active_chapter; a section is
+    active when, in addition, its name equals active_section.  A section's
+    format is its subtitle if set, else its format attribute.  Missing
+    section values default to "".  Chapters named 'hidden' are left out.
+    '''
+    course_name = dom.xpath('//course/@name')[0]
+    # Sections are looked up by course name and chapter name.
+    section_query = '//course[@name=$name]/chapter[@name=$chname]/section'
+
+    toc = []
+    for chapter in dom.xpath('//course[@name=$name]/chapter', name=course_name):
+        chapter_name = chapter.get('name')
+        if chapter_name == 'hidden':
+            continue
+
+        chapter_is_active = chapter_name == active_chapter
+        entries = []
+        for sec in dom.xpath(section_query, name=course_name, chname=chapter_name):
+            entries.append({
+                'name': sec.get("name") or "",
+                'format': sec.get("subtitle") or sec.get("format") or "",
+                'due': sec.get("due") or "",
+                'active': chapter_is_active and sec.get("name") == active_section,
+            })
+
+        toc.append({'name': chapter_name,
+                    'sections': entries,
+                    'active': chapter_is_active})
+    return toc
+
+
+def replace_custom_tags_dir(tree, dir):
+    '''
+    Rewrite, in place, every element of tree whose tag matches an entry
+    of dir: the element becomes a <customtag> and gains an <impl> child
+    whose text is the original tag name.
+    '''
+    for tag_name in os.listdir(dir):
+        for node in tree.iter(tag_name):
+            node.tag = 'customtag'
+            etree.SubElement(node, 'impl').text = tag_name
+
+def parse_course_file(filename, options, namespace):
+    '''
+    Render `filename` with render_to_string, using `options` and the
+    given `namespace`, parse the result, and return the processed xml tree.
+
+    Options should be a dictionary including keys
+        'dev_content': bool,
+        'groups' : [list, of, user, groups]
+    '''
+    xml = etree.XML(render_to_string(filename, options, namespace=namespace))
+    return course_xml_process(xml)
+
+
+def get_section(section, options, dirname):
+    '''
+    Load the section called `section` from the directory `dirname`.
+
+    `options` is a dict with keys 'dev_content' and 'groups'; it is
+    passed on to parse_course_file.  Returns the section's xml tree,
+    or None (after logging an error) when dirname has no <section>.xml.
+    '''
+    filename = section + ".xml"
+    available = os.listdir(dirname)
+    if filename not in available:
+        log.error(filename + " not in " + str(available))
+        return None
+
+    return parse_course_file(filename, options, namespace='sections')
+
+
+def get_module(tree, module, id_tag, module_id, sections_dirname, options):
+    '''
+    Return the xml string of the first `module` element whose `id_tag` or
+    `id` attribute equals module_id, or None if there is none.  tree is
+    searched first, then each *.xml section in sections_dirname.  Raises
+    Exception for non-alphanumeric input, ContentException for bad XML.
+    '''
+    # Sanitize input: both values are interpolated into the xpath below.
+    # NOTE(review): id_tag is interpolated unchecked -- confirm it is trusted.
+    if not module.isalnum():
+        raise Exception("Module is not alphanumeric")
+    if not module_id.isalnum():
+        raise Exception("Module ID is not alphanumeric")
+
+    # Generate search
+    xpath_search = '//{module}[(@{id_tag} = "{id}") or (@id = "{id}")]'.format(
+        module=module, id_tag=id_tag, id=module_id)
+
+    result_set = tree.xpath(xpath_search)
+    if not result_set:
+        # Not found in main tree.  Let's look in the section files.
+        section_list = (name[:-4] for name in os.listdir(sections_dirname)
+                        if name.endswith('.xml'))
+        for section in section_list:
+            try:
+                s = get_section(section, options, sections_dirname)
+            except etree.XMLSyntaxError:
+                ex = sys.exc_info()
+                raise ContentException("Malformed XML in " + section +
+                                       "(" + str(ex[1].msg) + ")")
+            if s is None:
+                continue
+            result_set = s.xpath(xpath_search)
+            if result_set:
+                break
+
+    if len(result_set) > 1:
+        log.error("WARNING: Potentially malformed course file: %s %s",
+                  module, module_id)
+
+    if not result_set:
+        log.error('[content_parser.get_module] cannot find %s in course.xml tree',
+                  xpath_search)
+        log.error('tree = %s', etree.tostring(tree, pretty_print=True))
+        return None
+
+    return etree.tostring(result_set[0])
+
+
+
+
+
+
+# ==== All Django-specific code below =============================================
+
 def user_groups(user):
     if not user.is_authenticated():
         return []
@@ -135,154 +264,94 @@ def user_groups(user):
 
     return group_names
 
-    # return [u.name for u in UserTestGroup.objects.raw("select * from auth_user, student_usertestgroup, student_usertestgroup_users where auth_user.id = student_usertestgroup_users.user_id and student_usertestgroup_users.usertestgroup_id = student_usertestgroup.id and auth_user.id = %s", [user.id])]
+
+def get_options(user):
+    '''Return the options dict (dev_content flag and the user's groups).'''
+    return {'dev_content': settings.DEV_CONTENT, 'groups': user_groups(user)}
+
 
 def replace_custom_tags(tree):
-    tags = os.listdir(settings.DATA_DIR+'/custom_tags')
-    for tag in tags:
-        for element in tree.iter(tag):
-            element.tag = 'customtag'
-            impl = etree.SubElement(element, 'impl')
-            impl.text = tag
+    '''Replace custom tags defined in our custom_tags dir'''
+    replace_custom_tags_dir(tree, settings.DATA_DIR+'/custom_tags')
 
-def course_xml_process(tree):
-    ''' Do basic pre-processing of an XML tree. Assign IDs to all
-    items without. Propagate due dates, grace periods, etc. to child
-    items. 
+
+def course_file(user, coursename=None):
+    ''' Given a user, return an xml tree object for the course file.
+
+    Handles getting the right file, and processing it depending on the
+    groups the user is in.  Does caching of the xml strings.
     '''
-    replace_custom_tags(tree)
-    id_tag(tree)
-    propogate_downward_tag(tree, "due")
-    propogate_downward_tag(tree, "graded")
-    propogate_downward_tag(tree, "graceperiod")
-    propogate_downward_tag(tree, "showanswer")
-    propogate_downward_tag(tree, "rerandomize")
-    return tree
-
-def course_file(user,coursename=None):
-    ''' Given a user, return course.xml'''
 
     if user.is_authenticated():
-        filename = UserProfile.objects.get(user=user).courseware # user.profile_cache.courseware 
+        # use user.profile_cache.courseware?
+        filename = UserProfile.objects.get(user=user).courseware 
     else:
         filename = 'guest_course.xml'
 
-    # if a specific course is specified, then use multicourse to get the right path to the course XML directory
+    # if a specific course is specified, then use multicourse to get
+    # the right path to the course XML directory
     if coursename and settings.ENABLE_MULTICOURSE:
         xp = multicourse_settings.get_course_xmlpath(coursename)
         filename = xp + filename	# prefix the filename with the path
 
     groups = user_groups(user)
-    options = {'dev_content':settings.DEV_CONTENT, 
-               'groups' : groups}
+    options = get_options(user)
 
+    # Try the cache...
+    cache_key = "{0}_processed?dev_content:{1}&groups:{2}".format(
+        filename,
+        options['dev_content'],
+        sorted(groups))
     
-    cache_key = filename + "_processed?dev_content:" + str(options['dev_content']) + "&groups:" + str(sorted(groups))
-    if "dev" not in settings.DEFAULT_GROUPS:
-        tree_string = cache.get(cache_key)
-    else: 
+    if "dev" in settings.DEFAULT_GROUPS:
         tree_string = None
+    else: 
+        tree_string = cache.get(cache_key)
 
-    if settings.DEBUG:
-        log.info('[courseware.content_parser.course_file] filename=%s, cache_key=%s' % (filename,cache_key))
-        # print '[courseware.content_parser.course_file] tree_string = ',tree_string
-
-    if not tree_string:
-        tree = course_xml_process(etree.XML(render_to_string(filename, options, namespace = 'course')))
-        tree_string = etree.tostring(tree)
-        
-        cache.set(cache_key, tree_string, 60)
-    else:
+    if tree_string:
         tree = etree.XML(tree_string)
+    else:
+        tree = parse_course_file(filename, options, namespace='course')
+        # Cache it
+        tree_string = etree.tostring(tree)
+        cache.set(cache_key, tree_string, 60)
 
     return tree
 
-def section_file(user, section, coursename=None, dironly=False):
+
+def sections_dir(coursename=None):
+    ''' Return the path of the directory that holds section xml files.
+
+    When coursename is given and multicourse is enabled, the course's
+    xml path is inserted between DATA_DIR and '/sections/'.
+    '''
+    if coursename and settings.ENABLE_MULTICOURSE:
+        course_path = multicourse_settings.get_course_xmlpath(coursename)
+        return settings.DATA_DIR + course_path + '/sections/'
+    return settings.DATA_DIR + '/sections/'
+    
+
+
+def section_file(user, section, coursename=None):
     '''
     Given a user and the name of a section, return that section.
     This is done specific to each course.
-    If dironly=True then return the sections directory.
-    TODO: This is a bit weird; dironly should be scrapped. 
+
+    Returns the xml tree for the section, or None if there's no such section.
     '''
-    filename = section+".xml"
+    dirname = sections_dir(coursename)
 
-    # if a specific course is specified, then use multicourse to get the right path to the course XML directory
-    xp = ''
-    if coursename and settings.ENABLE_MULTICOURSE: xp = multicourse_settings.get_course_xmlpath(coursename)
 
-    dirname = settings.DATA_DIR + xp + '/sections/'
-
-    if dironly: return dirname
-
-    if filename not in os.listdir(dirname):
-        log.error(filename+" not in "+str(os.listdir(dirname)))
-        return None
-
-    options = {'dev_content':settings.DEV_CONTENT, 
-               'groups' : user_groups(user)}
-
-    tree = course_xml_process(etree.XML(render_to_string(filename, options, namespace = 'sections')))
-    return tree
+    return get_section(section, get_options(user), dirname)
 
 
 def module_xml(user, module, id_tag, module_id, coursename=None):
     ''' Get XML for a module based on module and module_id. Assumes
-        module occurs once in courseware XML file or hidden section. '''
-    # Sanitize input
-    if not module.isalnum():
-        raise Exception("Module is not alphanumeric")
-    if not module_id.isalnum():
-        raise Exception("Module ID is not alphanumeric")
-    # Generate search
-    xpath_search='//{module}[(@{id_tag} = "{id}") or (@id = "{id}")]'.format(module=module, 
-                                                                             id_tag=id_tag,
-                                                                             id=module_id)
-    #result_set=doc.xpathEval(xpath_search)
-    doc = course_file(user,coursename)
-    sdirname = section_file(user,'',coursename,True)	# get directory where sections information is stored
-    section_list = (s[:-4] for s in os.listdir(sdirname) if s[-4:]=='.xml')
+        module occurs once in courseware XML file or hidden section.
+    ''' 
+    tree = course_file(user, coursename)
+    sdirname = sections_dir(coursename)
+    options = get_options(user)
 
-    result_set=doc.xpath(xpath_search)
-    if len(result_set)<1:
-        for section in section_list:
-            try: 
-                s = section_file(user, section, coursename)
-            except etree.XMLSyntaxError: 
-                ex= sys.exc_info()
-                raise ContentException("Malformed XML in " + section+ "("+str(ex[1].msg)+")")
-            result_set = s.xpath(xpath_search)
-            if len(result_set) != 0: 
-                break
-
-    if len(result_set)>1:
-        log.error("WARNING: Potentially malformed course file", module, module_id)
-    if len(result_set)==0:
-        if settings.DEBUG:
-            log.error('[courseware.content_parser.module_xml] cannot find %s in course.xml tree' % xpath_search)
-            log.error('tree = %s' % etree.tostring(doc,pretty_print=True))
-        return None
-    if settings.DEBUG:
-        log.info('[courseware.content_parser.module_xml] found %s' % result_set)
-    return etree.tostring(result_set[0])
-    #return result_set[0].serialize()
-
-def toc_from_xml(dom, active_chapter, active_section):
-    name = dom.xpath('//course/@name')[0]
-
-    chapters = dom.xpath('//course[@name=$name]/chapter', name=name)
-    ch=list()
-    for c in chapters:
-        if c.get('name') == 'hidden':
-            continue
-        sections=list()
-        for s in dom.xpath('//course[@name=$name]/chapter[@name=$chname]/section', name=name, chname=c.get('name')): 
-            sections.append({'name':s.get("name") or "", 
-                             'format':s.get("subtitle") if s.get("subtitle") else s.get("format") or "", 
-                             'due':s.get("due") or "",
-                             'active':(c.get("name")==active_chapter and \
-                                           s.get("name")==active_section)})
-        ch.append({'name':c.get("name"), 
-                   'sections':sections,
-                   'active':(c.get("name")==active_chapter)})
-    return ch
+    return get_module(tree, module, id_tag, module_id, sdirname, options)