Merge pull request #17413 from edx/bmedx/upgrade_nltk

Upgrade NLTK to 3.2.5, fixup usage and tests
2018-02-07 15:39:22 -05:00
parent 4e7a602e67 6a22de454f
commit 86d077ac3d
7 changed files with 120 additions and 107 deletions
--- a/common/djangoapps/util/password_policy_validators.py
+++ b/common/djangoapps/util/password_policy_validators.py
@@ -9,7 +9,7 @@ from __future__ import division

 import string

-import nltk
+from nltk.metrics.distance import edit_distance
 from django.conf import settings
 from django.core.exceptions import ValidationError
 from django.utils.translation import ugettext_lazy as _
@@ -106,6 +106,6 @@ def validate_password_dictionary(value):

    if password_max_edit_distance and password_dictionary:
        for word in password_dictionary:
-            distance = nltk.metrics.distance.edit_distance(value, word)
+            distance = edit_distance(value, word)
            if distance <= password_max_edit_distance:
                raise ValidationError(_("Too similar to a restricted dictionary word."), code="dictionary_word")
--- a/common/lib/capa/capa/tests/test_inputtypes.py
+++ b/common/lib/capa/capa/tests/test_inputtypes.py
@@ -51,9 +51,9 @@ def quote_attr(s):


 class OptionInputTest(unittest.TestCase):
-    '''
+    """
    Make sure option inputs work
-    '''
+    """

    def test_rendering(self):
        xml_str = """<optioninput options="('Up','Down','Don't know')" id="sky_input" correct="Up"/>"""
@@ -89,7 +89,9 @@ class OptionInputTest(unittest.TestCase):
        f = inputtypes.OptionInput.parse_options

        def check(input, options):
-            """Take list of options, confirm that output is in the silly doubled format"""
+            """
+            Take list of options, confirm that output is in the silly doubled format
+            """
            expected = [(o, o) for o in options]
            self.assertEqual(f(input), expected)

@@ -108,9 +110,9 @@ class OptionInputTest(unittest.TestCase):


 class ChoiceGroupTest(unittest.TestCase):
-    '''
+    """
    Test choice groups, radio groups, and checkbox groups
-    '''
+    """

    def check_group(self, tag, expected_input_type, expected_suffix):
        xml_str = """
@@ -248,9 +250,9 @@ class JSInputTest(unittest.TestCase):


 class TextLineTest(unittest.TestCase):
-    '''
+    """
    Check that textline inputs work, with and without math.
-    '''
+    """

    def test_rendering(self):
        size = "42"
@@ -369,9 +371,9 @@ class TextLineTest(unittest.TestCase):


 class FileSubmissionTest(unittest.TestCase):
-    '''
+    """
    Check that file submission inputs work
-    '''
+    """

    def test_rendering(self):
        allowed_files = "runme.py nooooo.rb ohai.java"
@@ -413,9 +415,9 @@ class FileSubmissionTest(unittest.TestCase):


 class CodeInputTest(unittest.TestCase):
-    '''
+    """
    Check that codeinput inputs work
-    '''
+    """

    def test_rendering(self):
        mode = "parrot"
@@ -434,8 +436,6 @@ class CodeInputTest(unittest.TestCase):

        element = etree.fromstring(xml_str)

-        escapedict = {'"': '&quot;'}
-
        state = {
            'value': 'print "good evening"',
            'status': 'incomplete',
@@ -471,9 +471,9 @@ class CodeInputTest(unittest.TestCase):


 class MatlabTest(unittest.TestCase):
-    '''
+    """
    Test Matlab input types
-    '''
+    """
    def setUp(self):
        super(MatlabTest, self).setUp()
        self.rows = '10'
@@ -921,10 +921,9 @@ def html_tree_equal(received, expected):


 class SchematicTest(unittest.TestCase):
-    '''
+    """
    Check that schematic inputs work
-    '''
-
+    """
    def test_rendering(self):
        height = '12'
        width = '33'
@@ -977,10 +976,9 @@ class SchematicTest(unittest.TestCase):


 class ImageInputTest(unittest.TestCase):
-    '''
+    """
    Check that image inputs work
-    '''
-
+    """
    def check(self, value, egx, egy):
        height = '78'
        width = '427'
@@ -1037,10 +1035,9 @@ class ImageInputTest(unittest.TestCase):


 class CrystallographyTest(unittest.TestCase):
-    '''
+    """
    Check that crystallography inputs work
-    '''
-
+    """
    def test_rendering(self):
        height = '12'
        width = '33'
@@ -1079,10 +1076,9 @@ class CrystallographyTest(unittest.TestCase):


 class VseprTest(unittest.TestCase):
-    '''
+    """
    Check that vsepr inputs work
-    '''
-
+    """
    def test_rendering(self):
        height = '12'
        width = '33'
@@ -1127,9 +1123,9 @@ class VseprTest(unittest.TestCase):


 class ChemicalEquationTest(unittest.TestCase):
-    '''
+    """
    Check that chemical equation inputs work.
-    '''
+    """
    def setUp(self):
        super(ChemicalEquationTest, self).setUp()
        self.size = "42"
@@ -1144,7 +1140,9 @@ class ChemicalEquationTest(unittest.TestCase):
        self.the_input = lookup_tag('chemicalequationinput')(test_capa_system(), element, state)

    def test_rendering(self):
-        ''' Verify that the render context matches the expected render context'''
+        """
+        Verify that the render context matches the expected render context
+        """
        context = self.the_input._get_render_context()  # pylint: disable=protected-access
        prob_id = 'prob_1_2'
        expected = {
@@ -1161,7 +1159,9 @@ class ChemicalEquationTest(unittest.TestCase):
        self.assertEqual(context, expected)

    def test_chemcalc_ajax_sucess(self):
-        ''' Verify that using the correct dispatch and valid data produces a valid response'''
+        """
+        Verify that using the correct dispatch and valid data produces a valid response
+        """
        data = {'formula': "H"}
        response = self.the_input.handle_ajax("preview_chemcalc", data)

@@ -1366,10 +1366,9 @@ class FormulaEquationTest(unittest.TestCase):


 class DragAndDropTest(unittest.TestCase):
-    '''
+    """
    Check that drag and drop inputs work
-    '''
-
+    """
    def test_rendering(self):
        path_to_images = '/dummy-static/images/'

@@ -1441,9 +1440,9 @@ class DragAndDropTest(unittest.TestCase):


 class AnnotationInputTest(unittest.TestCase):
-    '''
+    """
    Make sure option inputs work
-    '''
+    """
    def test_rendering(self):
        xml_str = '''
 <annotationinput>
--- a/common/lib/chem/chem/chemcalc.py
+++ b/common/lib/chem/chem/chemcalc.py
@@ -8,7 +8,7 @@ from pyparsing import Literal, OneOrMore, ParseException, StringEnd

 ARROWS = ('<->', '->')

-## Defines a simple pyparsing tokenizer for chemical equations
+# Defines a simple pyparsing tokenizer for chemical equations
 elements = ['Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be',
            'Bh', 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm',
            'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu',
@@ -30,7 +30,8 @@ tokenizer = OneOrMore(tokens) + StringEnd()
 def _orjoin(l):
    return "'" + "' | '".join(l) + "'"

-## Defines an NLTK parser for tokenized expressions
+
+# Defines an NLTK parser for tokenized expressions
 grammar = """
  S -> multimolecule | multimolecule '+' S
  multimolecule -> count molecule | molecule
@@ -52,16 +53,19 @@ grammar = """

  suffixed -> unsuffixed | unsuffixed suffix
 """
-parser = nltk.ChartParser(nltk.parse_cfg(grammar))
+parser = nltk.ChartParser(nltk.CFG.fromstring(grammar))


 def _clean_parse_tree(tree):
-    ''' The parse tree contains a lot of redundant
+    """
+    The parse tree contains a lot of redundant
    nodes. E.g. paren_groups have groups as children, etc. This will
    clean up the tree.
-    '''
+    """
    def unparse_number(n):
-        ''' Go from a number parse tree to a number '''
+        """
+        Go from a number parse tree to a number
+        """
        if len(n) == 1:
            rv = n[0][0]
        else:
@@ -69,19 +73,22 @@ def _clean_parse_tree(tree):
        return rv

    def null_tag(n):
-        ''' Remove a tag '''
+        """
+        Remove a tag
+        """
        return n[0]

    def ion_suffix(n):
-        '''1. "if" part handles special case
-           2. "else" part is general behaviour '''
-
-        if n[1:][0].node == 'number' and n[1:][0][0][0] == '1':
+        """
+        1. "if" part handles special case
+        2. "else" part is general behaviour
+        """
+        if n[1:][0].label() == 'number' and n[1:][0][0][0] == '1':
            # if suffix is explicitly 1, like ^1-
            # strip 1, leave only sign: ^-
-            return nltk.tree.Tree(n.node, n[2:])
+            return nltk.tree.Tree(n.label(), n[2:])
        else:
-            return nltk.tree.Tree(n.node, n[1:])
+            return nltk.tree.Tree(n.label(), n[1:])

    dispatch = {'number': lambda x: nltk.tree.Tree("number", [unparse_number(x)]),
                'unphased': null_tag,
@@ -89,40 +96,38 @@ def _clean_parse_tree(tree):
                'number_suffix': lambda x: nltk.tree.Tree('number_suffix', [unparse_number(x[0])]),
                'suffixed': lambda x: len(x) > 1 and x or x[0],
                'ion_suffix': ion_suffix,
-                'paren_group_square': lambda x: nltk.tree.Tree(x.node, x[1]),
-                'paren_group_round': lambda x: nltk.tree.Tree(x.node, x[1])}
+                'paren_group_square': lambda x: nltk.tree.Tree(x.label(), x[1]),
+                'paren_group_round': lambda x: nltk.tree.Tree(x.label(), x[1])}

    if isinstance(tree, str):
        return tree

    old_node = None
-    ## This loop means that if a node is processed, and returns a child,
-    ## the child will be processed.
-    while tree.node in dispatch and tree.node != old_node:
-        old_node = tree.node
-        tree = dispatch[tree.node](tree)
+    # This loop means that if a node is processed, and returns a child,
+    # the child will be processed.
+    while tree.label() in dispatch and tree.label() != old_node:
+        old_node = tree.label()
+        tree = dispatch[tree.label()](tree)

    children = []
    for child in tree:
        child = _clean_parse_tree(child)
        children.append(child)

-    tree = nltk.tree.Tree(tree.node, children)
+    tree = nltk.tree.Tree(tree.label(), children)

    return tree


 def _merge_children(tree, tags):
-    ''' nltk, by documentation, cannot do arbitrary length
-    groups. Instead of:
-    (group 1 2 3 4)
-    It has to handle this recursively:
-    (group 1 (group 2 (group 3 (group 4))))
+    """
+    According to its documentation, nltk cannot handle arbitrary-length groups.
+    Instead of: (group 1 2 3 4)
+    It has to handle this recursively: (group 1 (group 2 (group 3 (group 4))))
    We do the cleanup of converting from the latter to the former.
-    '''
+    """
    if tree is None:
        # There was a problem--shouldn't have empty trees (NOTE: see this with input e.g. 'H2O(', or 'Xe+').
-        # Haven't grokked the code to tell if this is indeed the right thing to do.
        raise ParseException("Shouldn't have empty trees")

    if isinstance(tree, str):
@@ -130,32 +135,31 @@ def _merge_children(tree, tags):

    merged_children = []
    done = False
-    #print '00000', tree
-    ## Merge current tag
+
+    # Merge current tag
    while not done:
        done = True
        for child in tree:
-            if isinstance(child, nltk.tree.Tree) and child.node == tree.node and tree.node in tags:
+            if isinstance(child, nltk.tree.Tree) and child.label() == tree.label() and tree.label() in tags:
                merged_children = merged_children + list(child)
                done = False
            else:
                merged_children = merged_children + [child]
-        tree = nltk.tree.Tree(tree.node, merged_children)
+        tree = nltk.tree.Tree(tree.label(), merged_children)
        merged_children = []
-    #print '======',tree

    # And recurse
    children = []
    for child in tree:
        children.append(_merge_children(child, tags))

-    #return tree
-    return nltk.tree.Tree(tree.node, children)
+    return nltk.tree.Tree(tree.label(), children)


 def _render_to_html(tree):
-    ''' Renders a cleaned tree to HTML '''
-
+    """
+    Renders a cleaned tree to HTML
+    """
    def molecule_count(tree, children):
        # If an integer, return that integer
        if len(tree) == 1:
@@ -187,24 +191,28 @@ def _render_to_html(tree):
        return tree
    else:
        children = "".join(map(_render_to_html, tree))
-        if tree.node in dispatch:
-            return dispatch[tree.node](tree, children)
+        if tree.label() in dispatch:
+            return dispatch[tree.label()](tree, children)
        else:
            return children.replace(' ', '')


 def render_to_html(eq):
-    '''
+    """
    Render a chemical equation string to html.

    Renders each molecule separately, and returns invalid input wrapped in a <span>.
-    '''
+    """
    def err(s):
-        "Render as an error span"
+        """
+        Render as an error span
+        """
        return '<span class="inline-error inline">{0}</span>'.format(s)

    def render_arrow(arrow):
-        """Turn text arrows into pretty ones"""
+        """
+        Turn text arrows into pretty ones
+        """
        if arrow == '->':
            return u'\u2192'
        if arrow == '<->':
@@ -235,20 +243,26 @@ def render_to_html(eq):


 def _get_final_tree(s):
-    '''
+    """
    Return final tree after merge and clean.

    Raises pyparsing.ParseException if s is invalid.
-    '''
-    tokenized = tokenizer.parseString(s)
-    parsed = parser.parse(tokenized)
-    merged = _merge_children(parsed, {'S', 'group'})
-    final = _clean_parse_tree(merged)
-    return final
+    """
+    try:
+        tokenized = tokenizer.parseString(s)
+        parsed = parser.parse(tokenized)
+        merged = _merge_children(parsed.next(), {'S', 'group'})
+        final = _clean_parse_tree(merged)
+        return final
+    except StopIteration:
+        # This happens with an empty tree; see this with input e.g. 'H2O(' or 'Xe+'.
+        raise ParseException("Shouldn't have empty trees")


 def _check_equality(tuple1, tuple2):
-    ''' return True if tuples of multimolecules are equal '''
+    """
+    Return True if the tuples of multimolecules are equal.
+    """
    list1 = list(tuple1)
    list2 = list(tuple2)

@@ -264,14 +278,16 @@ def _check_equality(tuple1, tuple2):


 def compare_chemical_expression(s1, s2, ignore_state=False):
-    ''' It does comparison between two expressions.
-        It uses divide_chemical_expression and check if division is 1
-    '''
+    """
+    Compare two chemical expressions for equality.
+    Uses divide_chemical_expression and checks whether the resulting factor is 1.
+    """
    return divide_chemical_expression(s1, s2, ignore_state) == 1


 def divide_chemical_expression(s1, s2, ignore_state=False):
-    '''Compare two chemical expressions for equivalence up to a multiplicative factor:
+    """
+    Compare two chemical expressions for equivalence up to a multiplicative factor:

    - If they are not the same chemicals, returns False.
    - If they are the same, "divide" s1 by s2 to returns a factor x such that s1 / s2 == x as a Fraction object.
@@ -290,12 +306,13 @@ def divide_chemical_expression(s1, s2, ignore_state=False):
             for equality of every element in list,
        - return result of factor division

-    '''
+    """

    # parsed final trees
-    treedic = {}
-    treedic['1'] = _get_final_tree(s1)
-    treedic['2'] = _get_final_tree(s2)
+    treedic = {
+        '1': _get_final_tree(s1),
+        '2': _get_final_tree(s2)
+    }

    # strip phases and factors
    # collect factors in list
@@ -303,10 +320,10 @@ def divide_chemical_expression(s1, s2, ignore_state=False):
        treedic[i + ' cleaned_mm_list'] = []
        treedic[i + ' factors'] = []
        treedic[i + ' phases'] = []
-        for el in treedic[i].subtrees(filter=lambda t: t.node == 'multimolecule'):
-            count_subtree = [t for t in el.subtrees() if t.node == 'count']
-            group_subtree = [t for t in el.subtrees() if t.node == 'group']
-            phase_subtree = [t for t in el.subtrees() if t.node == 'phase']
+        for el in treedic[i].subtrees(filter=lambda t: t.label() == 'multimolecule'):
+            count_subtree = [t for t in el.subtrees() if t.label() == 'count']
+            group_subtree = [t for t in el.subtrees() if t.label() == 'group']
+            phase_subtree = [t for t in el.subtrees() if t.label() == 'phase']
            if count_subtree:
                if len(count_subtree[0]) > 1:
                    treedic[i + ' factors'].append(
--- a/common/lib/chem/setup.py
+++ b/common/lib/chem/setup.py
@@ -8,6 +8,6 @@ setup(
        "pyparsing==2.0.7",
        "numpy==1.6.2",
        "scipy==0.14.0",
-        "nltk==2.0.6",
+        "nltk==3.2.5",
    ],
 )
--- a/requirements/edx-sandbox/base.txt
+++ b/requirements/edx-sandbox/base.txt
@@ -8,12 +8,9 @@
 setuptools==37.0.0
 pip==9.0.1

-
+nltk==3.2.5
 numpy==1.6.2
 networkx==1.7
 sympy==0.7.1
 pyparsing==2.0.7
 cryptography==1.9
-
-# We forked NLTK just to make it work with setuptools instead of distribute
-git+https://github.com/edx/nltk.git@2.0.6#egg=nltk==2.0.6
--- a/requirements/edx/base.txt
+++ b/requirements/edx/base.txt
@@ -79,6 +79,7 @@ Markdown>=2.6,<2.7
 mongoengine==0.10.0
 MySQL-python==1.2.5
 networkx==1.7
+nltk==3.2.5
 nose-xunitmp==0.3.2
 oauthlib==1.0.3
 path.py==8.2.1
--- a/requirements/edx/github.txt
+++ b/requirements/edx/github.txt
@@ -53,7 +53,6 @@ git+https://github.com/jazzband/django-pipeline.git@d068a019169c9de5ee20ece041a6
 -e git+https://github.com/edx/django-wiki.git@v0.0.17#egg=django-wiki
 git+https://github.com/edx/django-openid-auth.git@0.14#egg=django-openid-auth==0.14
 git+https://github.com/edx/MongoDBProxy.git@25b99097615bda06bd7cdfe5669ed80dc2a7fed0#egg=MongoDBProxy==0.1.0
-git+https://github.com/edx/nltk.git@2.0.6#egg=nltk==2.0.6
 -e git+https://github.com/dementrock/pystache_custom.git@776973740bdaad83a3b029f96e415a7d1e8bec2f#egg=pystache_custom-dev
 -e git+https://github.com/appliedsec/pygeoip.git@95e69341cebf5a6a9fbf7c4f5439d458898bdc3b#egg=pygeoip
 -e git+https://github.com/jazkarta/edx-jsme.git@690dbf75441fa91c7c4899df0b83d77f7deb5458#egg=edx-jsme