Merge pull request #17413 from edx/bmedx/upgrade_nltk

Upgrade NLTK to 3.2.5, fixup usage and tests
This commit is contained in:
Brian Mesick
2018-02-07 15:39:22 -05:00
committed by GitHub
7 changed files with 120 additions and 107 deletions

View File

@@ -9,7 +9,7 @@ from __future__ import division
import string
import nltk
from nltk.metrics.distance import edit_distance
from django.conf import settings
from django.core.exceptions import ValidationError
from django.utils.translation import ugettext_lazy as _
@@ -106,6 +106,6 @@ def validate_password_dictionary(value):
if password_max_edit_distance and password_dictionary:
for word in password_dictionary:
distance = nltk.metrics.distance.edit_distance(value, word)
distance = edit_distance(value, word)
if distance <= password_max_edit_distance:
raise ValidationError(_("Too similar to a restricted dictionary word."), code="dictionary_word")

View File

@@ -51,9 +51,9 @@ def quote_attr(s):
class OptionInputTest(unittest.TestCase):
'''
"""
Make sure option inputs work
'''
"""
def test_rendering(self):
xml_str = """<optioninput options="('Up','Down','Don't know')" id="sky_input" correct="Up"/>"""
@@ -89,7 +89,9 @@ class OptionInputTest(unittest.TestCase):
f = inputtypes.OptionInput.parse_options
def check(input, options):
"""Take list of options, confirm that output is in the silly doubled format"""
"""
Take list of options, confirm that output is in the silly doubled format
"""
expected = [(o, o) for o in options]
self.assertEqual(f(input), expected)
@@ -108,9 +110,9 @@ class OptionInputTest(unittest.TestCase):
class ChoiceGroupTest(unittest.TestCase):
'''
"""
Test choice groups, radio groups, and checkbox groups
'''
"""
def check_group(self, tag, expected_input_type, expected_suffix):
xml_str = """
@@ -248,9 +250,9 @@ class JSInputTest(unittest.TestCase):
class TextLineTest(unittest.TestCase):
'''
"""
Check that textline inputs work, with and without math.
'''
"""
def test_rendering(self):
size = "42"
@@ -369,9 +371,9 @@ class TextLineTest(unittest.TestCase):
class FileSubmissionTest(unittest.TestCase):
'''
"""
Check that file submission inputs work
'''
"""
def test_rendering(self):
allowed_files = "runme.py nooooo.rb ohai.java"
@@ -413,9 +415,9 @@ class FileSubmissionTest(unittest.TestCase):
class CodeInputTest(unittest.TestCase):
'''
"""
Check that codeinput inputs work
'''
"""
def test_rendering(self):
mode = "parrot"
@@ -434,8 +436,6 @@ class CodeInputTest(unittest.TestCase):
element = etree.fromstring(xml_str)
escapedict = {'"': '&quot;'}
state = {
'value': 'print "good evening"',
'status': 'incomplete',
@@ -471,9 +471,9 @@ class CodeInputTest(unittest.TestCase):
class MatlabTest(unittest.TestCase):
'''
"""
Test Matlab input types
'''
"""
def setUp(self):
super(MatlabTest, self).setUp()
self.rows = '10'
@@ -921,10 +921,9 @@ def html_tree_equal(received, expected):
class SchematicTest(unittest.TestCase):
'''
"""
Check that schematic inputs work
'''
"""
def test_rendering(self):
height = '12'
width = '33'
@@ -977,10 +976,9 @@ class SchematicTest(unittest.TestCase):
class ImageInputTest(unittest.TestCase):
'''
"""
Check that image inputs work
'''
"""
def check(self, value, egx, egy):
height = '78'
width = '427'
@@ -1037,10 +1035,9 @@ class ImageInputTest(unittest.TestCase):
class CrystallographyTest(unittest.TestCase):
'''
"""
Check that crystallography inputs work
'''
"""
def test_rendering(self):
height = '12'
width = '33'
@@ -1079,10 +1076,9 @@ class CrystallographyTest(unittest.TestCase):
class VseprTest(unittest.TestCase):
'''
"""
Check that vsepr inputs work
'''
"""
def test_rendering(self):
height = '12'
width = '33'
@@ -1127,9 +1123,9 @@ class VseprTest(unittest.TestCase):
class ChemicalEquationTest(unittest.TestCase):
'''
"""
Check that chemical equation inputs work.
'''
"""
def setUp(self):
super(ChemicalEquationTest, self).setUp()
self.size = "42"
@@ -1144,7 +1140,9 @@ class ChemicalEquationTest(unittest.TestCase):
self.the_input = lookup_tag('chemicalequationinput')(test_capa_system(), element, state)
def test_rendering(self):
''' Verify that the render context matches the expected render context'''
"""
Verify that the render context matches the expected render context
"""
context = self.the_input._get_render_context() # pylint: disable=protected-access
prob_id = 'prob_1_2'
expected = {
@@ -1161,7 +1159,9 @@ class ChemicalEquationTest(unittest.TestCase):
self.assertEqual(context, expected)
def test_chemcalc_ajax_sucess(self):
''' Verify that using the correct dispatch and valid data produces a valid response'''
"""
Verify that using the correct dispatch and valid data produces a valid response
"""
data = {'formula': "H"}
response = self.the_input.handle_ajax("preview_chemcalc", data)
@@ -1366,10 +1366,9 @@ class FormulaEquationTest(unittest.TestCase):
class DragAndDropTest(unittest.TestCase):
'''
"""
Check that drag and drop inputs work
'''
"""
def test_rendering(self):
path_to_images = '/dummy-static/images/'
@@ -1441,9 +1440,9 @@ class DragAndDropTest(unittest.TestCase):
class AnnotationInputTest(unittest.TestCase):
'''
"""
Make sure annotation inputs work
'''
"""
def test_rendering(self):
xml_str = '''
<annotationinput>

View File

@@ -8,7 +8,7 @@ from pyparsing import Literal, OneOrMore, ParseException, StringEnd
ARROWS = ('<->', '->')
## Defines a simple pyparsing tokenizer for chemical equations
# Defines a simple pyparsing tokenizer for chemical equations
elements = ['Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be',
'Bh', 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm',
'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu',
@@ -30,7 +30,8 @@ tokenizer = OneOrMore(tokens) + StringEnd()
def _orjoin(l):
return "'" + "' | '".join(l) + "'"
## Defines an NLTK parser for tokenized expressions
# Defines an NLTK parser for tokenized expressions
grammar = """
S -> multimolecule | multimolecule '+' S
multimolecule -> count molecule | molecule
@@ -52,16 +53,19 @@ grammar = """
suffixed -> unsuffixed | unsuffixed suffix
"""
parser = nltk.ChartParser(nltk.parse_cfg(grammar))
parser = nltk.ChartParser(nltk.CFG.fromstring(grammar))
def _clean_parse_tree(tree):
''' The parse tree contains a lot of redundant
"""
The parse tree contains a lot of redundant
nodes. E.g. paren_groups have groups as children, etc. This will
clean up the tree.
'''
"""
def unparse_number(n):
''' Go from a number parse tree to a number '''
"""
Go from a number parse tree to a number
"""
if len(n) == 1:
rv = n[0][0]
else:
@@ -69,19 +73,22 @@ def _clean_parse_tree(tree):
return rv
def null_tag(n):
''' Remove a tag '''
"""
Remove a tag
"""
return n[0]
def ion_suffix(n):
'''1. "if" part handles special case
2. "else" part is general behaviour '''
if n[1:][0].node == 'number' and n[1:][0][0][0] == '1':
"""
1. "if" part handles special case
2. "else" part is general behaviour
"""
if n[1:][0].label() == 'number' and n[1:][0][0][0] == '1':
# if suffix is explicitly 1, like ^1-
# strip 1, leave only sign: ^-
return nltk.tree.Tree(n.node, n[2:])
return nltk.tree.Tree(n.label(), n[2:])
else:
return nltk.tree.Tree(n.node, n[1:])
return nltk.tree.Tree(n.label(), n[1:])
dispatch = {'number': lambda x: nltk.tree.Tree("number", [unparse_number(x)]),
'unphased': null_tag,
@@ -89,40 +96,38 @@ def _clean_parse_tree(tree):
'number_suffix': lambda x: nltk.tree.Tree('number_suffix', [unparse_number(x[0])]),
'suffixed': lambda x: len(x) > 1 and x or x[0],
'ion_suffix': ion_suffix,
'paren_group_square': lambda x: nltk.tree.Tree(x.node, x[1]),
'paren_group_round': lambda x: nltk.tree.Tree(x.node, x[1])}
'paren_group_square': lambda x: nltk.tree.Tree(x.label(), x[1]),
'paren_group_round': lambda x: nltk.tree.Tree(x.label(), x[1])}
if isinstance(tree, str):
return tree
old_node = None
## This loop means that if a node is processed, and returns a child,
## the child will be processed.
while tree.node in dispatch and tree.node != old_node:
old_node = tree.node
tree = dispatch[tree.node](tree)
# This loop means that if a node is processed, and returns a child,
# the child will be processed.
while tree.label() in dispatch and tree.label() != old_node:
old_node = tree.label()
tree = dispatch[tree.label()](tree)
children = []
for child in tree:
child = _clean_parse_tree(child)
children.append(child)
tree = nltk.tree.Tree(tree.node, children)
tree = nltk.tree.Tree(tree.label(), children)
return tree
def _merge_children(tree, tags):
''' nltk, by documentation, cannot do arbitrary length
groups. Instead of:
(group 1 2 3 4)
It has to handle this recursively:
(group 1 (group 2 (group 3 (group 4))))
"""
nltk, by documentation, cannot do arbitrary length groups.
Instead of: (group 1 2 3 4)
It has to handle this recursively: (group 1 (group 2 (group 3 (group 4))))
We do the cleanup of converting from the latter to the former.
'''
"""
if tree is None:
# There was a problem--shouldn't have empty trees (NOTE: see this with input e.g. 'H2O(', or 'Xe+').
# Haven't grokked the code to tell if this is indeed the right thing to do.
raise ParseException("Shouldn't have empty trees")
if isinstance(tree, str):
@@ -130,32 +135,31 @@ def _merge_children(tree, tags):
merged_children = []
done = False
#print '00000', tree
## Merge current tag
# Merge current tag
while not done:
done = True
for child in tree:
if isinstance(child, nltk.tree.Tree) and child.node == tree.node and tree.node in tags:
if isinstance(child, nltk.tree.Tree) and child.label() == tree.label() and tree.label() in tags:
merged_children = merged_children + list(child)
done = False
else:
merged_children = merged_children + [child]
tree = nltk.tree.Tree(tree.node, merged_children)
tree = nltk.tree.Tree(tree.label(), merged_children)
merged_children = []
#print '======',tree
# And recurse
children = []
for child in tree:
children.append(_merge_children(child, tags))
#return tree
return nltk.tree.Tree(tree.node, children)
return nltk.tree.Tree(tree.label(), children)
def _render_to_html(tree):
''' Renders a cleaned tree to HTML '''
"""
Renders a cleaned tree to HTML
"""
def molecule_count(tree, children):
# If an integer, return that integer
if len(tree) == 1:
@@ -187,24 +191,28 @@ def _render_to_html(tree):
return tree
else:
children = "".join(map(_render_to_html, tree))
if tree.node in dispatch:
return dispatch[tree.node](tree, children)
if tree.label() in dispatch:
return dispatch[tree.label()](tree, children)
else:
return children.replace(' ', '')
def render_to_html(eq):
'''
"""
Render a chemical equation string to html.
Renders each molecule separately, and returns invalid input wrapped in a <span>.
'''
"""
def err(s):
"Render as an error span"
"""
Render as an error span
"""
return '<span class="inline-error inline">{0}</span>'.format(s)
def render_arrow(arrow):
"""Turn text arrows into pretty ones"""
"""
Turn text arrows into pretty ones
"""
if arrow == '->':
return u'\u2192'
if arrow == '<->':
@@ -235,20 +243,26 @@ def render_to_html(eq):
def _get_final_tree(s):
    """
    Return final tree after merge and clean.

    Tokenizes `s`, takes the first parse produced by the chart parser,
    merges nested 'S' and 'group' nodes and then cleans the result.

    Raises pyparsing.ParseException if s is invalid.
    """
    tokenized = tokenizer.parseString(s)
    try:
        # Only the first parse is used. The next() builtin (rather than the
        # .next() method) works on both Python 2 and Python 3 iterators.
        parsed = next(parser.parse(tokenized))
    except StopIteration:
        # No parse was produced, i.e. an empty tree - seen with input
        # such as 'H2O(' or 'Xe+'.
        raise ParseException("Shouldn't have empty trees")
    # Kept outside the try so a StopIteration escaping the helpers is not
    # misreported as an empty parse.
    merged = _merge_children(parsed, {'S', 'group'})
    return _clean_parse_tree(merged)
def _check_equality(tuple1, tuple2):
''' return True if tuples of multimolecules are equal '''
"""
return True if tuples of multimolecules are equal
"""
list1 = list(tuple1)
list2 = list(tuple2)
@@ -264,14 +278,16 @@ def _check_equality(tuple1, tuple2):
def compare_chemical_expression(s1, s2, ignore_state=False):
    """
    Compare two chemical expressions.

    Delegates to divide_chemical_expression, passing `ignore_state` through,
    and returns whether the result of that division equals 1.
    """
    factor = divide_chemical_expression(s1, s2, ignore_state)
    return factor == 1
def divide_chemical_expression(s1, s2, ignore_state=False):
'''Compare two chemical expressions for equivalence up to a multiplicative factor:
"""
Compare two chemical expressions for equivalence up to a multiplicative factor:
- If they are not the same chemicals, returns False.
- If they are the same, "divide" s1 by s2 to return a factor x such that s1 / s2 == x, as a Fraction object.
@@ -290,12 +306,13 @@ def divide_chemical_expression(s1, s2, ignore_state=False):
for equality of every element in list,
- return result of factor division
'''
"""
# parsed final trees
treedic = {}
treedic['1'] = _get_final_tree(s1)
treedic['2'] = _get_final_tree(s2)
treedic = {
'1': _get_final_tree(s1),
'2': _get_final_tree(s2)
}
# strip phases and factors
# collect factors in list
@@ -303,10 +320,10 @@ def divide_chemical_expression(s1, s2, ignore_state=False):
treedic[i + ' cleaned_mm_list'] = []
treedic[i + ' factors'] = []
treedic[i + ' phases'] = []
for el in treedic[i].subtrees(filter=lambda t: t.node == 'multimolecule'):
count_subtree = [t for t in el.subtrees() if t.node == 'count']
group_subtree = [t for t in el.subtrees() if t.node == 'group']
phase_subtree = [t for t in el.subtrees() if t.node == 'phase']
for el in treedic[i].subtrees(filter=lambda t: t.label() == 'multimolecule'):
count_subtree = [t for t in el.subtrees() if t.label() == 'count']
group_subtree = [t for t in el.subtrees() if t.label() == 'group']
phase_subtree = [t for t in el.subtrees() if t.label() == 'phase']
if count_subtree:
if len(count_subtree[0]) > 1:
treedic[i + ' factors'].append(

View File

@@ -8,6 +8,6 @@ setup(
"pyparsing==2.0.7",
"numpy==1.6.2",
"scipy==0.14.0",
"nltk==2.0.6",
"nltk==3.2.5",
],
)

View File

@@ -8,12 +8,9 @@
setuptools==37.0.0
pip==9.0.1
nltk==3.2.5
numpy==1.6.2
networkx==1.7
sympy==0.7.1
pyparsing==2.0.7
cryptography==1.9
# We forked NLTK just to make it work with setuptools instead of distribute
git+https://github.com/edx/nltk.git@2.0.6#egg=nltk==2.0.6

View File

@@ -79,6 +79,7 @@ Markdown>=2.6,<2.7
mongoengine==0.10.0
MySQL-python==1.2.5
networkx==1.7
nltk==3.2.5
nose-xunitmp==0.3.2
oauthlib==1.0.3
path.py==8.2.1

View File

@@ -53,7 +53,6 @@ git+https://github.com/jazzband/django-pipeline.git@d068a019169c9de5ee20ece041a6
-e git+https://github.com/edx/django-wiki.git@v0.0.17#egg=django-wiki
git+https://github.com/edx/django-openid-auth.git@0.14#egg=django-openid-auth==0.14
git+https://github.com/edx/MongoDBProxy.git@25b99097615bda06bd7cdfe5669ed80dc2a7fed0#egg=MongoDBProxy==0.1.0
git+https://github.com/edx/nltk.git@2.0.6#egg=nltk==2.0.6
-e git+https://github.com/dementrock/pystache_custom.git@776973740bdaad83a3b029f96e415a7d1e8bec2f#egg=pystache_custom-dev
-e git+https://github.com/appliedsec/pygeoip.git@95e69341cebf5a6a9fbf7c4f5439d458898bdc3b#egg=pygeoip
-e git+https://github.com/jazkarta/edx-jsme.git@690dbf75441fa91c7c4899df0b83d77f7deb5458#egg=edx-jsme