From bbd1d8d09ed1a9f39d657d19b485d9a56d39836d Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Thu, 16 May 2013 13:58:05 -0400 Subject: [PATCH] From code review: the hash was shallow, so nested objects could have hashed differently when they didn't need to. --- common/lib/capa/capa/safe_exec/__init__.py | 2 +- common/lib/capa/capa/safe_exec/safe_exec.py | 27 +++++++- .../capa/safe_exec/tests/test_safe_exec.py | 64 +++++++++++++++++-- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/common/lib/capa/capa/safe_exec/__init__.py b/common/lib/capa/capa/safe_exec/__init__.py index bbfea1c138..ffbe8f2320 100644 --- a/common/lib/capa/capa/safe_exec/__init__.py +++ b/common/lib/capa/capa/safe_exec/__init__.py @@ -1,3 +1,3 @@ """Capa's specialized use of codejail.safe_exec.""" -from .safe_exec import safe_exec +from .safe_exec import safe_exec, update_hash diff --git a/common/lib/capa/capa/safe_exec/safe_exec.py b/common/lib/capa/capa/safe_exec/safe_exec.py index 1a33c88bca..b9cdf236bd 100644 --- a/common/lib/capa/capa/safe_exec/safe_exec.py +++ b/common/lib/capa/capa/safe_exec/safe_exec.py @@ -47,6 +47,29 @@ for name, modname in ASSUMED_IMPORTS: LAZY_IMPORTS = "".join(LAZY_IMPORTS) +def update_hash(hasher, obj): + """ + Update a `hashlib` hasher with a nested object. + + To properly cache nested structures, we need to compute a hash from the + entire structure, canonicalizing at every level. + + `hasher`'s `.update()` method is called a number of times, touching all of + `obj` in the process. Only primitive JSON-safe types are supported. + + """ + hasher.update(str(type(obj))) + if isinstance(obj, (tuple, list)): + for e in obj: + update_hash(hasher, e) + elif isinstance(obj, dict): + for k in sorted(obj): + update_hash(hasher, k) + update_hash(hasher, obj[k]) + else: + hasher.update(repr(obj)) + + @statsd.timed('capa.safe_exec.time') def safe_exec(code, globals_dict, random_seed=None, python_path=None, cache=None): """ @@ -67,10 +90,10 @@ def safe_exec(code, globals_dict, random_seed=None, python_path=None, cache=None """ # Check the cache for a previous result. if cache: - canonical_globals = sorted(json_safe(globals_dict).iteritems()) + safe_globals = json_safe(globals_dict) md5er = hashlib.md5() md5er.update(repr(code)) - md5er.update(repr(canonical_globals)) + update_hash(md5er, safe_globals) key = "safe_exec.%r.%s" % (random_seed, md5er.hexdigest()) cached = cache.get(key) if cached is not None: diff --git a/common/lib/capa/capa/safe_exec/tests/test_safe_exec.py b/common/lib/capa/capa/safe_exec/tests/test_safe_exec.py index d79140f33c..4592af8305 100644 --- a/common/lib/capa/capa/safe_exec/tests/test_safe_exec.py +++ b/common/lib/capa/capa/safe_exec/tests/test_safe_exec.py @@ -1,11 +1,12 @@ """Test safe_exec.py""" +import hashlib import os.path import random import textwrap import unittest -from capa.safe_exec import safe_exec +from capa.safe_exec import safe_exec, update_hash from codejail.safe_exec import SafeExecException @@ -145,18 +146,73 @@ class TestSafeExecCaching(unittest.TestCase): def test_unicode_submission(self): # Check that using non-ASCII unicode does not raise an encoding error. - # Try several non-ASCII unicode characters for code in [129, 500, 2**8 - 1, 2**16 - 1]: - code_with_unichr = unicode("# ") + unichr(code) - try: safe_exec(code_with_unichr, {}, cache=DictCache({})) except UnicodeEncodeError: self.fail("Tried executing code with non-ASCII unicode: {0}".format(code)) +class TestUpdateHash(unittest.TestCase): + """Test the safe_exec.update_hash function to be sure it canonicalizes properly.""" + + def hash_obj(self, obj): + """Return the md5 hash that `update_hash` makes us.""" + md5er = hashlib.md5() + update_hash(md5er, obj) + return md5er.hexdigest() + + def equal_but_different_dicts(self): + """ + Make two equal dicts with different key order. + + Simple literals won't do it. Filling one and then shrinking it will + make them different. + + """ + d1 = {k:1 for k in "abcdefghijklmnopqrstuvwxyz"} + d2 = dict(d1) + for i in xrange(10000): + d2[i] = 1 + for i in xrange(10000): + del d2[i] + + # Check that our dicts are equal, but with different key order. + self.assertEqual(d1, d2) + self.assertNotEqual(d1.keys(), d2.keys()) + + return d1, d2 + + def test_simple_cases(self): + h1 = self.hash_obj(1) + h10 = self.hash_obj(10) + hs1 = self.hash_obj("1") + + self.assertNotEqual(h1, h10) + self.assertNotEqual(h1, hs1) + + def test_list_ordering(self): + h1 = self.hash_obj({'a': [1,2,3]}) + h2 = self.hash_obj({'a': [3,2,1]}) + self.assertNotEqual(h1, h2) + + def test_dict_ordering(self): + d1, d2 = self.equal_but_different_dicts() + h1 = self.hash_obj(d1) + h2 = self.hash_obj(d2) + self.assertEqual(h1, h2) + + def test_deep_ordering(self): + d1, d2 = self.equal_but_different_dicts() + o1 = {'a':[1, 2, [d1], 3, 4]} + o2 = {'a':[1, 2, [d2], 3, 4]} + h1 = self.hash_obj(o1) + h2 = self.hash_obj(o2) + self.assertEqual(h1, h2) + + class TestRealProblems(unittest.TestCase): def test_802x(self): code = textwrap.dedent("""\