Prevent unicode error

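This PR prevents a `UnicodeDecodeError` in `contextualize_text` when variable substitution mixes unicode and byte strings (e.g. replacing a `$variable` in the text with a non-ASCII context value). Example: PROD-680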
2019-09-23 18:55:01 +05:00
parent a51c362561
commit c0e0318f08
2 changed files with 45 additions and 7 deletions
--- a/common/lib/capa/capa/tests/test_util.py
+++ b/common/lib/capa/capa/tests/test_util.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 """
 Tests capa util
 """
@@ -5,14 +6,23 @@ from __future__ import absolute_import

 import unittest

+import ddt
 from lxml import etree

 from capa.tests.helpers import test_capa_system
-from capa.util import compare_with_tolerance, get_inner_html_from_xpath, remove_markup, sanitize_html
+from capa.util import (
+    compare_with_tolerance,
+    contextualize_text,
+    get_inner_html_from_xpath,
+    remove_markup,
+    sanitize_html
+)


+@ddt.ddt
 class UtilTest(unittest.TestCase):
    """Tests for util"""
+
    def setUp(self):
        super(UtilTest, self).setUp()
        self.system = test_capa_system()
@@ -138,3 +148,24 @@ class UtilTest(unittest.TestCase):
            remove_markup("The <mark>Truth</mark> is <em>Out There</em> & you need to <strong>find</strong> it"),
            "The Truth is Out There &amp; you need to find it"
        )
+
+    @ddt.data(
+        'When the root level failš the whole hierarchy won’t work anymore.',
+        'あなたあなたあなた'
+    )
+    def test_contextualize_text(self, context_value):
+        """Verify that variable substitution works as intended with non-ascii characters."""
+        key = 'answer0'
+        text = '$answer0'
+        context = {key: context_value}
+        contextual_text = contextualize_text(text, context)
+        self.assertEqual(context_value, contextual_text)
+
+    def test_contextualize_text_with_non_ascii_context(self):
+        """Verify that variable substitution works when both the text and the context values contain non-ascii characters."""
+        key = u'あなた$a $b'
+        text = '$' + key
+        context = {'a': u'あなたあなたあなた', 'b': u'あなたhi'}
+        expected_text = '$あなたあなたあなたあなた あなたhi'
+        contextual_text = contextualize_text(text, context)
+        self.assertEqual(expected_text, contextual_text)
--- a/common/lib/capa/capa/util.py
+++ b/common/lib/capa/capa/util.py
@@ -100,20 +100,27 @@ def contextualize_text(text, context):  # private
    Takes a string with variables. E.g. $a+$b.
    Does a substitution of those variables from the context
    """
+    def convert_to_str(value):
+        """Return `str(value)`, falling back to a UTF-8 encoding of `value` if that raises UnicodeEncodeError."""
+        try:
+            return str(value)
+        except UnicodeEncodeError:
+            return value.encode('utf8', errors='ignore')
+
    if not text:
        return text
+
    for key in sorted(context, key=len, reverse=True):
        # TODO (vshnayder): This whole replacement thing is a big hack
        # right now--context contains not just the vars defined in the
        # program, but also e.g. a reference to the numpy module.
        # Should be a separate dict of variables that should be
        # replaced.
-        if '$' + key in text:
-            try:
-                s = str(context[key])
-            except UnicodeEncodeError:
-                s = context[key].encode('utf8', errors='ignore')
-            text = text.replace('$' + key, s)
+        context_key = '$' + key
+        if context_key in text:
+            text = convert_to_str(text)
+            context_value = convert_to_str(context[key])
+            text = text.replace(context_key, context_value)
    return text