From c0e0318f080ca43b441e63b1adbb9aba5abb4e51 Mon Sep 17 00:00:00 2001 From: Awais Jibran Date: Mon, 23 Sep 2019 18:55:01 +0500 Subject: [PATCH] Prevent unicode error This PR prevents a `UnicodeDecodeError` in `contextualize_text` when a placeholder in unicode text is replaced with a non-ASCII byte string. Example: PROD-680 --- common/lib/capa/capa/tests/test_util.py | 33 ++++++++++++++++++++++++- common/lib/capa/capa/util.py | 19 +++++++++----- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/common/lib/capa/capa/tests/test_util.py b/common/lib/capa/capa/tests/test_util.py index ed910146ef..3ebd40e80f 100644 --- a/common/lib/capa/capa/tests/test_util.py +++ b/common/lib/capa/capa/tests/test_util.py @@ -1,3 +1,4 @@ +# coding=utf-8 """ Tests capa util """ @@ -5,14 +6,23 @@ from __future__ import absolute_import import unittest +import ddt from lxml import etree from capa.tests.helpers import test_capa_system -from capa.util import compare_with_tolerance, get_inner_html_from_xpath, remove_markup, sanitize_html +from capa.util import ( + compare_with_tolerance, + contextualize_text, + get_inner_html_from_xpath, + remove_markup, + sanitize_html +) +@ddt.ddt class UtilTest(unittest.TestCase): """Tests for util""" + def setUp(self): super(UtilTest, self).setUp() self.system = test_capa_system() @@ -138,3 +148,24 @@ class UtilTest(unittest.TestCase): remove_markup("The Truth is Out There & you need to find it"), "The Truth is Out There & you need to find it" ) + + @ddt.data( + 'When the root level failš the whole hierarchy won’t work anymore.', + 'あなたあなたあなた' + ) + def test_contextualize_text(self, context_value): + """Verify that variable substitution works as intended with non-ascii characters.""" + key = 'answer0' + text = '$answer0' + context = {key: context_value} + contextual_text = contextualize_text(text, context) + self.assertEqual(context_value, contextual_text) + + def test_contextualize_text_with_non_ascii_context(self): + """Verify that variable substitution works as intended with non-ascii characters.""" 
+ key = u'あなた$a $b' + text = '$' + key + context = {'a': u'あなたあなたあなた', 'b': u'あなたhi'} + expected_text = '$あなたあなたあなたあなた あなたhi' + contextual_text = contextualize_text(text, context) + self.assertEqual(expected_text, contextual_text) diff --git a/common/lib/capa/capa/util.py b/common/lib/capa/capa/util.py index f7d8f5b466..2b700e4c18 100644 --- a/common/lib/capa/capa/util.py +++ b/common/lib/capa/capa/util.py @@ -100,20 +100,27 @@ def contextualize_text(text, context): # private Takes a string with variables. E.g. $a+$b. Does a substitution of those variables from the context """ + def convert_to_str(value): + """The method tries to convert unicode/non-ascii values into string""" + try: + return str(value) + except UnicodeEncodeError: + return value.encode('utf8', errors='ignore') + if not text: return text + for key in sorted(context, key=len, reverse=True): # TODO (vshnayder): This whole replacement thing is a big hack # right now--context contains not just the vars defined in the # program, but also e.g. a reference to the numpy module. # Should be a separate dict of variables that should be # replaced. - if '$' + key in text: - try: - s = str(context[key]) - except UnicodeEncodeError: - s = context[key].encode('utf8', errors='ignore') - text = text.replace('$' + key, s) + context_key = '$' + key + if context_key in text: + text = convert_to_str(text) + context_value = convert_to_str(context[key]) + text = text.replace(context_key, context_value) return text