From c0e0318f080ca43b441e63b1adbb9aba5abb4e51 Mon Sep 17 00:00:00 2001
From: Awais Jibran <awaisdar001@gmail.com>
Date: Mon, 23 Sep 2019 18:55:01 +0500
Subject: [PATCH] Prevent unicode error

This PR prevents a `UnicodeDecodeError` in `contextualize_text` when a variable in unicode text is replaced with a UTF-8 encoded byte string.

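Example (Python 2; raised `UnicodeDecodeError` before this change):

    contextualize_text(u'$a', {'a': u'あなた'})

PROD-680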
---
 common/lib/capa/capa/tests/test_util.py | 33 ++++++++++++++++++++++++-
 common/lib/capa/capa/util.py            | 19 +++++++++-----
 2 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/common/lib/capa/capa/tests/test_util.py b/common/lib/capa/capa/tests/test_util.py
index ed910146ef..3ebd40e80f 100644
--- a/common/lib/capa/capa/tests/test_util.py
+++ b/common/lib/capa/capa/tests/test_util.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 """
 Tests capa util
 """
@@ -5,14 +6,23 @@ from __future__ import absolute_import
 
 import unittest
 
+import ddt
 from lxml import etree
 
 from capa.tests.helpers import test_capa_system
-from capa.util import compare_with_tolerance, get_inner_html_from_xpath, remove_markup, sanitize_html
+from capa.util import (
+    compare_with_tolerance,
+    contextualize_text,
+    get_inner_html_from_xpath,
+    remove_markup,
+    sanitize_html
+)
 
 
+@ddt.ddt
 class UtilTest(unittest.TestCase):
     """Tests for util"""
+
     def setUp(self):
         super(UtilTest, self).setUp()
         self.system = test_capa_system()
@@ -138,3 +148,24 @@ class UtilTest(unittest.TestCase):
             remove_markup("The <mark>Truth</mark> is <em>Out There</em> & you need to <strong>find</strong> it"),
             "The Truth is Out There &amp; you need to find it"
         )
+
+    @ddt.data(
+        'When the root level failš the whole hierarchy won’t work anymore.',
+        'あなたあなたあなた'
+    )
+    def test_contextualize_text(self, context_value):
+        """Verify that variable substitution works as intended with non-ascii characters."""
+        key = 'answer0'
+        text = '$answer0'
+        context = {key: context_value}
+        contextual_text = contextualize_text(text, context)
+        self.assertEqual(context_value, contextual_text)
+
+    def test_contextualize_text_with_non_ascii_context(self):
+        """Verify that unicode values are substituted into text that already contains non-ascii characters."""
+        key = u'あなた$a $b'
+        text = '$' + key
+        context = {'a': u'あなたあなたあなた', 'b': u'あなたhi'}
+        expected_text = '$あなたあなたあなたあなた あなたhi'
+        contextual_text = contextualize_text(text, context)
+        self.assertEqual(expected_text, contextual_text)
diff --git a/common/lib/capa/capa/util.py b/common/lib/capa/capa/util.py
index f7d8f5b466..2b700e4c18 100644
--- a/common/lib/capa/capa/util.py
+++ b/common/lib/capa/capa/util.py
@@ -100,20 +100,27 @@ def contextualize_text(text, context):  # private
     Takes a string with variables. E.g. $a+$b.
     Does a substitution of those variables from the context
     """
+    def convert_to_str(value):
+        """Return str(value), falling back to UTF-8 encoded bytes if str() raises UnicodeEncodeError."""
+        try:
+            return str(value)
+        except UnicodeEncodeError:
+            return value.encode('utf8', errors='ignore')
+
     if not text:
         return text
+
     for key in sorted(context, key=len, reverse=True):
         # TODO (vshnayder): This whole replacement thing is a big hack
         # right now--context contains not just the vars defined in the
         # program, but also e.g. a reference to the numpy module.
         # Should be a separate dict of variables that should be
         # replaced.
-        if '$' + key in text:
-            try:
-                s = str(context[key])
-            except UnicodeEncodeError:
-                s = context[key].encode('utf8', errors='ignore')
-            text = text.replace('$' + key, s)
+        context_key = '$' + key
+        if context_key in text:
+            text = convert_to_str(text)
+            context_value = convert_to_str(context[key])
+            text = text.replace(context_key, context_value)
     return text