use bleach instead of lxml.html.clean for sanitize_html OEE

2013-10-21 17:06:32 -07:00
parent 33720a85bf
commit 79ce043219
3 changed files with 86 additions and 35 deletions
--- a/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py
+++ b/common/lib/xmodule/xmodule/open_ended_grading_classes/openendedchild.py
@@ -1,7 +1,7 @@
 import json
 import logging
-from lxml.html.clean import Cleaner, autolink_html
 import re
+import bleach
 from xmodule.progress import Progress
 import capa.xqueue_interface as xqueue_interface
 from capa.util import *
@@ -50,24 +50,14 @@ def upload_to_s3(file_to_upload, keyname, s3_interface):

    return public_url

-class WhiteListCleaner(Cleaner):
-    """
-    By default, lxml cleaner strips out all links that are not in a defined whitelist.
-    We want to allow all links, and rely on the peer grading flagging mechanic to catch
-    the "bad" ones.  So, don't define a whitelist at all.
-    """
-    def allow_embedded_url(self, el, url):
-        """
-        Override the Cleaner allow_embedded_url method to remove the whitelist url requirement.
-        Ensure that any tags not in the whitelist are stripped beforehand.
-        """
-
-        # Tell cleaner to strip any element with a tag that isn't whitelisted.
-        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
-            return False
-
-        # Tell cleaner to allow all urls.
-        return True
+# Attribute whitelist sanitize_html passes to bleach.clean(); the '*' entry applies to every allowed tag
+ALLOWED_HTML_ATTRS = {
+    '*': ['id', 'class', 'height', 'width', 'alt'],
+    'a': ['href', 'title', 'rel'],
+    'embed': ['src'],
+    'iframe': ['src'],
+    'img': ['src'],
+}


 class OpenEndedChild(object):
@@ -228,22 +218,19 @@ class OpenEndedChild(object):
        answer - any string
        return - a cleaned version of the string
        """
-        try:
-            answer = autolink_html(answer)
-            cleaner = WhiteListCleaner(
-                style=True,
-                links=True,
-                add_nofollow=False,
-                page_structure=True,
-                safe_attrs_only=True,
-                whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',)
-            )
-            clean_html = cleaner.clean_html(answer)
-            clean_html = re.sub(r'</p>$', '', re.sub(r'^<p>', '', clean_html))
-            clean_html = re.sub("\n","<br/>", clean_html)
-        except Exception:
-            clean_html = answer
-        return clean_html
+        clean_html = bleach.clean(answer,
+                                  tags=['embed', 'iframe', 'a', 'img', 'br'],
+                                  attributes=ALLOWED_HTML_ATTRS,
+                                  strip=True)
+        return OpenEndedChild.replace_newlines(clean_html)
+
+    @staticmethod
+    def replace_newlines(html):
+        """
+        Strips a leading <p> and a trailing </p>, then replaces each newline with <br/>
+        """
+        retv = re.sub(r'</p>$', '', re.sub(r'^<p>', '', html))
+        return re.sub("\n","<br/>", retv)

    def new_history_entry(self, answer):
        """
--- a/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py
+++ b/common/lib/xmodule/xmodule/tests/test_combined_open_ended.py
@@ -1001,3 +1001,65 @@ class OpenEndedModuleXmlImageUploadTest(unittest.TestCase, DummyModulestore):
        self.assertTrue(response['success'])
        self.assertIn(self.answer_link, response['student_response'])
        self.assertIn(self.autolink_tag, response['student_response'])
+
+
+class OpenEndedModuleUtilTest(unittest.TestCase):
+    """
+    Tests for the util functions of OpenEndedChild: the sanitize_html cleaner and the replace_newlines <br/> inserter
+    """
+    script_dirty = u'<script>alert("xss!")</script>'
+    script_clean = u'alert("xss!")'
+    img_dirty = u'<img alt="cats" height="200" onclick="eval()" src="http://example.com/lolcats.jpg" width="200">'
+    img_clean = u'<img alt="cats" height="200" src="http://example.com/lolcats.jpg" width="200">'
+    embed_dirty = u'<embed height="200" id="cats" onhover="eval()" src="http://example.com/lolcats.swf" width="200">'
+    embed_clean = u'<embed height="200" id="cats" src="http://example.com/lolcats.swf" width="200">'
+    iframe_dirty = u'<iframe class="cats" height="200" onerror="eval()" src="http://example.com/lolcats" width="200"></iframe>'
+    iframe_clean = u'<iframe class="cats" height="200" src="http://example.com/lolcats" width="200"></iframe>'
+
+    text = u'I am a \u201c\xfcber student\u201d'
+    text_lessthan_noencd = u'This used to be broken < by the other parser. 3>5'
+    text_lessthan_encode = u'This used to be broken &lt; by the other parser. 3&gt;5'
+    text_linebreaks = u"St\xfcdent submission:\nI like lamp."
+    text_brs = u"St\xfcdent submission:<br/>I like lamp."
+
+    def test_script(self):
+        """
+        Basic test for stripping <script>
+        """
+        self.assertEqual(OpenEndedChild.sanitize_html(self.script_dirty), self.script_clean)
+
+    def test_img(self):
+        """
+        Basic test for passing through img, but stripping bad attr
+        """
+        self.assertEqual(OpenEndedChild.sanitize_html(self.img_dirty), self.img_clean)
+
+    def test_embed(self):
+        """
+        Basic test for passing through embed, but stripping bad attr
+        """
+        self.assertEqual(OpenEndedChild.sanitize_html(self.embed_dirty), self.embed_clean)
+
+    def test_iframe(self):
+        """
+        Basic test for passing through iframe, but stripping bad attr
+        """
+        self.assertEqual(OpenEndedChild.sanitize_html(self.iframe_dirty), self.iframe_clean)
+
+    def test_text(self):
+        """
+        Test for passing through text unchanged, including unicode
+        """
+        self.assertEqual(OpenEndedChild.sanitize_html(self.text), self.text)
+
+    def test_lessthan(self):
+        """
+        Tests that `<` in text context is handled properly
+        """
+        self.assertEqual(OpenEndedChild.sanitize_html(self.text_lessthan_noencd), self.text_lessthan_encode)
+
+    def test_linebreaks(self):
+        """
+        tests the replace_newlines function
+        """
+        self.assertEqual(OpenEndedChild.replace_newlines(self.text_linebreaks), self.text_brs)
--- a/requirements/edx/base.txt
+++ b/requirements/edx/base.txt
@@ -8,6 +8,8 @@

 beautifulsoup4==4.1.3
 beautifulsoup==3.2.1
+bleach==1.2.2
+html5lib==0.95
 boto==2.6.0
 celery==3.0.19
 dealer==0.2.3