use bleach instead of lxml.html.clean for sanitize_html OEE
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
from lxml.html.clean import Cleaner, autolink_html
|
||||
import re
|
||||
import bleach
|
||||
from xmodule.progress import Progress
|
||||
import capa.xqueue_interface as xqueue_interface
|
||||
from capa.util import *
|
||||
@@ -50,24 +50,14 @@ def upload_to_s3(file_to_upload, keyname, s3_interface):
|
||||
|
||||
return public_url
|
||||
|
||||
class WhiteListCleaner(Cleaner):
    """
    Cleaner variant that places no host restriction on embedded URLs.

    The stock lxml cleaner strips out links that are not in a defined whitelist.
    Here every link is allowed, and the peer grading flagging mechanic is relied
    on to catch the "bad" ones, so no URL whitelist is defined at all.
    """

    def allow_embedded_url(self, el, url):
        """
        Decide whether the embedded url found on element `el` may be kept.

        The url itself is never inspected; only the tag of `el` matters.
        Returns True when no tag whitelist is configured or when the tag is in
        `self.whitelist_tags`, and False for any other tag so that the cleaner
        strips it.
        """
        permitted_tags = self.whitelist_tags

        # Without a tag whitelist there is nothing to reject.
        if permitted_tags is None:
            return True

        # Whitelisted tags keep their urls; everything else gets stripped.
        return el.tag in permitted_tags
|
||||
# Attribute whitelist passed to bleach.clean() by sanitize_html.
# Keys are tag names and values are the attribute names kept on that tag;
# the '*' entry lists attributes kept on every allowed tag.
ALLOWED_HTML_ATTRS = {
    # Presentation / identification attributes accepted everywhere.
    "*": ["id", "class", "height", "width", "alt"],
    # Links.
    "a": ["href", "title", "rel"],
    # Embedded media only needs a source.
    "embed": ["src"],
    "iframe": ["src"],
    "img": ["src"],
}
|
||||
|
||||
|
||||
class OpenEndedChild(object):
|
||||
@@ -228,22 +218,19 @@ class OpenEndedChild(object):
|
||||
answer - any string
|
||||
return - a cleaned version of the string
|
||||
"""
|
||||
try:
|
||||
answer = autolink_html(answer)
|
||||
cleaner = WhiteListCleaner(
|
||||
style=True,
|
||||
links=True,
|
||||
add_nofollow=False,
|
||||
page_structure=True,
|
||||
safe_attrs_only=True,
|
||||
whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',)
|
||||
)
|
||||
clean_html = cleaner.clean_html(answer)
|
||||
clean_html = re.sub(r'</p>$', '', re.sub(r'^<p>', '', clean_html))
|
||||
clean_html = re.sub("\n","<br/>", clean_html)
|
||||
except Exception:
|
||||
clean_html = answer
|
||||
return clean_html
|
||||
clean_html = bleach.clean(answer,
|
||||
tags=['embed', 'iframe', 'a', 'img', 'br'],
|
||||
attributes=ALLOWED_HTML_ATTRS,
|
||||
strip=True)
|
||||
return OpenEndedChild.replace_newlines(clean_html)
|
||||
|
||||
@staticmethod
def replace_newlines(html):
    r"""
    Turn "\n" newlines in `html` into <br/> tags.

    Before the newlines are converted, a <p> at the very start of the string
    and a </p> at its end are removed.  The closing-tag pattern is anchored
    with `$`, so a </p> that is followed only by one final newline is removed
    as well.

    html - any string
    return - the string with the wrapping <p>...</p> dropped and every
             newline replaced by <br/>
    """
    # Drop the opening tag first, then the closing one.
    without_opening = re.sub(r'^<p>', '', html)
    unwrapped = re.sub(r'</p>$', '', without_opening)

    # A literal newline needs no regex; plain replacement is equivalent.
    return unwrapped.replace("\n", "<br/>")
|
||||
|
||||
def new_history_entry(self, answer):
|
||||
"""
|
||||
|
||||
@@ -1001,3 +1001,65 @@ class OpenEndedModuleXmlImageUploadTest(unittest.TestCase, DummyModulestore):
|
||||
self.assertTrue(response['success'])
|
||||
self.assertIn(self.answer_link, response['student_response'])
|
||||
self.assertIn(self.autolink_tag, response['student_response'])
|
||||
|
||||
|
||||
class OpenEndedModuleUtilTest(unittest.TestCase):
    """
    Tests for the util functions of OpenEndedModule.

    Currently covers OpenEndedChild.sanitize_html (the html sanitizer) and
    OpenEndedChild.replace_newlines (the <br/> inserter).  Each `*_dirty`
    fixture is the raw input and the matching `*_clean` fixture is the output
    expected from the sanitizer.
    """
    # <script> is not an allowed tag: the tag is stripped, its text is kept.
    script_dirty = u'<script>alert("xss!")</script>'
    script_clean = u'alert("xss!")'

    # Allowed tags pass through, but event-handler attributes are removed.
    img_dirty = u'<img alt="cats" height="200" onclick="eval()" src="http://example.com/lolcats.jpg" width="200">'
    img_clean = u'<img alt="cats" height="200" src="http://example.com/lolcats.jpg" width="200">'
    embed_dirty = u'<embed height="200" id="cats" onhover="eval()" src="http://example.com/lolcats.swf" width="200">'
    embed_clean = u'<embed height="200" id="cats" src="http://example.com/lolcats.swf" width="200">'
    # These fixtures must use a real <iframe> element; with <img> markup the
    # iframe test would only repeat the img test.
    iframe_dirty = u'<iframe class="cats" height="200" onerror="eval()" src="http://example.com/lolcats" width="200"></iframe>'
    iframe_clean = u'<iframe class="cats" height="200" src="http://example.com/lolcats" width="200"></iframe>'

    text = u'I am a \u201c\xfcber student\u201d'
    # A bare `<` / `>` in text context is expected back entity-encoded.
    text_lessthan_noencd = u'This used to be broken < by the other parser. 3>5'
    text_lessthan_encode = u'This used to be broken &lt; by the other parser. 3&gt;5'
    text_linebreaks = u"St\xfcdent submission:\nI like lamp."
    text_brs = u"St\xfcdent submission:<br/>I like lamp."

    def test_script(self):
        """
        Basic test for stripping <script>
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.script_dirty), self.script_clean)

    def test_img(self):
        """
        Basic test for passing through img, but stripping bad attr
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.img_dirty), self.img_clean)

    def test_embed(self):
        """
        Basic test for passing through embed, but stripping bad attr
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.embed_dirty), self.embed_clean)

    def test_iframe(self):
        """
        Basic test for passing through iframe, but stripping bad attr
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.iframe_dirty), self.iframe_clean)

    def test_text(self):
        """
        Test for passing through text unchanged, including unicode
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.text), self.text)

    def test_lessthan(self):
        """
        Tests that `<` in text context is handled properly
        """
        self.assertEqual(OpenEndedChild.sanitize_html(self.text_lessthan_noencd), self.text_lessthan_encode)

    def test_linebreaks(self):
        """
        Tests the replace_newlines function
        """
        self.assertEqual(OpenEndedChild.replace_newlines(self.text_linebreaks), self.text_brs)
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
|
||||
beautifulsoup4==4.1.3
|
||||
beautifulsoup==3.2.1
|
||||
bleach==1.2.2
|
||||
html5lib==0.95
|
||||
boto==2.6.0
|
||||
celery==3.0.19
|
||||
dealer==0.2.3
|
||||
|
||||
Reference in New Issue
Block a user