fix!: use bleach instead of custom code to sanitise forum posts (#28641)
The rest API for discussions was using custom HTML sanitisation code that matched the behaviour of the client-side library. This replaces the custom code with bleach for better security.
This commit is contained in:
@@ -4,88 +4,18 @@ Content rendering functionality
|
||||
Note that this module is designed to imitate the front end behavior as
|
||||
implemented in Markdown.Sanitizer.js.
|
||||
"""
|
||||
|
||||
|
||||
import re
|
||||
|
||||
import bleach
|
||||
import markdown
|
||||
|
||||
# These patterns could be more flexible about things like attributes and
|
||||
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
|
||||
# patterns defined therein.
|
||||
TAG_PATTERN = re.compile(r"<[^>]*>?")
|
||||
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")
|
||||
ALLOWED_BASIC_TAG_PATTERN = re.compile(
|
||||
r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre|s|sup|sub|strong|strike|ul)>|<(br|hr)\s?/?>)$"
|
||||
)
|
||||
ALLOWED_A_PATTERN = re.compile(
|
||||
r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\stitle="[^"<>]+")?\s?>|</a>)$'
|
||||
)
|
||||
ALLOWED_IMG_PATTERN = re.compile(
|
||||
r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\swidth="\d{1,3}")?'
|
||||
r'(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
|
||||
)
|
||||
|
||||
|
||||
def _sanitize_tag(match):
|
||||
"""Return the tag if it is allowed or the empty string otherwise"""
|
||||
tag = match.group(0)
|
||||
if (
|
||||
ALLOWED_BASIC_TAG_PATTERN.match(tag) or
|
||||
ALLOWED_A_PATTERN.match(tag) or
|
||||
ALLOWED_IMG_PATTERN.match(tag)
|
||||
):
|
||||
return tag
|
||||
else:
|
||||
return ""
|
||||
|
||||
|
||||
def _sanitize_html(source):
|
||||
"""
|
||||
Return source with all non-allowed tags removed, preserving the text content
|
||||
"""
|
||||
return TAG_PATTERN.sub(_sanitize_tag, source)
|
||||
|
||||
|
||||
def _remove_unpaired_tags(source):
|
||||
"""
|
||||
Return source with all unpaired tags removed, preserving the text content
|
||||
|
||||
source should have already been sanitized
|
||||
"""
|
||||
tag_matches = list(SANITIZED_TAG_PATTERN.finditer(source))
|
||||
if not tag_matches:
|
||||
return source
|
||||
tag_stack = []
|
||||
tag_name_stack = []
|
||||
text_stack = [source[:tag_matches[0].start()]]
|
||||
for i, match in enumerate(tag_matches):
|
||||
tag_name = match.group(2)
|
||||
following_text = (
|
||||
source[match.end():tag_matches[i + 1].start()] if i + 1 < len(tag_matches) else
|
||||
source[match.end():]
|
||||
)
|
||||
if tag_name in ["p", "img", "br", "li", "hr"]: # tags that don't require closing
|
||||
text_stack[-1] += match.group(0) + following_text
|
||||
elif match.group(1): # end tag
|
||||
if tag_name in tag_name_stack: # paired with a start tag somewhere
|
||||
# pop tags until we find the matching one, keeping the non-tag text
|
||||
while True:
|
||||
popped_tag_name = tag_name_stack.pop()
|
||||
popped_tag = tag_stack.pop()
|
||||
popped_text = text_stack.pop()
|
||||
if popped_tag_name == tag_name:
|
||||
text_stack[-1] += popped_tag + popped_text + match.group(0)
|
||||
break
|
||||
else:
|
||||
text_stack[-1] += popped_text
|
||||
# else unpaired; drop the tag
|
||||
text_stack[-1] += following_text
|
||||
else: # start tag
|
||||
tag_stack.append(match.group(0))
|
||||
tag_name_stack.append(tag_name)
|
||||
text_stack.append(following_text)
|
||||
return "".join(text_stack)
|
||||
ALLOWED_TAGS = bleach.ALLOWED_TAGS + [
|
||||
'br', 'dd', 'del', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'hr', 'img', 'kbd', 'p', 'pre', 's',
|
||||
'strike', 'sub', 'sup'
|
||||
]
|
||||
ALLOWED_PROTOCOLS = ["http", "https", "ftp", "mailto"]
|
||||
ALLOWED_ATTRIBUTES = {
|
||||
"a": ["href", "title"],
|
||||
"img": ["src", "alt", "title", "width", "height"],
|
||||
}
|
||||
|
||||
|
||||
def render_body(raw_body):
|
||||
@@ -95,13 +25,17 @@ def render_body(raw_body):
|
||||
This includes the following steps:
|
||||
|
||||
* Convert Markdown to HTML
|
||||
* Strip non-whitelisted HTML
|
||||
* Remove unbalanced HTML tags
|
||||
* Sanitise HTML using bleach
|
||||
|
||||
Note that this does not prevent Markdown syntax inside a MathJax block from
|
||||
being processed, which the forums JavaScript code does.
|
||||
"""
|
||||
rendered = markdown.markdown(raw_body)
|
||||
rendered = _sanitize_html(rendered)
|
||||
rendered = _remove_unpaired_tags(rendered)
|
||||
return rendered
|
||||
rendered_html = markdown.markdown(raw_body)
|
||||
sanitised_html = bleach.clean(
|
||||
rendered_html,
|
||||
tags=ALLOWED_TAGS,
|
||||
protocols=ALLOWED_PROTOCOLS,
|
||||
strip=True,
|
||||
attributes=ALLOWED_ATTRIBUTES
|
||||
)
|
||||
return sanitised_html
|
||||
|
||||
@@ -3,9 +3,8 @@ Tests for content rendering
|
||||
"""
|
||||
|
||||
|
||||
from unittest import TestCase
|
||||
|
||||
import ddt
|
||||
from django.test import TestCase
|
||||
|
||||
from lms.djangoapps.discussion.rest_api.render import render_body
|
||||
|
||||
@@ -29,15 +28,14 @@ class RenderBodyTest(TestCase):
|
||||
)
|
||||
@ddt.unpack
|
||||
def test_markdown_inline(self, delimiter, tag):
|
||||
assert render_body('{delimiter}some text{delimiter}'.format(delimiter=delimiter)) == \
|
||||
'<p><{tag}>some text</{tag}></p>'.format(tag=tag)
|
||||
assert render_body(f'{delimiter}some text{delimiter}') == f'<p><{tag}>some text</{tag}></p>'
|
||||
|
||||
@ddt.data(
|
||||
"b", "blockquote", "code", "del", "dd", "dl", "dt", "em", "h1", "h2", "h3", "i", "kbd",
|
||||
"li", "ol", "p", "pre", "s", "sup", "sub", "strong", "strike", "ul"
|
||||
)
|
||||
def test_openclose_tag(self, tag):
|
||||
raw_body = "<{tag}>some text</{tag}>".format(tag=tag)
|
||||
raw_body = f"<{tag}>some text</{tag}>"
|
||||
is_inline_tag = tag in ["b", "code", "del", "em", "i", "kbd", "s", "sup", "sub", "strong", "strike"]
|
||||
rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body
|
||||
assert render_body(raw_body) == rendered_body
|
||||
@@ -49,40 +47,59 @@ class RenderBodyTest(TestCase):
|
||||
rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body
|
||||
assert render_body(raw_body) == rendered_body
|
||||
|
||||
@ddt.data("http", "https", "ftp")
|
||||
def test_allowed_a_tag(self, protocol):
|
||||
@ddt.data(
|
||||
("http", True),
|
||||
("https", True),
|
||||
("ftp", True),
|
||||
("gopher", False),
|
||||
("file", False),
|
||||
("data", False),
|
||||
)
|
||||
@ddt.unpack
|
||||
def test_protocols_a_tag(self, protocol, is_allowed):
|
||||
raw_body = f'<a href="{protocol}://foo" title="bar">baz</a>'
|
||||
assert render_body(raw_body) == _add_p_tags(raw_body)
|
||||
cleaned_body = '<a title="bar">baz</a>'
|
||||
rendered = render_body(raw_body)
|
||||
if is_allowed:
|
||||
assert rendered == _add_p_tags(raw_body)
|
||||
else:
|
||||
assert rendered == _add_p_tags(cleaned_body)
|
||||
|
||||
def test_disallowed_a_tag(self):
|
||||
raw_body = '<a href="gopher://foo">link content</a>'
|
||||
assert render_body(raw_body) == '<p>link content</p>'
|
||||
|
||||
@ddt.data("http", "https")
|
||||
def test_allowed_img_tag(self, protocol):
|
||||
raw_body = '<img src="{protocol}://foo" width="111" height="222" alt="bar" title="baz">'.format(
|
||||
protocol=protocol
|
||||
)
|
||||
assert render_body(raw_body) == _add_p_tags(raw_body)
|
||||
|
||||
def test_disallowed_img_tag(self):
|
||||
raw_body = '<img src="gopher://foo">'
|
||||
assert render_body(raw_body) == '<p></p>'
|
||||
@ddt.data(
|
||||
("http", True),
|
||||
("https", True),
|
||||
("gopher", False),
|
||||
("file", False),
|
||||
("data", False),
|
||||
)
|
||||
@ddt.unpack
|
||||
def test_protocols_img_tag(self, protocol, is_allowed):
|
||||
raw_body = f'<img alt="bar" height="222" src="{protocol}://foo" title="baz" width="111">'
|
||||
cleaned_body = '<img alt="bar" height="222" title="baz" width="111">'
|
||||
rendered = render_body(raw_body)
|
||||
if is_allowed:
|
||||
assert rendered == _add_p_tags(raw_body)
|
||||
else:
|
||||
assert rendered == _add_p_tags(cleaned_body)
|
||||
|
||||
def test_script_tag(self):
|
||||
raw_body = '<script type="text/javascript">alert("evil script");</script>'
|
||||
assert render_body(raw_body) == 'alert("evil script");'
|
||||
|
||||
@ddt.data("p", "br", "li", "hr") # img is tested above
|
||||
def test_allowed_unpaired_tags(self, tag):
|
||||
@ddt.data(
|
||||
("br", '<p>foo<br>bar</p>'), # br is allowed inside p
|
||||
("li", '<p>foo</p><li>bar<p></p></li>'), # unpaired li only allowed if followed by another li
|
||||
("hr", '<p>foo</p><hr>bar<p></p>'), # hr is not allowed inside p
|
||||
("img", '<p>foo<img>bar</p>'), # unpaired img allowed, empty img doesn't render
|
||||
("i", '<p>foo<i>bar</i></p>'),
|
||||
)
|
||||
@ddt.unpack
|
||||
def test_unpaired_tags(self, tag, rendered_output):
|
||||
raw_body = f"foo<{tag}>bar"
|
||||
assert render_body(raw_body) == _add_p_tags(raw_body)
|
||||
|
||||
def test_unpaired_start_tag(self):
|
||||
assert render_body('foo<i>bar') == '<p>foobar</p>'
|
||||
|
||||
def test_unpaired_end_tag(self):
|
||||
assert render_body('foo</i>bar') == '<p>foobar</p>'
|
||||
assert render_body(raw_body) == rendered_output
|
||||
|
||||
def test_interleaved_tags(self):
|
||||
assert render_body('foo<i>bar<b>baz</i>quux</b>greg') == '<p>foo<i>barbaz</i>quuxgreg</p>'
|
||||
self.assertHTMLEqual(
|
||||
render_body('foo<i>bar<b>baz</i>quux</b>greg'),
|
||||
'<p>foo<i>bar<b>baz</b></i><b>quux</b>greg</p>',
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user