fix!: use bleach instead of custom code to sanitise forum posts (#28641)

The rest API for discussions was using custom HTML sanitisation code that
matched the behaviour of the client-side library. This replaces the custom
code with bleach for better security.
This commit is contained in:
Kshitij Sobti
2021-09-23 06:37:05 +00:00
committed by GitHub
parent 6f0255bc60
commit 32748788bf
2 changed files with 69 additions and 118 deletions

View File

@@ -4,88 +4,18 @@ Content rendering functionality
Note that this module is designed to imitate the front end behavior as
implemented in Markdown.Sanitizer.js.
"""
import re
import bleach
import markdown
# These patterns could be more flexible about things like attributes and
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
# patterns defined therein.
TAG_PATTERN = re.compile(r"<[^>]*>?")
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")
ALLOWED_BASIC_TAG_PATTERN = re.compile(
r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre|s|sup|sub|strong|strike|ul)>|<(br|hr)\s?/?>)$"
)
ALLOWED_A_PATTERN = re.compile(
r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\stitle="[^"<>]+")?\s?>|</a>)$'
)
ALLOWED_IMG_PATTERN = re.compile(
r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\swidth="\d{1,3}")?'
r'(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
)
def _sanitize_tag(match):
"""Return the tag if it is allowed or the empty string otherwise"""
tag = match.group(0)
if (
ALLOWED_BASIC_TAG_PATTERN.match(tag) or
ALLOWED_A_PATTERN.match(tag) or
ALLOWED_IMG_PATTERN.match(tag)
):
return tag
else:
return ""
def _sanitize_html(source):
"""
Return source with all non-allowed tags removed, preserving the text content
"""
return TAG_PATTERN.sub(_sanitize_tag, source)
def _remove_unpaired_tags(source):
"""
Return source with all unpaired tags removed, preserving the text content
source should have already been sanitized
"""
tag_matches = list(SANITIZED_TAG_PATTERN.finditer(source))
if not tag_matches:
return source
tag_stack = []
tag_name_stack = []
text_stack = [source[:tag_matches[0].start()]]
for i, match in enumerate(tag_matches):
tag_name = match.group(2)
following_text = (
source[match.end():tag_matches[i + 1].start()] if i + 1 < len(tag_matches) else
source[match.end():]
)
if tag_name in ["p", "img", "br", "li", "hr"]: # tags that don't require closing
text_stack[-1] += match.group(0) + following_text
elif match.group(1): # end tag
if tag_name in tag_name_stack: # paired with a start tag somewhere
# pop tags until we find the matching one, keeping the non-tag text
while True:
popped_tag_name = tag_name_stack.pop()
popped_tag = tag_stack.pop()
popped_text = text_stack.pop()
if popped_tag_name == tag_name:
text_stack[-1] += popped_tag + popped_text + match.group(0)
break
else:
text_stack[-1] += popped_text
# else unpaired; drop the tag
text_stack[-1] += following_text
else: # start tag
tag_stack.append(match.group(0))
tag_name_stack.append(tag_name)
text_stack.append(following_text)
return "".join(text_stack)
ALLOWED_TAGS = bleach.ALLOWED_TAGS + [
'br', 'dd', 'del', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'hr', 'img', 'kbd', 'p', 'pre', 's',
'strike', 'sub', 'sup'
]
ALLOWED_PROTOCOLS = ["http", "https", "ftp", "mailto"]
ALLOWED_ATTRIBUTES = {
"a": ["href", "title"],
"img": ["src", "alt", "title", "width", "height"],
}
def render_body(raw_body):
@@ -95,13 +25,17 @@ def render_body(raw_body):
This includes the following steps:
* Convert Markdown to HTML
* Strip non-whitelisted HTML
* Remove unbalanced HTML tags
* Sanitise HTML using bleach
Note that this does not prevent Markdown syntax inside a MathJax block from
being processed, which the forums JavaScript code does.
"""
rendered = markdown.markdown(raw_body)
rendered = _sanitize_html(rendered)
rendered = _remove_unpaired_tags(rendered)
return rendered
rendered_html = markdown.markdown(raw_body)
sanitised_html = bleach.clean(
rendered_html,
tags=ALLOWED_TAGS,
protocols=ALLOWED_PROTOCOLS,
strip=True,
attributes=ALLOWED_ATTRIBUTES
)
return sanitised_html

View File

@@ -3,9 +3,8 @@ Tests for content rendering
"""
from unittest import TestCase
import ddt
from django.test import TestCase
from lms.djangoapps.discussion.rest_api.render import render_body
@@ -29,15 +28,14 @@ class RenderBodyTest(TestCase):
)
@ddt.unpack
def test_markdown_inline(self, delimiter, tag):
assert render_body('{delimiter}some text{delimiter}'.format(delimiter=delimiter)) == \
'<p><{tag}>some text</{tag}></p>'.format(tag=tag)
assert render_body(f'{delimiter}some text{delimiter}') == f'<p><{tag}>some text</{tag}></p>'
@ddt.data(
"b", "blockquote", "code", "del", "dd", "dl", "dt", "em", "h1", "h2", "h3", "i", "kbd",
"li", "ol", "p", "pre", "s", "sup", "sub", "strong", "strike", "ul"
)
def test_openclose_tag(self, tag):
raw_body = "<{tag}>some text</{tag}>".format(tag=tag)
raw_body = f"<{tag}>some text</{tag}>"
is_inline_tag = tag in ["b", "code", "del", "em", "i", "kbd", "s", "sup", "sub", "strong", "strike"]
rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body
assert render_body(raw_body) == rendered_body
@@ -49,40 +47,59 @@ class RenderBodyTest(TestCase):
rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body
assert render_body(raw_body) == rendered_body
@ddt.data("http", "https", "ftp")
def test_allowed_a_tag(self, protocol):
@ddt.data(
("http", True),
("https", True),
("ftp", True),
("gopher", False),
("file", False),
("data", False),
)
@ddt.unpack
def test_protocols_a_tag(self, protocol, is_allowed):
raw_body = f'<a href="{protocol}://foo" title="bar">baz</a>'
assert render_body(raw_body) == _add_p_tags(raw_body)
cleaned_body = '<a title="bar">baz</a>'
rendered = render_body(raw_body)
if is_allowed:
assert rendered == _add_p_tags(raw_body)
else:
assert rendered == _add_p_tags(cleaned_body)
def test_disallowed_a_tag(self):
raw_body = '<a href="gopher://foo">link content</a>'
assert render_body(raw_body) == '<p>link content</p>'
@ddt.data("http", "https")
def test_allowed_img_tag(self, protocol):
raw_body = '<img src="{protocol}://foo" width="111" height="222" alt="bar" title="baz">'.format(
protocol=protocol
)
assert render_body(raw_body) == _add_p_tags(raw_body)
def test_disallowed_img_tag(self):
raw_body = '<img src="gopher://foo">'
assert render_body(raw_body) == '<p></p>'
@ddt.data(
("http", True),
("https", True),
("gopher", False),
("file", False),
("data", False),
)
@ddt.unpack
def test_protocols_img_tag(self, protocol, is_allowed):
raw_body = f'<img alt="bar" height="222" src="{protocol}://foo" title="baz" width="111">'
cleaned_body = '<img alt="bar" height="222" title="baz" width="111">'
rendered = render_body(raw_body)
if is_allowed:
assert rendered == _add_p_tags(raw_body)
else:
assert rendered == _add_p_tags(cleaned_body)
def test_script_tag(self):
raw_body = '<script type="text/javascript">alert("evil script");</script>'
assert render_body(raw_body) == 'alert("evil script");'
@ddt.data("p", "br", "li", "hr") # img is tested above
def test_allowed_unpaired_tags(self, tag):
@ddt.data(
("br", '<p>foo<br>bar</p>'), # br is allowed inside p
("li", '<p>foo</p><li>bar<p></p></li>'), # unpaired li only allowed if followed by another li
("hr", '<p>foo</p><hr>bar<p></p>'), # hr is not allowed inside p
("img", '<p>foo<img>bar</p>'), # unpaired img allowed, empty img doesn't render
("i", '<p>foo<i>bar</i></p>'),
)
@ddt.unpack
def test_unpaired_tags(self, tag, rendered_output):
raw_body = f"foo<{tag}>bar"
assert render_body(raw_body) == _add_p_tags(raw_body)
def test_unpaired_start_tag(self):
assert render_body('foo<i>bar') == '<p>foobar</p>'
def test_unpaired_end_tag(self):
assert render_body('foo</i>bar') == '<p>foobar</p>'
assert render_body(raw_body) == rendered_output
def test_interleaved_tags(self):
assert render_body('foo<i>bar<b>baz</i>quux</b>greg') == '<p>foo<i>barbaz</i>quuxgreg</p>'
self.assertHTMLEqual(
render_body('foo<i>bar<b>baz</i>quux</b>greg'),
'<p>foo<i>bar<b>baz</b></i><b>quux</b>greg</p>',
)