Files
edx-platform/lms/djangoapps/discussion_api/render.py
Greg Price c28b295806 Add rendered_body to discussion API endpoints
The rendering in this commit does not prevent MathJax content from being
rendered as the discussion JavaScript code does.
2015-06-17 20:23:30 -04:00

106 lines
3.6 KiB
Python

"""
Content rendering functionality for the discussion API.

Note that this module is designed to imitate the front end behavior as
implemented in Markdown.Sanitizer.js.
"""
import re
import markdown
# These patterns could be more flexible about things like attributes and
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
# patterns defined therein.
# Any tag-like run: "<", then any characters other than ">", then an optional
# ">" (so an unterminated tag at the end of the input is matched as well).
TAG_PATTERN = re.compile(r"<[^>]*>?")

# A complete tag. Group 1 is "/" for an end tag (empty otherwise) and group 2
# is the tag name.
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")

# Attribute-free start/end tags for basic formatting elements, plus <br> and
# <hr> with an optional whitespace character and optional trailing slash.
ALLOWED_BASIC_TAG_PATTERN = re.compile(
    r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre"
    r"|s|sup|sub|strong|strike|ul)>"
    r"|<(br|hr)\s?/?>)$"
)

# <a href="..."> where the URL is http(s)/ftp or root-relative, optionally
# followed by a title attribute; or the closing </a>.
ALLOWED_A_PATTERN = re.compile(
    r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"'
    r'(\stitle="[^"<>]+")?\s?>'
    r'|</a>)$'
)

# <img src="..."> where the URL is http(s) or root-relative, optionally
# followed by width, height, alt and title attributes in exactly that order.
ALLOWED_IMG_PATTERN = re.compile(
    r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"'
    r'(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?'
    r'(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
)
def _sanitize_tag(match):
    """
    Return the matched tag unchanged if it is whitelisted, else ""

    match is a regex match object whose full text is the candidate tag, which
    makes this function usable as the replacement callback of a substitution.
    """
    candidate = match.group(0)
    whitelist = (
        ALLOWED_BASIC_TAG_PATTERN,
        ALLOWED_A_PATTERN,
        ALLOWED_IMG_PATTERN,
    )
    # any() stops at the first whitelist pattern that accepts the tag
    is_allowed = any(pattern.match(candidate) for pattern in whitelist)
    return candidate if is_allowed else ""
def _sanitize_html(source):
    """
    Strip every tag from source that is not whitelisted

    Each tag-like run found by TAG_PATTERN is replaced with whatever
    _sanitize_tag returns for it; text outside of tags is left untouched.
    """
    return TAG_PATTERN.sub(repl=_sanitize_tag, string=source)
def _remove_unpaired_tags(source):
    """
    Drop tags that have no matching partner, keeping all text content

    Start tags that are never closed and end tags that were never opened are
    removed. The tags p, img, br, li and hr are always kept because they do
    not need to be closed.

    source should have already been sanitized
    """
    matches = list(SANITIZED_TAG_PATTERN.finditer(source))
    if not matches:
        return source

    unclosed_ok = ("p", "img", "br", "li", "hr")

    # Every tag "owns" the text between itself and the next tag (or the end
    # of source for the last tag); these are the end offsets of that text.
    text_ends = [later.start() for later in matches[1:]] + [len(source)]

    # open_tags holds (name, full tag) for each start tag still awaiting its
    # end tag. segments is always exactly one longer than open_tags:
    # segments[0] is the output accumulated outside any open tag, and
    # segments[k + 1] is the output accumulated since open_tags[k] was opened.
    open_tags = []
    segments = [source[:matches[0].start()]]

    for found, text_end in zip(matches, text_ends):
        tag = found.group(0)
        slash, name = found.groups()
        trailing = source[found.end():text_end]

        if name in unclosed_ok:
            # Emitted as-is, never pushed and never matched against the stack
            segments[-1] += tag + trailing
            continue

        if not slash:
            # Start tag: hold it back until its end tag shows up
            open_tags.append((name, tag))
            segments.append(trailing)
            continue

        # End tag: only kept if some start tag of the same name is still open
        if any(name == open_name for open_name, _ in open_tags):
            # Unwind to the matching start tag. Start tags skipped on the way
            # are unpaired, so only their text survives.
            while True:
                open_name, open_tag = open_tags.pop()
                inner = segments.pop()
                if open_name == name:
                    segments[-1] += open_tag + inner + tag
                    break
                segments[-1] += inner
        segments[-1] += trailing

    # Start tags still open here were never closed: joining only the text
    # segments discards those tags while preserving their content.
    return "".join(segments)
def render_body(raw_body):
    """
    Render raw_body to HTML.

    The body goes through three stages, in this order:

    * Markdown is converted to HTML
    * HTML tags that are not whitelisted are stripped
    * Tags left without a matching partner are removed

    Note that this does not prevent Markdown syntax inside a MathJax block from
    being processed, which the forums JavaScript code does.
    """
    return _remove_unpaired_tags(
        _sanitize_html(
            markdown.markdown(raw_body)
        )
    )