The rendering in this commit does not prevent Markdown syntax inside MathJax content from being processed, unlike the discussion JavaScript code, which does prevent it.
106 lines
3.6 KiB
Python
106 lines
3.6 KiB
Python
"""
|
|
Content rendering functionality
|
|
|
|
Note that this module is designed to imitate the front end behavior as
|
|
implemented in Markdown.Sanitizer.js.
|
|
"""
|
|
import re
|
|
|
|
import markdown
|
|
|
|
# These patterns could be more flexible about things like attributes and
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
# patterns defined therein.
#
# NOTE: digits are spelled [0-9] rather than \d. For str patterns on Python 3,
# \d matches any Unicode decimal digit, whereas JavaScript's \d only matches
# ASCII 0-9; using the explicit class keeps the two implementations aligned.

# Anything that looks like a tag, including an unterminated "<..." that runs
# to the end of the input (the closing ">" is optional).
TAG_PATTERN = re.compile(r"<[^>]*>?")

# A complete tag. Group 1 is "/" for an end tag (empty for a start tag) and
# group 2 is the tag name.
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")

# Attribute-free start/end tags for basic formatting, plus <br> and <hr> with
# an optional self-closing slash.
ALLOWED_BASIC_TAG_PATTERN = re.compile(
    r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre|s|sup|sub|strong|strike|ul)>|<(br|hr)\s?/?>)$"
)

# <a> with an http(s)/ftp or root-relative href and an optional title, or the
# closing </a>.
ALLOWED_A_PATTERN = re.compile(
    r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\stitle="[^"<>]+")?\s?>|</a>)$'
)

# <img> with an http(s) or root-relative src, followed by optional width,
# height, alt and title attributes in exactly that order.
ALLOWED_IMG_PATTERN = re.compile(
    r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\swidth="[0-9]{1,3}")?'
    r'(\sheight="[0-9]{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
)
|
|
|
|
|
|
def _sanitize_tag(match):
    """
    Return the matched tag unchanged if it is allowed, else the empty string

    match is a regex match object whose full match (group 0) is the candidate
    tag; this function is suitable for use as a substitution callback.
    """
    candidate = match.group(0)
    # Checked in this order; the first allow-list pattern that matches wins.
    allowed_patterns = (
        ALLOWED_BASIC_TAG_PATTERN,
        ALLOWED_A_PATTERN,
        ALLOWED_IMG_PATTERN,
    )
    if any(pattern.match(candidate) for pattern in allowed_patterns):
        return candidate
    return ""
|
|
|
|
|
|
def _sanitize_html(source):
    """
    Strip every tag in source that is not allowed, leaving the text between
    tags untouched

    Each tag-like substring found by TAG_PATTERN is passed to _sanitize_tag,
    which either keeps it verbatim or replaces it with the empty string.
    """
    sanitized = TAG_PATTERN.sub(_sanitize_tag, source)
    return sanitized
|
|
|
|
|
|
def _remove_unpaired_tags(source):
    """
    Drop every start or end tag in source that has no partner, keeping all of
    the text content

    source should have already been sanitized
    """
    found_tags = list(SANITIZED_TAG_PATTERN.finditer(source))
    if not found_tags:
        return source

    # tags that don't require closing; they are always emitted as-is
    standalone_names = ("p", "img", "br", "li", "hr")

    # The text following tag k ends where tag k + 1 begins (or at the end of
    # source for the final tag).
    text_ends = [later.start() for later in found_tags[1:]] + [len(source)]

    # Stack of [tag name, start tag markup, text gathered so far]. The bottom
    # frame is not a real tag: it collects the top-level output, starting with
    # whatever precedes the first tag. Its name is None, which can never equal
    # a name captured by the pattern.
    frames = [[None, "", source[:found_tags[0].start()]]]

    for found, text_end in zip(found_tags, text_ends):
        markup = found.group(0)
        closing_slash = found.group(1)
        name = found.group(2)
        trailing_text = source[found.end():text_end]

        if name in standalone_names:
            frames[-1][2] += markup + trailing_text
            continue

        if not closing_slash:
            # start tag: hold it back until a matching end tag shows up
            frames.append([name, markup, trailing_text])
            continue

        # end tag
        if any(frame[0] == name for frame in frames):
            # Paired with a start tag somewhere on the stack. Unwind to it;
            # start tags skipped on the way are dropped but their text is kept.
            while True:
                open_name, open_markup, inner_text = frames.pop()
                if open_name == name:
                    frames[-1][2] += open_markup + inner_text + markup
                    break
                frames[-1][2] += inner_text
        # An unpaired end tag is simply dropped; either way the text that
        # follows it is preserved.
        frames[-1][2] += trailing_text

    # Start tags still on the stack were never closed: only their text is
    # emitted.
    return "".join(frame[2] for frame in frames)
|
|
|
|
|
|
def render_body(raw_body):
    """
    Render raw_body to HTML.

    The steps, in order, are:

    * Convert Markdown to HTML
    * Strip HTML tags that are not allowed
    * Remove unbalanced HTML tags

    Note that Markdown syntax inside a MathJax block is still processed here;
    the forums JavaScript code prevents that, but this function does not.
    """
    markdown_html = markdown.markdown(raw_body)
    return _remove_unpaired_tags(_sanitize_html(markdown_html))
|