Merge pull request #8541 from edx/gprice/discussion-api-rendered-body

Add rendered_body to discussion API endpoints
This commit is contained in:
Greg Price
2015-06-17 22:51:38 -04:00
6 changed files with 217 additions and 0 deletions

View File

@@ -0,0 +1,105 @@
"""
Content rendering functionality
Note that this module is designed to imitate the front end behavior as
implemented in Markdown.Sanitizer.js.
"""
import re
import markdown
# These patterns could be more flexible about things like attributes and
# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the
# patterns defined therein.
TAG_PATTERN = re.compile(r"<[^>]*>?")
SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>")
ALLOWED_BASIC_TAG_PATTERN = re.compile(
r"^(</?(b|blockquote|code|del|dd|dl|dt|em|h1|h2|h3|i|kbd|li|ol|p|pre|s|sup|sub|strong|strike|ul)>|<(br|hr)\s?/?>)$"
)
ALLOWED_A_PATTERN = re.compile(
r'^(<a\shref="((https?|ftp)://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\stitle="[^"<>]+")?\s?>|</a>)$'
)
ALLOWED_IMG_PATTERN = re.compile(
r'^(<img\ssrc="(https?://|/)[-A-Za-z0-9+&@#/%?=~_|!:,.;\(\)]+"(\swidth="\d{1,3}")?'
r'(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?/?>)$'
)
def _sanitize_tag(match):
"""Return the tag if it is allowed or the empty string otherwise"""
tag = match.group(0)
if (
ALLOWED_BASIC_TAG_PATTERN.match(tag) or
ALLOWED_A_PATTERN.match(tag) or
ALLOWED_IMG_PATTERN.match(tag)
):
return tag
else:
return ""
def _sanitize_html(source):
"""
Return source with all non-allowed tags removed, preserving the text content
"""
return TAG_PATTERN.sub(_sanitize_tag, source)
def _remove_unpaired_tags(source):
"""
Return source with all unpaired tags removed, preserving the text content
source should have already been sanitized
"""
tag_matches = list(SANITIZED_TAG_PATTERN.finditer(source))
if not tag_matches:
return source
tag_stack = []
tag_name_stack = []
text_stack = [source[:tag_matches[0].start()]]
for i, match in enumerate(tag_matches):
tag_name = match.group(2)
following_text = (
source[match.end():tag_matches[i + 1].start()] if i + 1 < len(tag_matches) else
source[match.end():]
)
if tag_name in ["p", "img", "br", "li", "hr"]: # tags that don't require closing
text_stack[-1] += match.group(0) + following_text
elif match.group(1): # end tag
if tag_name in tag_name_stack: # paired with a start tag somewhere
# pop tags until we find the matching one, keeping the non-tag text
while True:
popped_tag_name = tag_name_stack.pop()
popped_tag = tag_stack.pop()
popped_text = text_stack.pop()
if popped_tag_name == tag_name:
text_stack[-1] += popped_tag + popped_text + match.group(0)
break
else:
text_stack[-1] += popped_text
# else unpaired; drop the tag
text_stack[-1] += following_text
else: # start tag
tag_stack.append(match.group(0))
tag_name_stack.append(tag_name)
text_stack.append(following_text)
return "".join(text_stack)
def render_body(raw_body):
"""
Render raw_body to HTML.
This includes the following steps:
* Convert Markdown to HTML
* Strip non-whitelisted HTML
* Remove unbalanced HTML tags
Note that this does not prevent Markdown syntax inside a MathJax block from
being processed, which the forums JavaScript code does.
"""
rendered = markdown.markdown(raw_body)
rendered = _sanitize_html(rendered)
rendered = _remove_unpaired_tags(rendered)
return rendered

View File

@@ -10,6 +10,7 @@ from django.core.urlresolvers import reverse
from rest_framework import serializers
from discussion_api.render import render_body
from django_comment_common.models import (
FORUM_ROLE_ADMINISTRATOR,
FORUM_ROLE_COMMUNITY_TA,
@@ -65,6 +66,7 @@ class _ContentSerializer(serializers.Serializer):
created_at = serializers.CharField(read_only=True)
updated_at = serializers.CharField(read_only=True)
raw_body = NonEmptyCharField(source="body")
rendered_body = serializers.SerializerMethodField("get_rendered_body")
abuse_flagged = serializers.SerializerMethodField("get_abuse_flagged")
voted = serializers.SerializerMethodField("get_voted")
vote_count = serializers.SerializerMethodField("get_vote_count")
@@ -122,6 +124,10 @@ class _ContentSerializer(serializers.Serializer):
"""Returns the role label for the content author."""
return None if self._is_anonymous(obj) else self._get_user_label(int(obj["user_id"]))
def get_rendered_body(self, obj):
"""Returns the rendered body content."""
return render_body(obj["body"])
def get_abuse_flagged(self, obj):
"""
Returns a boolean indicating whether the requester has flagged the

View File

@@ -597,6 +597,7 @@ class GetThreadListTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest
"type": "discussion",
"title": "Test Title",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"pinned": False,
"closed": False,
"following": False,
@@ -622,6 +623,7 @@ class GetThreadListTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest
"type": "question",
"title": "Another Test Title",
"raw_body": "More content",
"rendered_body": "<p>More content</p>",
"pinned": False,
"closed": True,
"following": False,
@@ -953,6 +955,7 @@ class GetCommentListTest(CommentsServiceMockMixin, ModuleStoreTestCase):
"created_at": "2015-05-11T00:00:00Z",
"updated_at": "2015-05-11T11:11:11Z",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,
@@ -971,6 +974,7 @@ class GetCommentListTest(CommentsServiceMockMixin, ModuleStoreTestCase):
"created_at": "2015-05-11T22:22:22Z",
"updated_at": "2015-05-11T33:33:33Z",
"raw_body": "More content",
"rendered_body": "<p>More content</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,
@@ -1186,6 +1190,7 @@ class CreateThreadTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTestC
"type": "discussion",
"title": "Test Title",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"pinned": False,
"closed": False,
"following": False,
@@ -1353,6 +1358,7 @@ class CreateCommentTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest
"created_at": "2015-05-27T00:00:00Z",
"updated_at": "2015-05-27T00:00:00Z",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,
@@ -1562,6 +1568,7 @@ class UpdateThreadTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTestC
"type": "discussion",
"title": "Original Title",
"raw_body": "Edited body",
"rendered_body": "<p>Edited body</p>",
"pinned": False,
"closed": False,
"following": False,
@@ -1825,6 +1832,7 @@ class UpdateCommentTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest
"created_at": "2015-06-03T00:00:00Z",
"updated_at": "2015-06-03T00:00:00Z",
"raw_body": "Edited body",
"rendered_body": "<p>Edited body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,

View File

@@ -0,0 +1,90 @@
"""
Tests for content rendering
"""
from unittest import TestCase
import ddt
from discussion_api.render import render_body
def _add_p_tags(raw_body):
"""Return raw_body surrounded by p tags"""
return "<p>{raw_body}</p>".format(raw_body=raw_body)
@ddt.ddt
class RenderBodyTest(TestCase):
"""Tests for render_body"""
def test_empty(self):
self.assertEqual(render_body(""), "")
@ddt.data(
("*", "em"),
("**", "strong"),
("`", "code"),
)
@ddt.unpack
def test_markdown_inline(self, delimiter, tag):
self.assertEqual(
render_body("{delimiter}some text{delimiter}".format(delimiter=delimiter)),
"<p><{tag}>some text</{tag}></p>".format(tag=tag)
)
@ddt.data(
"b", "blockquote", "code", "del", "dd", "dl", "dt", "em", "h1", "h2", "h3", "i", "kbd",
"li", "ol", "p", "pre", "s", "sup", "sub", "strong", "strike", "ul"
)
def test_openclose_tag(self, tag):
raw_body = "<{tag}>some text</{tag}>".format(tag=tag)
is_inline_tag = tag in ["b", "code", "del", "em", "i", "kbd", "s", "sup", "sub", "strong", "strike"]
rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body
self.assertEqual(render_body(raw_body), rendered_body)
@ddt.data("br", "hr")
def test_selfclosing_tag(self, tag):
raw_body = "<{tag}>".format(tag=tag)
is_inline_tag = tag == "br"
rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body
self.assertEqual(render_body(raw_body), rendered_body)
@ddt.data("http", "https", "ftp")
def test_allowed_a_tag(self, protocol):
raw_body = '<a href="{protocol}://foo" title="bar">baz</a>'.format(protocol=protocol)
self.assertEqual(render_body(raw_body), _add_p_tags(raw_body))
def test_disallowed_a_tag(self):
raw_body = '<a href="gopher://foo">link content</a>'
self.assertEqual(render_body(raw_body), "<p>link content</p>")
@ddt.data("http", "https")
def test_allowed_img_tag(self, protocol):
raw_body = '<img src="{protocol}://foo" width="111" height="222" alt="bar" title="baz">'.format(
protocol=protocol
)
self.assertEqual(render_body(raw_body), _add_p_tags(raw_body))
def test_disallowed_img_tag(self):
raw_body = '<img src="gopher://foo">'
self.assertEqual(render_body(raw_body), "<p></p>")
def test_script_tag(self):
raw_body = '<script type="text/javascript">alert("evil script");</script>'
self.assertEqual(render_body(raw_body), 'alert("evil script");')
@ddt.data("p", "br", "li", "hr") # img is tested above
def test_allowed_unpaired_tags(self, tag):
raw_body = "foo<{tag}>bar".format(tag=tag)
self.assertEqual(render_body(raw_body), _add_p_tags(raw_body))
def test_unpaired_start_tag(self):
self.assertEqual(render_body("foo<i>bar"), "<p>foobar</p>")
def test_unpaired_end_tag(self):
self.assertEqual(render_body("foo</i>bar"), "<p>foobar</p>")
def test_interleaved_tags(self):
self.assertEqual(
render_body("foo<i>bar<b>baz</i>quux</b>greg"),
"<p>foo<i>barbaz</i>quuxgreg</p>"
)

View File

@@ -184,6 +184,7 @@ class ThreadSerializerSerializationTest(SerializerTestMixin, ModuleStoreTestCase
"type": "discussion",
"title": "Test Title",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"pinned": True,
"closed": False,
"following": False,
@@ -281,6 +282,7 @@ class CommentSerializerTest(SerializerTestMixin, ModuleStoreTestCase):
"created_at": "2015-04-28T00:00:00Z",
"updated_at": "2015-04-28T11:11:11Z",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,

View File

@@ -191,6 +191,7 @@ class ThreadViewSetListTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase):
"type": "discussion",
"title": "Test Title",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"pinned": False,
"closed": False,
"following": False,
@@ -303,6 +304,7 @@ class ThreadViewSetCreateTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase):
"type": "discussion",
"title": "Test Title",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"pinned": False,
"closed": False,
"following": False,
@@ -392,6 +394,7 @@ class ThreadViewSetPartialUpdateTest(DiscussionAPIViewTestMixin, ModuleStoreTest
"type": "discussion",
"title": "Original Title",
"raw_body": "Edited body",
"rendered_body": "<p>Edited body</p>",
"pinned": False,
"closed": False,
"following": False,
@@ -536,6 +539,7 @@ class CommentViewSetListTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase):
"created_at": "2015-05-11T00:00:00Z",
"updated_at": "2015-05-11T11:11:11Z",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,
@@ -688,6 +692,7 @@ class CommentViewSetCreateTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase):
"created_at": "2015-05-27T00:00:00Z",
"updated_at": "2015-05-27T00:00:00Z",
"raw_body": "Test body",
"rendered_body": "<p>Test body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,
@@ -770,6 +775,7 @@ class CommentViewSetPartialUpdateTest(DiscussionAPIViewTestMixin, ModuleStoreTes
"created_at": "2015-06-03T00:00:00Z",
"updated_at": "2015-06-03T00:00:00Z",
"raw_body": "Edited body",
"rendered_body": "<p>Edited body</p>",
"endorsed": False,
"endorsed_by": None,
"endorsed_by_label": None,