diff --git a/lms/djangoapps/discussion_api/render.py b/lms/djangoapps/discussion_api/render.py new file mode 100644 index 0000000000..20f538df45 --- /dev/null +++ b/lms/djangoapps/discussion_api/render.py @@ -0,0 +1,105 @@ +""" +Content rendering functionality + +Note that this module is designed to imitate the front end behavior as +implemented in Markdown.Sanitizer.js. +""" +import re + +import markdown + +# These patterns could be more flexible about things like attributes and +# whitespace, but this is imitating Markdown.Sanitizer.js, so it uses the +# patterns defined therein. +TAG_PATTERN = re.compile(r"<[^>]*>?") +SANITIZED_TAG_PATTERN = re.compile(r"<(/?)(\w+)[^>]*>") +ALLOWED_BASIC_TAG_PATTERN = re.compile( + r"^(|<(br|hr)\s?/?>)$" +) +ALLOWED_A_PATTERN = re.compile( + r'^(]+")?\s?>|)$' +) +ALLOWED_IMG_PATTERN = re.compile( + r'^(]*")?(\stitle="[^"<>]*")?\s?/?>)$' +) + + +def _sanitize_tag(match): + """Return the tag if it is allowed or the empty string otherwise""" + tag = match.group(0) + if ( + ALLOWED_BASIC_TAG_PATTERN.match(tag) or + ALLOWED_A_PATTERN.match(tag) or + ALLOWED_IMG_PATTERN.match(tag) + ): + return tag + else: + return "" + + +def _sanitize_html(source): + """ + Return source with all non-allowed tags removed, preserving the text content + """ + return TAG_PATTERN.sub(_sanitize_tag, source) + + +def _remove_unpaired_tags(source): + """ + Return source with all unpaired tags removed, preserving the text content + + source should have already been sanitized + """ + tag_matches = list(SANITIZED_TAG_PATTERN.finditer(source)) + if not tag_matches: + return source + tag_stack = [] + tag_name_stack = [] + text_stack = [source[:tag_matches[0].start()]] + for i, match in enumerate(tag_matches): + tag_name = match.group(2) + following_text = ( + source[match.end():tag_matches[i + 1].start()] if i + 1 < len(tag_matches) else + source[match.end():] + ) + if tag_name in ["p", "img", "br", "li", "hr"]: # tags that don't require closing + text_stack[-1] += match.group(0) + following_text + elif match.group(1): # end tag + if tag_name in tag_name_stack: # paired with a start tag somewhere + # pop tags until we find the matching one, keeping the non-tag text + while True: + popped_tag_name = tag_name_stack.pop() + popped_tag = tag_stack.pop() + popped_text = text_stack.pop() + if popped_tag_name == tag_name: + text_stack[-1] += popped_tag + popped_text + match.group(0) + break + else: + text_stack[-1] += popped_text + # else unpaired; drop the tag + text_stack[-1] += following_text + else: # start tag + tag_stack.append(match.group(0)) + tag_name_stack.append(tag_name) + text_stack.append(following_text) + return "".join(text_stack) + + +def render_body(raw_body): + """ + Render raw_body to HTML. + + This includes the following steps: + + * Convert Markdown to HTML + * Strip non-whitelisted HTML + * Remove unbalanced HTML tags + + Note that this does not prevent Markdown syntax inside a MathJax block from + being processed, which the forums JavaScript code does. + """ + rendered = markdown.markdown(raw_body) + rendered = _sanitize_html(rendered) + rendered = _remove_unpaired_tags(rendered) + return rendered diff --git a/lms/djangoapps/discussion_api/serializers.py b/lms/djangoapps/discussion_api/serializers.py index a875fc59f5..290ec49b32 100644 --- a/lms/djangoapps/discussion_api/serializers.py +++ b/lms/djangoapps/discussion_api/serializers.py @@ -10,6 +10,7 @@ from django.core.urlresolvers import reverse from rest_framework import serializers +from discussion_api.render import render_body from django_comment_common.models import ( FORUM_ROLE_ADMINISTRATOR, FORUM_ROLE_COMMUNITY_TA, @@ -65,6 +66,7 @@ class _ContentSerializer(serializers.Serializer): created_at = serializers.CharField(read_only=True) updated_at = serializers.CharField(read_only=True) raw_body = NonEmptyCharField(source="body") + rendered_body = serializers.SerializerMethodField("get_rendered_body") abuse_flagged = serializers.SerializerMethodField("get_abuse_flagged") voted = serializers.SerializerMethodField("get_voted") vote_count = serializers.SerializerMethodField("get_vote_count") @@ -122,6 +124,10 @@ class _ContentSerializer(serializers.Serializer): """Returns the role label for the content author.""" return None if self._is_anonymous(obj) else self._get_user_label(int(obj["user_id"])) + def get_rendered_body(self, obj): + """Returns the rendered body content.""" + return render_body(obj["body"]) + def get_abuse_flagged(self, obj): """ Returns a boolean indicating whether the requester has flagged the diff --git a/lms/djangoapps/discussion_api/tests/test_api.py b/lms/djangoapps/discussion_api/tests/test_api.py index 51104ce04e..4afb375b5d 100644 --- a/lms/djangoapps/discussion_api/tests/test_api.py +++ b/lms/djangoapps/discussion_api/tests/test_api.py @@ -597,6 +597,7 @@ class GetThreadListTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest "type": "discussion", "title": "Test Title", "raw_body": "Test body", + "rendered_body": "

Test body

", "pinned": False, "closed": False, "following": False, @@ -622,6 +623,7 @@ class GetThreadListTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest "type": "question", "title": "Another Test Title", "raw_body": "More content", + "rendered_body": "

More content

", "pinned": False, "closed": True, "following": False, @@ -953,6 +955,7 @@ class GetCommentListTest(CommentsServiceMockMixin, ModuleStoreTestCase): "created_at": "2015-05-11T00:00:00Z", "updated_at": "2015-05-11T11:11:11Z", "raw_body": "Test body", + "rendered_body": "

Test body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, @@ -971,6 +974,7 @@ class GetCommentListTest(CommentsServiceMockMixin, ModuleStoreTestCase): "created_at": "2015-05-11T22:22:22Z", "updated_at": "2015-05-11T33:33:33Z", "raw_body": "More content", + "rendered_body": "

More content

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, @@ -1186,6 +1190,7 @@ class CreateThreadTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTestC "type": "discussion", "title": "Test Title", "raw_body": "Test body", + "rendered_body": "

Test body

", "pinned": False, "closed": False, "following": False, @@ -1353,6 +1358,7 @@ class CreateCommentTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest "created_at": "2015-05-27T00:00:00Z", "updated_at": "2015-05-27T00:00:00Z", "raw_body": "Test body", + "rendered_body": "

Test body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, @@ -1562,6 +1568,7 @@ class UpdateThreadTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTestC "type": "discussion", "title": "Original Title", "raw_body": "Edited body", + "rendered_body": "

Edited body

", "pinned": False, "closed": False, "following": False, @@ -1825,6 +1832,7 @@ class UpdateCommentTest(CommentsServiceMockMixin, UrlResetMixin, ModuleStoreTest "created_at": "2015-06-03T00:00:00Z", "updated_at": "2015-06-03T00:00:00Z", "raw_body": "Edited body", + "rendered_body": "

Edited body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, diff --git a/lms/djangoapps/discussion_api/tests/test_render.py b/lms/djangoapps/discussion_api/tests/test_render.py new file mode 100644 index 0000000000..0faa24ad53 --- /dev/null +++ b/lms/djangoapps/discussion_api/tests/test_render.py @@ -0,0 +1,90 @@ +""" +Tests for content rendering +""" +from unittest import TestCase + +import ddt + +from discussion_api.render import render_body + + +def _add_p_tags(raw_body): + """Return raw_body surrounded by p tags""" + return "

{raw_body}

".format(raw_body=raw_body) + + +@ddt.ddt +class RenderBodyTest(TestCase): + """Tests for render_body""" + def test_empty(self): + self.assertEqual(render_body(""), "") + + @ddt.data( + ("*", "em"), + ("**", "strong"), + ("`", "code"), + ) + @ddt.unpack + def test_markdown_inline(self, delimiter, tag): + self.assertEqual( + render_body("{delimiter}some text{delimiter}".format(delimiter=delimiter)), + "

<{tag}>some text

".format(tag=tag) + ) + + @ddt.data( + "b", "blockquote", "code", "del", "dd", "dl", "dt", "em", "h1", "h2", "h3", "i", "kbd", + "li", "ol", "p", "pre", "s", "sup", "sub", "strong", "strike", "ul" + ) + def test_openclose_tag(self, tag): + raw_body = "<{tag}>some text".format(tag=tag) + is_inline_tag = tag in ["b", "code", "del", "em", "i", "kbd", "s", "sup", "sub", "strong", "strike"] + rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body + self.assertEqual(render_body(raw_body), rendered_body) + + @ddt.data("br", "hr") + def test_selfclosing_tag(self, tag): + raw_body = "<{tag}>".format(tag=tag) + is_inline_tag = tag == "br" + rendered_body = _add_p_tags(raw_body) if is_inline_tag else raw_body + self.assertEqual(render_body(raw_body), rendered_body) + + @ddt.data("http", "https", "ftp") + def test_allowed_a_tag(self, protocol): + raw_body = 'baz'.format(protocol=protocol) + self.assertEqual(render_body(raw_body), _add_p_tags(raw_body)) + + def test_disallowed_a_tag(self): + raw_body = 'link content' + self.assertEqual(render_body(raw_body), "

link content

") + + @ddt.data("http", "https") + def test_allowed_img_tag(self, protocol): + raw_body = 'bar'.format( + protocol=protocol + ) + self.assertEqual(render_body(raw_body), _add_p_tags(raw_body)) + + def test_disallowed_img_tag(self): + raw_body = '' + self.assertEqual(render_body(raw_body), "

") + + def test_script_tag(self): + raw_body = '' + self.assertEqual(render_body(raw_body), 'alert("evil script");') + + @ddt.data("p", "br", "li", "hr") # img is tested above + def test_allowed_unpaired_tags(self, tag): + raw_body = "foo<{tag}>bar".format(tag=tag) + self.assertEqual(render_body(raw_body), _add_p_tags(raw_body)) + + def test_unpaired_start_tag(self): + self.assertEqual(render_body("foobar"), "

foobar

") + + def test_unpaired_end_tag(self): + self.assertEqual(render_body("foo
bar"), "

foobar

") + + def test_interleaved_tags(self): + self.assertEqual( + render_body("foobarbazquuxgreg"), + "

foobarbazquuxgreg

" + ) diff --git a/lms/djangoapps/discussion_api/tests/test_serializers.py b/lms/djangoapps/discussion_api/tests/test_serializers.py index 3d704b405e..813119d8f6 100644 --- a/lms/djangoapps/discussion_api/tests/test_serializers.py +++ b/lms/djangoapps/discussion_api/tests/test_serializers.py @@ -184,6 +184,7 @@ class ThreadSerializerSerializationTest(SerializerTestMixin, ModuleStoreTestCase "type": "discussion", "title": "Test Title", "raw_body": "Test body", + "rendered_body": "

Test body

", "pinned": True, "closed": False, "following": False, @@ -281,6 +282,7 @@ class CommentSerializerTest(SerializerTestMixin, ModuleStoreTestCase): "created_at": "2015-04-28T00:00:00Z", "updated_at": "2015-04-28T11:11:11Z", "raw_body": "Test body", + "rendered_body": "

Test body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, diff --git a/lms/djangoapps/discussion_api/tests/test_views.py b/lms/djangoapps/discussion_api/tests/test_views.py index bd817ecb0b..8bdaa6f2a6 100644 --- a/lms/djangoapps/discussion_api/tests/test_views.py +++ b/lms/djangoapps/discussion_api/tests/test_views.py @@ -191,6 +191,7 @@ class ThreadViewSetListTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase): "type": "discussion", "title": "Test Title", "raw_body": "Test body", + "rendered_body": "

Test body

", "pinned": False, "closed": False, "following": False, @@ -303,6 +304,7 @@ class ThreadViewSetCreateTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase): "type": "discussion", "title": "Test Title", "raw_body": "Test body", + "rendered_body": "

Test body

", "pinned": False, "closed": False, "following": False, @@ -392,6 +394,7 @@ class ThreadViewSetPartialUpdateTest(DiscussionAPIViewTestMixin, ModuleStoreTest "type": "discussion", "title": "Original Title", "raw_body": "Edited body", + "rendered_body": "

Edited body

", "pinned": False, "closed": False, "following": False, @@ -536,6 +539,7 @@ class CommentViewSetListTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase): "created_at": "2015-05-11T00:00:00Z", "updated_at": "2015-05-11T11:11:11Z", "raw_body": "Test body", + "rendered_body": "

Test body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, @@ -688,6 +692,7 @@ class CommentViewSetCreateTest(DiscussionAPIViewTestMixin, ModuleStoreTestCase): "created_at": "2015-05-27T00:00:00Z", "updated_at": "2015-05-27T00:00:00Z", "raw_body": "Test body", + "rendered_body": "

Test body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None, @@ -770,6 +775,7 @@ class CommentViewSetPartialUpdateTest(DiscussionAPIViewTestMixin, ModuleStoreTes "created_at": "2015-06-03T00:00:00Z", "updated_at": "2015-06-03T00:00:00Z", "raw_body": "Edited body", + "rendered_body": "

Edited body

", "endorsed": False, "endorsed_by": None, "endorsed_by_label": None,