From 149b0140539ebd574ebc74d9fa4d22c9fb092604 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Sun, 1 Aug 2021 00:48:44 -0400 Subject: [PATCH] fix: disallow "data:" links in discussion posts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sanitizes Markdown that goes back and forth between the server and client side, to strip out data: links, so that they cannot be abused. There is no present vulnerability to this issue–modern browsers disallow data links in the first place, and we already filter this out in both client-side code as well as the HTML generated in the REST API (it's run through bleach). But we're adding this anyway, to further reduce the odds that some client-side mistake could cause a vulnerability. This is part of TNL-8589. --- .../django_comment_client/base/views.py | 11 +++--- .../django_comment_client/tests/test_utils.py | 37 ++++++++++++++++++- .../discussion/django_comment_client/utils.py | 24 +++++++++++- lms/djangoapps/discussion/views.py | 2 +- 4 files changed, 65 insertions(+), 9 deletions(-) diff --git a/lms/djangoapps/discussion/django_comment_client/base/views.py b/lms/djangoapps/discussion/django_comment_client/base/views.py index 9422238f8e..509724b0a3 100644 --- a/lms/djangoapps/discussion/django_comment_client/base/views.py +++ b/lms/djangoapps/discussion/django_comment_client/base/views.py @@ -39,7 +39,8 @@ from lms.djangoapps.discussion.django_comment_client.utils import ( get_group_id_for_comments_service, get_user_group_ids, is_comment_too_deep, - prepare_content + prepare_content, + sanitize_body, ) from openedx.core.djangoapps.django_comment_common.signals import ( comment_created, @@ -263,7 +264,7 @@ def create_thread(request, course_id, commentable_id): 'course_id': str(course_key), 'user_id': user.id, 'thread_type': post["thread_type"], - 'body': post["body"], + 'body': sanitize_body(post["body"]), 'title': post["title"], } @@ -326,7 +327,7 @@ def update_thread(request, course_id, thread_id): thread = cc.Thread.find(thread_id) # Get thread context first in order to be safe from reseting the values of thread object later thread_context = getattr(thread, "context", "course") - thread.body = request.POST["body"] + thread.body = sanitize_body(request.POST["body"]) thread.title = request.POST["title"] user = request.user # The following checks should avoid issues we've seen during deploys, where end users are hitting an updated server @@ -381,7 +382,7 @@ def _create_comment(request, course_key, thread_id=None, parent_id=None): course_id=str(course_key), thread_id=thread_id, parent_id=parent_id, - body=post["body"] + body=sanitize_body(post["body"]), ) comment.save() @@ -441,7 +442,7 @@ def update_comment(request, course_id, comment_id): comment = cc.Comment.find(comment_id) if 'body' not in request.POST or not request.POST['body'].strip(): return JsonError(_("Body can't be empty")) - comment.body = request.POST["body"] + comment.body = sanitize_body(request.POST["body"]) comment.save() comment_edited.send(sender=None, user=request.user, post=comment) diff --git a/lms/djangoapps/discussion/django_comment_client/tests/test_utils.py b/lms/djangoapps/discussion/django_comment_client/tests/test_utils.py index 8cb2b3d9db..31441b7fe0 100644 --- a/lms/djangoapps/discussion/django_comment_client/tests/test_utils.py +++ b/lms/djangoapps/discussion/django_comment_client/tests/test_utils.py @@ -3,6 +3,7 @@ import datetime import json +import unittest from unittest import mock from unittest.mock import Mock, patch @@ -31,7 +32,7 @@ from openedx.core.djangoapps.course_groups.cohorts import set_course_cohorted from openedx.core.djangoapps.course_groups.tests.helpers import CohortFactory, config_course_cohorts from openedx.core.djangoapps.django_comment_common.comment_client.utils import ( CommentClientMaintenanceError, - perform_request + perform_request, ) from openedx.core.djangoapps.django_comment_common.models import ( CourseDiscussionSettings, @@ -1745,3 +1746,37 @@ class MiscUtilsTests(TestCase): thread_data['course_id'] = CourseKey.from_string(course_id) assert utils.permalink(thread_data) == expected_url + + +@ddt.ddt +class SanitizeTests(unittest.TestCase): + """Pure functional tests around sanitizing Markdown""" + + @ddt.data( + (None, None), + ("", ""), + ( + "No substitutions, even if there's data: ", + "No substitutions, even if there's data: ", + ), + ( + """ + [Click here](data:text/html;base64,PHNjcmlwdD5hbGVydCgxKTwvc2NyaXB0Pg==) Some Text + + [This link is fine](https://www.openedx.org) + + More Text [Click here](data:text/html;base64,PHNjcmlwdD5hbGVydCgxKTwvc2NyaXB0P) + """, + """ + [Click here]() Some Text + + [This link is fine](https://www.openedx.org) + + More Text [Click here]() + """, + ), + ) + @ddt.unpack + def test_input_output(self, input_str, expected_output): + """Test a range of inputs for cleanup.""" + assert utils.sanitize_body(input_str) == expected_output diff --git a/lms/djangoapps/discussion/django_comment_client/utils.py b/lms/djangoapps/discussion/django_comment_client/utils.py index 8207c206e3..ee5c6cc129 100644 --- a/lms/djangoapps/discussion/django_comment_client/utils.py +++ b/lms/djangoapps/discussion/django_comment_client/utils.py @@ -1,8 +1,7 @@ # pylint: skip-file - - import json import logging +import regex from collections import defaultdict from datetime import datetime @@ -773,6 +772,10 @@ def prepare_content(content, course_key, is_staff=False, discussion_division_ena content = strip_none(extract(content, fields)) + # Replace the content body with a sanitized version + if 'body' in content: + content['body'] = sanitize_body(content['body']) + if content.get("endorsement"): endorsement = content["endorsement"] endorser = None @@ -1076,3 +1079,20 @@ def is_content_authored_by(content, user): return int(content.get('user_id')) == user.id except (ValueError, TypeError): return False + + +def sanitize_body(body): + """ + Return a sanitized version of the body with dangerous Markdown removed. + + This is possibly overly broad, and might tamper with legitimate posts that + contain this code in fenced code blocks. As far as we can tell, this is an + extra layer of protection, and current handling in the front end and using + bleach for HTML rendering on the server side should cover these cases. + """ + if not body: + return body + + # This will remove the Markdown style links with data: URLs, and turn them + # into empty links. + return regex.sub(r'\]\(data:[^)]+\)', ']()', body) diff --git a/lms/djangoapps/discussion/views.py b/lms/djangoapps/discussion/views.py index 85d357e487..0bf58a2b3a 100644 --- a/lms/djangoapps/discussion/views.py +++ b/lms/djangoapps/discussion/views.py @@ -45,7 +45,7 @@ from lms.djangoapps.discussion.django_comment_client.utils import ( get_group_id_for_user, get_group_names_by_id, is_commentable_divided, - strip_none + strip_none, ) from lms.djangoapps.discussion.exceptions import TeamDiscussionHiddenFromUserException from lms.djangoapps.experiments.utils import get_experiment_user_metadata_context