fix: disallow "data:" links in discussion posts

Sanitizes Markdown that goes back and forth between the server and client side, to strip out data: links, so that they cannot be abused. There is no present vulnerability to this issue–modern browsers disallow data links in the first place, and we already filter this out in both client-side code as well as the HTML generated in the REST API (it's run through bleach). But we're adding this anyway, to further reduce the odds that some client-side mistake could cause a vulnerability. This is part of TNL-8589.
2021-08-01 00:48:44 -04:00
parent 68e8a6a981
commit 149b014053
4 changed files with 65 additions and 9 deletions
--- a/lms/djangoapps/discussion/django_comment_client/base/views.py
+++ b/lms/djangoapps/discussion/django_comment_client/base/views.py
@@ -39,7 +39,8 @@ from lms.djangoapps.discussion.django_comment_client.utils import (
    get_group_id_for_comments_service,
    get_user_group_ids,
    is_comment_too_deep,
-    prepare_content
+    prepare_content,
+    sanitize_body,
 )
 from openedx.core.djangoapps.django_comment_common.signals import (
    comment_created,
@@ -263,7 +264,7 @@ def create_thread(request, course_id, commentable_id):
        'course_id': str(course_key),
        'user_id': user.id,
        'thread_type': post["thread_type"],
-        'body': post["body"],
+        'body': sanitize_body(post["body"]),
        'title': post["title"],
    }

@@ -326,7 +327,7 @@ def update_thread(request, course_id, thread_id):
    thread = cc.Thread.find(thread_id)
    # Get thread context first in order to be safe from reseting the values of thread object later
    thread_context = getattr(thread, "context", "course")
-    thread.body = request.POST["body"]
+    thread.body = sanitize_body(request.POST["body"])
    thread.title = request.POST["title"]
    user = request.user
    # The following checks should avoid issues we've seen during deploys, where end users are hitting an updated server
@@ -381,7 +382,7 @@ def _create_comment(request, course_key, thread_id=None, parent_id=None):
        course_id=str(course_key),
        thread_id=thread_id,
        parent_id=parent_id,
-        body=post["body"]
+        body=sanitize_body(post["body"]),
    )
    comment.save()

@@ -441,7 +442,7 @@ def update_comment(request, course_id, comment_id):
    comment = cc.Comment.find(comment_id)
    if 'body' not in request.POST or not request.POST['body'].strip():
        return JsonError(_("Body can't be empty"))
-    comment.body = request.POST["body"]
+    comment.body = sanitize_body(request.POST["body"])
    comment.save()

    comment_edited.send(sender=None, user=request.user, post=comment)
--- a/lms/djangoapps/discussion/django_comment_client/tests/test_utils.py
+++ b/lms/djangoapps/discussion/django_comment_client/tests/test_utils.py
@@ -3,6 +3,7 @@

 import datetime
 import json
+import unittest
 from unittest import mock
 from unittest.mock import Mock, patch

@@ -31,7 +32,7 @@ from openedx.core.djangoapps.course_groups.cohorts import set_course_cohorted
 from openedx.core.djangoapps.course_groups.tests.helpers import CohortFactory, config_course_cohorts
 from openedx.core.djangoapps.django_comment_common.comment_client.utils import (
    CommentClientMaintenanceError,
-    perform_request
+    perform_request,
 )
 from openedx.core.djangoapps.django_comment_common.models import (
    CourseDiscussionSettings,
@@ -1745,3 +1746,37 @@ class MiscUtilsTests(TestCase):

        thread_data['course_id'] = CourseKey.from_string(course_id)
        assert utils.permalink(thread_data) == expected_url
+
+
+@ddt.ddt
+class SanitizeTests(unittest.TestCase):
+    """Pure functional tests around sanitizing Markdown"""
+
+    @ddt.data(
+        (None, None),
+        ("", ""),
+        (
+            "No substitutions, even if there's data: ",
+            "No substitutions, even if there's data: ",
+        ),
+        (
+            """
+            [Click here](data:text/html;base64,PHNjcmlwdD5hbGVydCgxKTwvc2NyaXB0Pg==) Some Text
+
+            [This link is fine](https://www.openedx.org)
+
+            More Text [Click here](data:text/html;base64,PHNjcmlwdD5hbGVydCgxKTwvc2NyaXB0P)
+            """,
+            """
+            [Click here]() Some Text
+
+            [This link is fine](https://www.openedx.org)
+
+            More Text [Click here]()
+            """,
+        ),
+    )
+    @ddt.unpack
+    def test_input_output(self, input_str, expected_output):
+        """Test a range of inputs for cleanup."""
+        assert utils.sanitize_body(input_str) == expected_output
--- a/lms/djangoapps/discussion/django_comment_client/utils.py
+++ b/lms/djangoapps/discussion/django_comment_client/utils.py
@@ -1,8 +1,7 @@
 # pylint: skip-file
-
-
 import json
 import logging
+import regex
 from collections import defaultdict
 from datetime import datetime

@@ -773,6 +772,10 @@ def prepare_content(content, course_key, is_staff=False, discussion_division_ena

    content = strip_none(extract(content, fields))

+    # Replace the content body with a sanitized version
+    if 'body' in content:
+        content['body'] = sanitize_body(content['body'])
+
    if content.get("endorsement"):
        endorsement = content["endorsement"]
        endorser = None
@@ -1076,3 +1079,20 @@ def is_content_authored_by(content, user):
        return int(content.get('user_id')) == user.id
    except (ValueError, TypeError):
        return False
+
+
+def sanitize_body(body):
+    """
+    Return a sanitized version of the body with dangerous Markdown removed.
+
+    This is possibly overly broad, and might tamper with legitimate posts that
+    contain this code in fenced code blocks. As far as we can tell, this is an
+    extra layer of protection, and current handling in the front end and using
+    bleach for HTML rendering on the server side should cover these cases.
+    """
+    if not body:
+        return body
+
+    # This will remove the Markdown style links with data: URLs, and turn them
+    # into empty links.
+    return regex.sub(r'\]\(data:[^)]+\)', ']()', body)
--- a/lms/djangoapps/discussion/views.py
+++ b/lms/djangoapps/discussion/views.py
@@ -45,7 +45,7 @@ from lms.djangoapps.discussion.django_comment_client.utils import (
    get_group_id_for_user,
    get_group_names_by_id,
    is_commentable_divided,
-    strip_none
+    strip_none,
 )
 from lms.djangoapps.discussion.exceptions import TeamDiscussionHiddenFromUserException
 from lms.djangoapps.experiments.utils import get_experiment_user_metadata_context