chore: replace bleach with nh3

2024-05-28 13:02:16 +05:00
parent 63e940d65d
commit f7229e0aad
25 changed files with 92 additions and 62 deletions
--- a/lms/djangoapps/certificates/views/support.py
+++ b/lms/djangoapps/certificates/views/support.py
@@ -10,7 +10,7 @@ import logging
 import urllib
 from functools import wraps

-import bleach
+import nh3
 from django.db import transaction
 from django.db.models import Q
 from django.http import HttpResponse, HttpResponseBadRequest, HttpResponseForbidden, HttpResponseServerError
@@ -88,8 +88,8 @@ def search_certificates(request):
        ]

    """
-    unbleached_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", "")))
-    user_filter = bleach.clean(unbleached_filter)
+    uncleaned_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", "")))
+    user_filter = nh3.clean(uncleaned_filter)
    if not user_filter:
        msg = _("user is not given.")
        return HttpResponseBadRequest(msg)
--- a/lms/djangoapps/courseware/views/views.py
+++ b/lms/djangoapps/courseware/views/views.py
@@ -10,7 +10,7 @@ from collections import OrderedDict, namedtuple
 from datetime import datetime
 from urllib.parse import quote_plus, urlencode, urljoin, urlparse, urlunparse

-import bleach
+import nh3
 import requests
 from django.conf import settings
 from django.contrib.auth.decorators import login_required
@@ -1550,7 +1550,7 @@ def render_xblock(request, usage_key_string, check_if_enrolled=True, disable_sta
    requested_view = request.GET.get('view', 'student_view')
    if requested_view != 'student_view' and requested_view != 'public_view':  # lint-amnesty, pylint: disable=consider-using-in
        return HttpResponseBadRequest(
-            f"Rendering of the xblock view '{bleach.clean(requested_view, strip=True)}' is not supported."
+            f"Rendering of the xblock view '{nh3.clean(requested_view)}' is not supported."
        )

    staff_access = has_access(request.user, 'staff', course_key)
--- a/lms/djangoapps/discussion/django_comment_client/utils.py
+++ b/lms/djangoapps/discussion/django_comment_client/utils.py
@@ -1038,7 +1038,7 @@ def sanitize_body(body):
    This is possibly overly broad, and might tamper with legitimate posts that
    contain this code in fenced code blocks. As far as we can tell, this is an
    extra layer of protection, and current handling in the front end and using
-    bleach for HTML rendering on the server side should cover these cases.
+    nh3 to sanitize rendered HTML on the server side should cover these cases.
    """
    if not body:
        return body
--- a/lms/djangoapps/discussion/rest_api/render.py
+++ b/lms/djangoapps/discussion/rest_api/render.py
@@ -4,17 +4,16 @@ Content rendering functionality
 Note that this module is designed to imitate the front end behavior as
 implemented in Markdown.Sanitizer.js.
 """
-import bleach
+import nh3
 import markdown

-ALLOWED_TAGS = bleach.ALLOWED_TAGS | {
+ALLOWED_TAGS = nh3.ALLOWED_TAGS | {
    'br', 'dd', 'del', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'hr', 'img', 'kbd', 'p', 'pre', 's',
    'strike', 'sub', 'sup', 'table', 'thead', 'th', 'tbody', 'tr', 'td', 'tfoot'
 }
-ALLOWED_PROTOCOLS = {"http", "https", "ftp", "mailto"}
 ALLOWED_ATTRIBUTES = {
-    "a": ["href", "title", "target", "rel"],
-    "img": ["src", "alt", "title", "width", "height"],
+    "a": {"href", "title", "target", "rel"},
+    "img": {"src", "alt", "title", "width", "height"},
 }


@@ -25,17 +24,16 @@ def render_body(raw_body):
    This includes the following steps:

    * Convert Markdown to HTML
-    * Sanitise HTML using bleach
+    * Sanitise HTML using nh3

    Note that this does not prevent Markdown syntax inside a MathJax block from
    being processed, which the forums JavaScript code does.
    """
    rendered_html = markdown.markdown(raw_body)
-    sanitised_html = bleach.clean(
+    sanitised_html = nh3.clean(
        rendered_html,
        tags=ALLOWED_TAGS,
-        protocols=ALLOWED_PROTOCOLS,
-        strip=True,
-        attributes=ALLOWED_ATTRIBUTES
+        attributes=ALLOWED_ATTRIBUTES,
+        link_rel=None,
    )
    return sanitised_html
--- a/lms/djangoapps/discussion/rest_api/tests/test_render.py
+++ b/lms/djangoapps/discussion/rest_api/tests/test_render.py
@@ -84,7 +84,7 @@ class RenderBodyTest(TestCase):

    def test_script_tag(self):
        raw_body = '<script type="text/javascript">alert("evil script");</script>'
-        assert render_body(raw_body) == 'alert("evil script");'
+        assert render_body(raw_body) == ''

    @ddt.data(
        ("br", '<p>foo<br>bar</p>'),  # br is allowed inside p
--- a/lms/templates/courseware/progress_graph.js
+++ b/lms/templates/courseware/progress_graph.js
@@ -1,6 +1,6 @@
 <%page args="grade_summary, grade_cutoffs, graph_div_id, show_grade_breakdown = True, show_grade_cutoffs = True, **kwargs"/>
 <%!
-    import bleach
+    import nh3
    import json
    import math
    import six
@@ -74,7 +74,7 @@ $(function () {
      ## allowing the display of such images, and remove any previously stored HTML
      ## to prevent ugly HTML from being shown to learners.
      ## xss-lint: disable=javascript-jquery-append
-      ticks.append( [tickIndex, bleach.clean(section['label'], tags=set(), strip=True)] )
+      ticks.append( [tickIndex, nh3.clean(section['label'], tags=set())] )

      if section['category'] in detail_tooltips:
          ## xss-lint: disable=javascript-jquery-append
--- a/lms/templates/lti.html
+++ b/lms/templates/lti.html
@@ -61,7 +61,7 @@ from django.utils.translation import gettext as _
 % if has_score and comment:
    <h4 class="hd hd-4 problem-feedback-label">${_("Feedback on your work from the grader:")}</h4>
    <div class="problem-feedback">
-        ## sanitized with bleach in view
+        ## sanitized with nh3 in view
        ${comment | n, decode.utf8}
    </div>
 % endif
--- a/openedx/core/djangoapps/debug/views.py
+++ b/openedx/core/djangoapps/debug/views.py
@@ -5,7 +5,7 @@ in a 404 error.
 """


-import bleach
+import nh3
 from django.http import HttpResponseNotFound
 from django.template import TemplateDoesNotExist
 from django.utils.translation import gettext as _
@@ -54,4 +54,4 @@ def show_reference_template(request, template):

        return render_to_response(template, context)
    except TemplateDoesNotExist:
-        return HttpResponseNotFound(f'Missing template {bleach.clean(template, strip=True)}')
+        return HttpResponseNotFound(f'Missing template {nh3.clean(template)}')
--- a/openedx/core/djangoapps/user_authn/views/logout.py
+++ b/openedx/core/djangoapps/user_authn/views/logout.py
@@ -5,7 +5,7 @@ import re
 import urllib.parse as parse  # pylint: disable=import-error
 from urllib.parse import parse_qs, urlsplit, urlunsplit  # pylint: disable=import-error

-import bleach
+import nh3
 from django.conf import settings
 from django.contrib.auth import logout
 from django.shortcuts import redirect
@@ -60,7 +60,7 @@ class LogoutView(TemplateView):
        #  >> /courses/course-v1:ARTS+D1+2018_T/course/
        #  to handle this scenario we need to encode our URL using quote_plus and then unquote it again.
        if target_url:
-            target_url = bleach.clean(parse.unquote(parse.quote_plus(target_url)))
+            target_url = nh3.clean(parse.unquote(parse.quote_plus(target_url)))

        use_target_url = target_url and is_safe_login_or_logout_redirect(
            redirect_to=target_url,
--- a/openedx/core/djangoapps/user_authn/views/tests/test_logout.py
+++ b/openedx/core/djangoapps/user_authn/views/tests/test_logout.py
@@ -5,7 +5,7 @@ Tests for logout
 import urllib
 from unittest import mock
 import ddt
-import bleach
+import nh3
 from django.conf import settings
 from django.test import TestCase
 from django.test.utils import override_settings
@@ -237,6 +237,6 @@ class LogoutTests(TestCase):
        )
        response = self.client.get(url, HTTP_HOST=host)
        expected = {
-            'target': bleach.clean(urllib.parse.unquote(redirect_url)),
+            'target': nh3.clean(urllib.parse.unquote(redirect_url)),
        }
        self.assertDictContainsSubset(expected, response.context_data)
--- a/openedx/core/djangolib/markup.py
+++ b/openedx/core/djangolib/markup.py
@@ -4,7 +4,7 @@ Utilities for use in Mako markup.


 import markupsafe
-import bleach
+import nh3
 from lxml.html.clean import Cleaner
 from mako.filters import decode

@@ -53,7 +53,7 @@ def strip_all_tags_but_br(string_to_strip):
        string_to_strip = ""

    string_to_strip = decode.utf8(string_to_strip)
-    string_to_strip = bleach.clean(string_to_strip, tags={'br'}, strip=True)
+    string_to_strip = nh3.clean(string_to_strip, tags={'br'})

    return HTML(string_to_strip)

--- a/requirements/edx/base.txt
+++ b/requirements/edx/base.txt
@@ -68,7 +68,6 @@ billiard==4.2.0
    # via celery
 bleach[css]==6.1.0
    # via
-    #   -r requirements/edx/kernel.in
    #   edx-enterprise
    #   lti-consumer-xblock
    #   openedx-django-wiki
@@ -728,6 +727,8 @@ newrelic==9.9.1
    # via
    #   -r requirements/edx/bundled.in
    #   edx-django-utils
+nh3==0.2.17
+    # via -r requirements/edx/kernel.in
 nltk==3.8.1
    # via chem
 nodeenv==1.8.0
--- a/requirements/edx/development.txt
+++ b/requirements/edx/development.txt
@@ -1259,6 +1259,10 @@ newrelic==9.9.1
    #   -r requirements/edx/doc.txt
    #   -r requirements/edx/testing.txt
    #   edx-django-utils
+nh3==0.2.17
+    # via
+    #   -r requirements/edx/doc.txt
+    #   -r requirements/edx/testing.txt
 nltk==3.8.1
    # via
    #   -r requirements/edx/doc.txt
--- a/requirements/edx/doc.txt
+++ b/requirements/edx/doc.txt
@@ -854,6 +854,8 @@ newrelic==9.9.1
    # via
    #   -r requirements/edx/base.txt
    #   edx-django-utils
+nh3==0.2.17
+    # via -r requirements/edx/base.txt
 nltk==3.8.1
    # via
    #   -r requirements/edx/base.txt
--- a/requirements/edx/kernel.in
+++ b/requirements/edx/kernel.in
@@ -24,7 +24,6 @@ acid-xblock                         # This XBlock is used for unit tests as well
 analytics-python                    # Used for Segment analytics
 attrs                               # Reduces boilerplate code involving class attributes
 Babel                               # Internationalization utilities, used for date formatting in a few places
-bleach[css]                         # Allowed-list-based HTML sanitizing library that escapes or strips markup and attributes; used for capa and LTI
 boto                                # Deprecated version of the AWS SDK; we should stop using this
 boto3                               # Amazon Web Services SDK for Python
 botocore                            # via boto3, s3transfer
@@ -110,6 +109,7 @@ Markdown                            # Convert text markup to HTML; used in capa
 meilisearch                         # Library to access Meilisearch search engine (will replace ElasticSearch)
 mongoengine                         # Object-document mapper for MongoDB, used in the LMS dashboard
 mysqlclient                         # Driver for the default production relational database
+nh3                                 # Python bindings to ammonia, an allowlist-based HTML sanitizing library; used for capa and LTI
 nodeenv                             # Utility for managing Node.js environments; we use this for deployments and testing
 oauthlib                            # OAuth specification support for authenticating via LTI or other Open edX services
 olxcleaner
--- a/requirements/edx/testing.txt
+++ b/requirements/edx/testing.txt
@@ -939,6 +939,8 @@ newrelic==9.9.1
    # via
    #   -r requirements/edx/base.txt
    #   edx-django-utils
+nh3==0.2.17
+    # via -r requirements/edx/base.txt
 nltk==3.8.1
    # via
    #   -r requirements/edx/base.txt
--- a/uwsgi.ini
+++ b/uwsgi.ini
--- a/xmodule/capa/inputtypes.py
+++ b/xmodule/capa/inputtypes.py
@@ -47,7 +47,7 @@ import sys
 import time
 from datetime import datetime

-import bleach
+import nh3
 import html5lib
 import pyparsing
 import six
@@ -800,7 +800,7 @@ class CodeInput(InputTypeBase):
        if self.status == 'incomplete':
            self.status = 'queued'
            self.queue_len = self.msg  # lint-amnesty, pylint: disable=attribute-defined-outside-init
-            self.msg = bleach.clean(self.submitted_msg)
+            self.msg = nh3.clean(self.submitted_msg)

    def setup(self):
        """ setup this input type """
--- a/xmodule/capa/tests/test_inputtypes.py
+++ b/xmodule/capa/tests/test_inputtypes.py
@@ -916,7 +916,7 @@ class MatlabTest(unittest.TestCase):
        }
        elt = etree.fromstring(self.xml)
        the_input = self.input_class(test_capa_system(), elt, state)
-        expected = "&lt;script&gt;Test message&lt;/script&gt;"
+        expected = ""
        assert the_input.queue_msg == expected

    def test_matlab_sanitize_msg(self):
@@ -925,7 +925,7 @@ class MatlabTest(unittest.TestCase):
        """
        not_allowed_tag = 'script'
        self.the_input.msg = "<{0}>Test message</{0}>".format(not_allowed_tag)
-        expected = "&lt;script&gt;Test message&lt;/script&gt;"
+        expected = ""
        assert self.the_input._get_render_context()['msg'] == expected  # pylint: disable=protected-access


--- a/xmodule/capa/tests/test_util.py
+++ b/xmodule/capa/tests/test_util.py
@@ -121,7 +121,7 @@ class UtilTest(unittest.TestCase):

    def test_sanitize_html(self):
        """
-        Test for html sanitization with bleach.
+        Test for html sanitization with nh3.
        """
        allowed_tags = ['div', 'p', 'audio', 'pre', 'span']
        for tag in allowed_tags:
@@ -130,7 +130,7 @@ class UtilTest(unittest.TestCase):

        not_allowed_tag = 'script'
        queue_msg = "<{0}>Test message</{0}>".format(not_allowed_tag)
-        expected = "&lt;script&gt;Test message&lt;/script&gt;"
+        expected = ""
        assert sanitize_html(queue_msg) == expected

    def test_get_inner_html_from_xpath(self):
@@ -142,7 +142,7 @@ class UtilTest(unittest.TestCase):

    def test_remove_markup(self):
        """
-        Test for markup removal with bleach.
+        Test for markup removal with nh3.
        """
        assert remove_markup('The <mark>Truth</mark> is <em>Out There</em> & you need to <strong>find</strong> it') ==\
               'The Truth is Out There &amp; you need to find it'
--- a/xmodule/capa/util.py
+++ b/xmodule/capa/util.py
@@ -8,11 +8,10 @@ import re
 from cmath import isinf, isnan
 from decimal import Decimal

-import bleach
+import nh3
 from calc import evaluator
 from lxml import etree

-from bleach.css_sanitizer import CSSSanitizer
 from openedx.core.djangolib.markup import HTML

 #-----------------------------------------------------------------------------
@@ -182,17 +181,15 @@ def sanitize_html(html_code):

    Used to sanitize XQueue responses from Matlab.
    """
-    attributes = bleach.ALLOWED_ATTRIBUTES.copy()
+    attributes = nh3.ALLOWED_ATTRIBUTES.copy()
    attributes.update({
-        '*': ['class', 'style', 'id'],
-        'audio': ['controls', 'autobuffer', 'autoplay', 'src'],
-        'img': ['src', 'width', 'height', 'class']
+        '*': {'class', 'style', 'id'},
+        'audio': {'controls', 'autobuffer', 'autoplay', 'src'},
+        'img': {'src', 'width', 'height', 'class'}
    })
-    output = bleach.clean(
+    output = nh3.clean(
        html_code,
-        protocols=bleach.ALLOWED_PROTOCOLS | {'data'},
-        tags=bleach.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'},
-        css_sanitizer=CSSSanitizer(allowed_css_properties=["white-space"]),
+        tags=nh3.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'},
        attributes=attributes
    )
    return output
@@ -215,12 +212,12 @@ def remove_markup(html):
    """
    Return html with markup stripped and text HTML-escaped.

-    >>> bleach.clean("<b>Rock & Roll</b>", tags=set(), strip=True)
+    >>> nh3.clean("<b>Rock & Roll</b>", tags=set())
    'Rock &amp; Roll'
-    >>> bleach.clean("<b>Rock &amp; Roll</b>", tags=set(), strip=True)
+    >>> nh3.clean("<b>Rock &amp; Roll</b>", tags=set())
    'Rock &amp; Roll'
    """
-    return HTML(bleach.clean(html, tags=set(), strip=True))
+    return HTML(nh3.clean(html, tags=set()))


 def get_course_id_from_capa_block(capa_block):
--- a/xmodule/capa_block.py
+++ b/xmodule/capa_block.py
@@ -14,7 +14,7 @@ import struct
 import sys
 import traceback

-from bleach.sanitizer import Cleaner
+import nh3
 from django.conf import settings
 from django.core.exceptions import ImproperlyConfigured
 from django.utils.encoding import smart_str
@@ -619,7 +619,7 @@ class ProblemBlock(
        capa_content = re.sub(
            r"(\s|&nbsp;|//)+",
            " ",
-            Cleaner(tags=[], strip=True).clean(capa_content)
+            nh3.clean(capa_content, tags=set())
        )

        capa_body = {
--- a/xmodule/library_content_block.py
+++ b/xmodule/library_content_block.py
@@ -9,7 +9,7 @@ import random
 from copy import copy
 from gettext import ngettext, gettext

-import bleach
+import nh3
 from django.conf import settings
 from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
 from django.utils.functional import classproperty
@@ -731,7 +731,7 @@ class LibraryContentBlock(
        lib_tools = self.get_tools()
        user_perms = self.runtime.service(self, 'studio_user_permissions')
        all_libraries = [
-            (key, bleach.clean(name)) for key, name in lib_tools.list_available_libraries()
+            (key, nh3.clean(name)) for key, name in lib_tools.list_available_libraries()
            if user_perms.can_read(key) or self.source_library_id == str(key)
        ]
        all_libraries.sort(key=lambda entry: entry[1])  # Sort by name
--- a/xmodule/lti_block.py
+++ b/xmodule/lti_block.py
@@ -63,7 +63,7 @@ from xml.sax.saxutils import escape
 from unittest import mock
 from urllib import parse

-import bleach
+import nh3
 import oauthlib.oauth1
 from django.conf import settings
 from lxml import etree
@@ -458,17 +458,43 @@ class LTIBlock(
        """
        Returns a context.
        """
-        # use bleach defaults. see https://github.com/jsocol/bleach/blob/master/bleach/__init__.py
+        # Use the nh3 defaults (nh3.ALLOWED_TAGS and nh3.ALLOWED_ATTRIBUTES).
+        # ALLOWED_TAGS are
-        # ['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol',  'strong', 'ul']
+        # {
+        #   'a', 'abbr', 'acronym', 'area', 'article', 'aside', 'b', 'bdi', 'bdo',
+        #   'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+        #   'data', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em', 'figcaption',
+        #   'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup',
+        #   'hr', 'i', 'img', 'ins', 'kbd', 'li', 'map', 'mark', 'nav', 'ol', 'p', 'pre',
+        #   'q', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', 'span', 'strike',
+        #   'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'th', 'thead',
+        #   'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr'
+        # }
        #
        # ALLOWED_ATTRIBUTES are
-        #     'a': ['href', 'title'],
-        #     'abbr': ['title'],
-        #     'acronym': ['title'],
+        # {
+        #   'a': {'href', 'hreflang'},
+        #   'bdo': {'dir'},
+        #   'blockquote': {'cite'},
+        #   'col': {'charoff', 'char', 'align', 'span'},
+        #   'colgroup': {'align', 'char', 'charoff', 'span'},
+        #   'del': {'datetime', 'cite'},
+        #   'hr': {'width', 'align', 'size'},
+        #   'img': {'height', 'src', 'width', 'alt', 'align'},
+        #   'ins': {'datetime', 'cite'},
+        #   'ol': {'start'},
+        #   'q': {'cite'},
+        #   'table': {'align', 'char', 'charoff', 'summary'},
+        #   'tbody': {'align', 'char', 'charoff'},
+        #   'td': {'rowspan', 'headers', 'charoff', 'colspan', 'char', 'align'},
+        #   'tfoot': {'align', 'char', 'charoff'},
+        #   'th': {'rowspan', 'headers', 'charoff', 'colspan', 'scope', 'char', 'align'},
+        #   'thead': {'charoff', 'char', 'align'},
+        #   'tr': {'align', 'char', 'charoff'}
+        # }
        #
        # This lets all plaintext through.
-        sanitized_comment = bleach.clean(self.score_comment)
+        sanitized_comment = nh3.clean(self.score_comment)

        return {
            'input_fields': self.get_input_fields(),
--- a/xmodule/tests/test_lti20_unit.py
+++ b/xmodule/tests/test_lti20_unit.py
@@ -45,7 +45,7 @@ class LTI20RESTResultServiceTest(unittest.TestCase):

        test_cases = (  # (before sanitize, after sanitize)
            ("plaintext", "plaintext"),
-            ("a <script>alert(3)</script>", "a &lt;script&gt;alert(3)&lt;/script&gt;"),  # encodes scripts
+            ("a <script>alert(3)</script>", "a "),  # drops scripts
            ("<b>bold 包</b>", "<b>bold 包</b>"),  # unicode, and <b> tags pass through
        )
        for case in test_cases: