chore: replace bleach with nh3

This commit is contained in:
Irtaza Akram
2024-05-28 13:02:16 +05:00
parent 63e940d65d
commit f7229e0aad
25 changed files with 92 additions and 62 deletions

View File

@@ -10,7 +10,7 @@ import logging
import urllib
from functools import wraps
import bleach
import nh3
from django.db import transaction
from django.db.models import Q
from django.http import HttpResponse, HttpResponseBadRequest, HttpResponseForbidden, HttpResponseServerError
@@ -88,8 +88,8 @@ def search_certificates(request):
]
"""
unbleached_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", "")))
user_filter = bleach.clean(unbleached_filter)
uncleaned_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", "")))
user_filter = nh3.clean(uncleaned_filter)
if not user_filter:
msg = _("user is not given.")
return HttpResponseBadRequest(msg)

View File

@@ -10,7 +10,7 @@ from collections import OrderedDict, namedtuple
from datetime import datetime
from urllib.parse import quote_plus, urlencode, urljoin, urlparse, urlunparse
import bleach
import nh3
import requests
from django.conf import settings
from django.contrib.auth.decorators import login_required
@@ -1550,7 +1550,7 @@ def render_xblock(request, usage_key_string, check_if_enrolled=True, disable_sta
requested_view = request.GET.get('view', 'student_view')
if requested_view != 'student_view' and requested_view != 'public_view': # lint-amnesty, pylint: disable=consider-using-in
return HttpResponseBadRequest(
f"Rendering of the xblock view '{bleach.clean(requested_view, strip=True)}' is not supported."
f"Rendering of the xblock view '{nh3.clean(requested_view)}' is not supported."
)
staff_access = has_access(request.user, 'staff', course_key)

View File

@@ -1038,7 +1038,7 @@ def sanitize_body(body):
This is possibly overly broad, and might tamper with legitimate posts that
contain this code in fenced code blocks. As far as we can tell, this is an
extra layer of protection, and current handling in the front end and using
bleach for HTML rendering on the server side should cover these cases.
nh3 for HTML rendering on the server side should cover these cases.
"""
if not body:
return body

View File

@@ -4,17 +4,16 @@ Content rendering functionality
Note that this module is designed to imitate the front end behavior as
implemented in Markdown.Sanitizer.js.
"""
import bleach
import nh3
import markdown
ALLOWED_TAGS = bleach.ALLOWED_TAGS | {
ALLOWED_TAGS = nh3.ALLOWED_TAGS | {
'br', 'dd', 'del', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'hr', 'img', 'kbd', 'p', 'pre', 's',
'strike', 'sub', 'sup', 'table', 'thead', 'th', 'tbody', 'tr', 'td', 'tfoot'
}
ALLOWED_PROTOCOLS = {"http", "https", "ftp", "mailto"}
ALLOWED_ATTRIBUTES = {
"a": ["href", "title", "target", "rel"],
"img": ["src", "alt", "title", "width", "height"],
"a": {"href", "title", "target", "rel"},
"img": {"src", "alt", "title", "width", "height"},
}
@@ -25,17 +24,16 @@ def render_body(raw_body):
This includes the following steps:
* Convert Markdown to HTML
* Sanitise HTML using bleach
* Sanitise HTML using nh3
Note that this does not prevent Markdown syntax inside a MathJax block from
being processed, which the forums JavaScript code does.
"""
rendered_html = markdown.markdown(raw_body)
sanitised_html = bleach.clean(
sanitised_html = nh3.clean(
rendered_html,
tags=ALLOWED_TAGS,
protocols=ALLOWED_PROTOCOLS,
strip=True,
attributes=ALLOWED_ATTRIBUTES
attributes=ALLOWED_ATTRIBUTES,
link_rel=None,
)
return sanitised_html

View File

@@ -84,7 +84,7 @@ class RenderBodyTest(TestCase):
def test_script_tag(self):
raw_body = '<script type="text/javascript">alert("evil script");</script>'
assert render_body(raw_body) == 'alert("evil script");'
assert render_body(raw_body) == ''
@ddt.data(
("br", '<p>foo<br>bar</p>'), # br is allowed inside p

View File

@@ -1,6 +1,6 @@
<%page args="grade_summary, grade_cutoffs, graph_div_id, show_grade_breakdown = True, show_grade_cutoffs = True, **kwargs"/>
<%!
import bleach
import nh3
import json
import math
import six
@@ -74,7 +74,7 @@ $(function () {
## allowing the display of such images, and remove any previously stored HTML
## to prevent ugly HTML from being shown to learners.
## xss-lint: disable=javascript-jquery-append
ticks.append( [tickIndex, bleach.clean(section['label'], tags=set(), strip=True)] )
ticks.append( [tickIndex, nh3.clean(section['label'], tags=set())] )
if section['category'] in detail_tooltips:
## xss-lint: disable=javascript-jquery-append

View File

@@ -61,7 +61,7 @@ from django.utils.translation import gettext as _
% if has_score and comment:
<h4 class="hd hd-4 problem-feedback-label">${_("Feedback on your work from the grader:")}</h4>
<div class="problem-feedback">
## sanitized with bleach in view
## sanitized with nh3 in view
${comment | n, decode.utf8}
</div>
% endif

View File

@@ -5,7 +5,7 @@ in a 404 error.
"""
import bleach
import nh3
from django.http import HttpResponseNotFound
from django.template import TemplateDoesNotExist
from django.utils.translation import gettext as _
@@ -54,4 +54,4 @@ def show_reference_template(request, template):
return render_to_response(template, context)
except TemplateDoesNotExist:
return HttpResponseNotFound(f'Missing template {bleach.clean(template, strip=True)}')
return HttpResponseNotFound(f'Missing template {nh3.clean(template)}')

View File

@@ -5,7 +5,7 @@ import re
import urllib.parse as parse # pylint: disable=import-error
from urllib.parse import parse_qs, urlsplit, urlunsplit # pylint: disable=import-error
import bleach
import nh3
from django.conf import settings
from django.contrib.auth import logout
from django.shortcuts import redirect
@@ -60,7 +60,7 @@ class LogoutView(TemplateView):
# >> /courses/course-v1:ARTS+D1+2018_T/course/
# to handle this scenario we need to encode our URL using quote_plus and then unquote it again.
if target_url:
target_url = bleach.clean(parse.unquote(parse.quote_plus(target_url)))
target_url = nh3.clean(parse.unquote(parse.quote_plus(target_url)))
use_target_url = target_url and is_safe_login_or_logout_redirect(
redirect_to=target_url,

View File

@@ -5,7 +5,7 @@ Tests for logout
import urllib
from unittest import mock
import ddt
import bleach
import nh3
from django.conf import settings
from django.test import TestCase
from django.test.utils import override_settings
@@ -237,6 +237,6 @@ class LogoutTests(TestCase):
)
response = self.client.get(url, HTTP_HOST=host)
expected = {
'target': bleach.clean(urllib.parse.unquote(redirect_url)),
'target': nh3.clean(urllib.parse.unquote(redirect_url)),
}
self.assertDictContainsSubset(expected, response.context_data)

View File

@@ -4,7 +4,7 @@ Utilities for use in Mako markup.
import markupsafe
import bleach
import nh3
from lxml.html.clean import Cleaner
from mako.filters import decode
@@ -53,7 +53,7 @@ def strip_all_tags_but_br(string_to_strip):
string_to_strip = ""
string_to_strip = decode.utf8(string_to_strip)
string_to_strip = bleach.clean(string_to_strip, tags={'br'}, strip=True)
string_to_strip = nh3.clean(string_to_strip, tags={'br'})
return HTML(string_to_strip)

View File

@@ -68,7 +68,6 @@ billiard==4.2.0
# via celery
bleach[css]==6.1.0
# via
# -r requirements/edx/kernel.in
# edx-enterprise
# lti-consumer-xblock
# openedx-django-wiki
@@ -728,6 +727,8 @@ newrelic==9.9.1
# via
# -r requirements/edx/bundled.in
# edx-django-utils
nh3==0.2.17
# via -r requirements/edx/kernel.in
nltk==3.8.1
# via chem
nodeenv==1.8.0

View File

@@ -1259,6 +1259,10 @@ newrelic==9.9.1
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
# edx-django-utils
nh3==0.2.17
# via
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
nltk==3.8.1
# via
# -r requirements/edx/doc.txt

View File

@@ -854,6 +854,8 @@ newrelic==9.9.1
# via
# -r requirements/edx/base.txt
# edx-django-utils
nh3==0.2.17
# via -r requirements/edx/base.txt
nltk==3.8.1
# via
# -r requirements/edx/base.txt

View File

@@ -24,7 +24,6 @@ acid-xblock # This XBlock is used for unit tests as well
analytics-python # Used for Segment analytics
attrs # Reduces boilerplate code involving class attributes
Babel # Internationalization utilities, used for date formatting in a few places
bleach[css] # Allowed-list-based HTML sanitizing library that escapes or strips markup and attributes; used for capa and LTI
boto # Deprecated version of the AWS SDK; we should stop using this
boto3 # Amazon Web Services SDK for Python
botocore # via boto3, s3transfer
@@ -110,6 +109,7 @@ Markdown # Convert text markup to HTML; used in capa
meilisearch # Library to access Meilisearch search engine (will replace ElasticSearch)
mongoengine # Object-document mapper for MongoDB, used in the LMS dashboard
mysqlclient # Driver for the default production relational database
nh3 # Python bindings to ammonia, an allowed-list-based HTML sanitizing library; used for capa and LTI
nodeenv # Utility for managing Node.js environments; we use this for deployments and testing
oauthlib # OAuth specification support for authenticating via LTI or other Open edX services
olxcleaner

View File

@@ -939,6 +939,8 @@ newrelic==9.9.1
# via
# -r requirements/edx/base.txt
# edx-django-utils
nh3==0.2.17
# via -r requirements/edx/base.txt
nltk==3.8.1
# via
# -r requirements/edx/base.txt

0
uwsgi.ini Executable file
View File

View File

@@ -47,7 +47,7 @@ import sys
import time
from datetime import datetime
import bleach
import nh3
import html5lib
import pyparsing
import six
@@ -800,7 +800,7 @@ class CodeInput(InputTypeBase):
if self.status == 'incomplete':
self.status = 'queued'
self.queue_len = self.msg # lint-amnesty, pylint: disable=attribute-defined-outside-init
self.msg = bleach.clean(self.submitted_msg)
self.msg = nh3.clean(self.submitted_msg)
def setup(self):
""" setup this input type """

View File

@@ -916,7 +916,7 @@ class MatlabTest(unittest.TestCase):
}
elt = etree.fromstring(self.xml)
the_input = self.input_class(test_capa_system(), elt, state)
expected = "&lt;script&gt;Test message&lt;/script&gt;"
expected = ""
assert the_input.queue_msg == expected
def test_matlab_sanitize_msg(self):
@@ -925,7 +925,7 @@ class MatlabTest(unittest.TestCase):
"""
not_allowed_tag = 'script'
self.the_input.msg = "<{0}>Test message</{0}>".format(not_allowed_tag)
expected = "&lt;script&gt;Test message&lt;/script&gt;"
expected = ""
assert self.the_input._get_render_context()['msg'] == expected # pylint: disable=protected-access

View File

@@ -121,7 +121,7 @@ class UtilTest(unittest.TestCase):
def test_sanitize_html(self):
"""
Test for html sanitization with bleach.
Test for html sanitization with nh3.
"""
allowed_tags = ['div', 'p', 'audio', 'pre', 'span']
for tag in allowed_tags:
@@ -130,7 +130,7 @@ class UtilTest(unittest.TestCase):
not_allowed_tag = 'script'
queue_msg = "<{0}>Test message</{0}>".format(not_allowed_tag)
expected = "&lt;script&gt;Test message&lt;/script&gt;"
expected = ""
assert sanitize_html(queue_msg) == expected
def test_get_inner_html_from_xpath(self):
@@ -142,7 +142,7 @@ class UtilTest(unittest.TestCase):
def test_remove_markup(self):
"""
Test for markup removal with bleach.
Test for markup removal with nh3.
"""
assert remove_markup('The <mark>Truth</mark> is <em>Out There</em> & you need to <strong>find</strong> it') ==\
'The Truth is Out There &amp; you need to find it'

View File

@@ -8,11 +8,10 @@ import re
from cmath import isinf, isnan
from decimal import Decimal
import bleach
import nh3
from calc import evaluator
from lxml import etree
from bleach.css_sanitizer import CSSSanitizer
from openedx.core.djangolib.markup import HTML
#-----------------------------------------------------------------------------
@@ -182,17 +181,15 @@ def sanitize_html(html_code):
Used to sanitize XQueue responses from Matlab.
"""
attributes = bleach.ALLOWED_ATTRIBUTES.copy()
attributes = nh3.ALLOWED_ATTRIBUTES.copy()
attributes.update({
'*': ['class', 'style', 'id'],
'audio': ['controls', 'autobuffer', 'autoplay', 'src'],
'img': ['src', 'width', 'height', 'class']
'*': {'class', 'style', 'id'},
'audio': {'controls', 'autobuffer', 'autoplay', 'src'},
'img': {'src', 'width', 'height', 'class'}
})
output = bleach.clean(
output = nh3.clean(
html_code,
protocols=bleach.ALLOWED_PROTOCOLS | {'data'},
tags=bleach.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'},
css_sanitizer=CSSSanitizer(allowed_css_properties=["white-space"]),
tags=nh3.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'},
attributes=attributes
)
return output
@@ -215,12 +212,12 @@ def remove_markup(html):
"""
Return html with markup stripped and text HTML-escaped.
>>> bleach.clean("<b>Rock & Roll</b>", tags=set(), strip=True)
>>> nh3.clean("<b>Rock & Roll</b>", tags=set())
'Rock &amp; Roll'
>>> bleach.clean("<b>Rock &amp; Roll</b>", tags=set(), strip=True)
>>> nh3.clean("<b>Rock &amp; Roll</b>", tags=set())
'Rock &amp; Roll'
"""
return HTML(bleach.clean(html, tags=set(), strip=True))
return HTML(nh3.clean(html, tags=set()))
def get_course_id_from_capa_block(capa_block):

View File

@@ -14,7 +14,7 @@ import struct
import sys
import traceback
from bleach.sanitizer import Cleaner
import nh3
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils.encoding import smart_str
@@ -619,7 +619,7 @@ class ProblemBlock(
capa_content = re.sub(
r"(\s|&nbsp;|//)+",
" ",
Cleaner(tags=[], strip=True).clean(capa_content)
nh3.clean(capa_content, tags=set())
)
capa_body = {

View File

@@ -9,7 +9,7 @@ import random
from copy import copy
from gettext import ngettext, gettext
import bleach
import nh3
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
from django.utils.functional import classproperty
@@ -731,7 +731,7 @@ class LibraryContentBlock(
lib_tools = self.get_tools()
user_perms = self.runtime.service(self, 'studio_user_permissions')
all_libraries = [
(key, bleach.clean(name)) for key, name in lib_tools.list_available_libraries()
(key, nh3.clean(name)) for key, name in lib_tools.list_available_libraries()
if user_perms.can_read(key) or self.source_library_id == str(key)
]
all_libraries.sort(key=lambda entry: entry[1]) # Sort by name

View File

@@ -63,7 +63,7 @@ from xml.sax.saxutils import escape
from unittest import mock
from urllib import parse
import bleach
import nh3
import oauthlib.oauth1
from django.conf import settings
from lxml import etree
@@ -458,17 +458,43 @@ class LTIBlock(
"""
Returns a context.
"""
# use bleach defaults. see https://github.com/jsocol/bleach/blob/master/bleach/__init__.py
# nh3 defaults for
# ALLOWED_TAGS are
# ['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol', 'strong', 'ul']
# {
# 'a', 'abbr', 'acronym', 'area', 'article', 'aside', 'b', 'bdi', 'bdo',
# 'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
# 'data', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em', 'figcaption',
# 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup',
# 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'map', 'mark', 'nav', 'ol', 'p', 'pre',
# 'q', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', 'span', 'strike',
# 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'th', 'thead',
# 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr'
# }
#
# ALLOWED_ATTRIBUTES are
# 'a': ['href', 'title'],
# 'abbr': ['title'],
# 'acronym': ['title'],
# {
# 'a': {'href', 'hreflang'},
# 'bdo': {'dir'},
# 'blockquote': {'cite'},
# 'col': {'charoff', 'char', 'align', 'span'},
# 'colgroup': {'align', 'char', 'charoff', 'span'},
# 'del': {'datetime', 'cite'},
# 'hr': {'width', 'align', 'size'},
# 'img': {'height', 'src', 'width', 'alt', 'align'},
# 'ins': {'datetime', 'cite'},
# 'ol': {'start'},
# 'q': {'cite'},
# 'table': {'align', 'char', 'charoff', 'summary'},
# 'tbody': {'align', 'char', 'charoff'},
# 'td': {'rowspan', 'headers', 'charoff', 'colspan', 'char', 'align'},
# 'tfoot': {'align', 'char', 'charoff'},
# 'th': {'rowspan', 'headers', 'charoff', 'colspan', 'scope', 'char', 'align'},
# 'thead': {'charoff', 'char', 'align'},
# 'tr': {'align', 'char', 'charoff'}
# }
#
# This lets all plaintext through.
sanitized_comment = bleach.clean(self.score_comment)
sanitized_comment = nh3.clean(self.score_comment)
return {
'input_fields': self.get_input_fields(),

View File

@@ -45,7 +45,7 @@ class LTI20RESTResultServiceTest(unittest.TestCase):
test_cases = ( # (before sanitize, after sanitize)
("plaintext", "plaintext"),
("a <script>alert(3)</script>", "a &lt;script&gt;alert(3)&lt;/script&gt;"), # encodes scripts
("a <script>alert(3)</script>", "a "), # drops scripts
("<b>bold 包</b>", "<b>bold 包</b>"), # unicode, and <b> tags pass through
)
for case in test_cases: