feat: added feature to remove spam urls from discussion content (#37007)
This commit is contained in:
committed by
GitHub
parent
84f82477ed
commit
f8c5cecefc
@@ -1,6 +1,10 @@
|
||||
"""
|
||||
Discussion API serializers
|
||||
"""
|
||||
import html
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict
|
||||
from urllib.parse import urlencode, urlunparse
|
||||
|
||||
@@ -137,6 +141,41 @@ def _validate_privileged_access(context: Dict) -> bool:
|
||||
return course and is_requester_privileged
|
||||
|
||||
|
||||
def filter_spam_urls_from_html(html_string):
    """
    Strip URLs that belong to configured spam domains from an HTML fragment.

    Every non-blank entry of ``settings.DISCUSSION_SPAM_URLS`` is treated as a
    literal domain name. Two case-insensitive patterns are built per domain:

    * a contiguous form: optional ``http://`` / ``https://``, optional
      sub-domains, the domain itself and any trailing ``/path``;
    * a spaced-out form, in which the characters of the domain (and of the
      path) may be separated by whitespace, non-breaking spaces included.

    ``<a>`` tags whose ``href`` matches a pattern are replaced by their stripped
    text. Matching substrings are deleted from every text node, and a text node
    that was modified is additionally stripped of surrounding whitespace.

    Arguments:
        html_string (str): HTML fragment to clean.

    Returns:
        str: the cleaned fragment as re-serialized by BeautifulSoup, or
        ``html_string`` unchanged when it is empty or no spam domain is
        configured.
    """
    # Blank entries would compile to patterns that match almost anything.
    spam_domains = [
        domain for domain in settings.DISCUSSION_SPAM_URLS
        if domain and domain.strip()
    ]
    if not html_string or not spam_domains:
        # Nothing to filter: do not round-trip the markup through the parser.
        return html_string

    patterns = []
    for domain in spam_domains:
        escaped_domain = re.escape(domain)
        patterns.append(
            re.compile(rf"(?:https?://)?(?:\w+\.)*{escaped_domain}(?:/\S*)*", re.IGNORECASE)
        )
        # ``\s`` already matches both plain and non-breaking spaces.
        spaced_domain = "".join(rf"{re.escape(char)}\s*" for char in domain)
        patterns.append(re.compile(spaced_domain + r"(?:/[\s\w]*)*", re.IGNORECASE))

    # SECURITY: the markup is deliberately NOT passed through html.unescape()
    # before parsing. Doing so would turn escaped user text such as
    # ``&lt;script&gt;`` into live tags in the returned HTML. The parser decodes
    # entities inside text nodes and attribute values on its own, so matching
    # still sees the decoded characters (e.g. ``&nbsp;`` as U+00A0).
    soup = BeautifulSoup(html_string, "html.parser")

    for a_tag in soup.find_all("a", href=True):
        href = a_tag.get("href")
        if href and any(pattern.search(href) for pattern in patterns):
            a_tag.replace_with(a_tag.get_text(strip=True))

    for text_node in soup.find_all(string=True):
        original_text = str(text_node)
        cleaned_text = original_text
        for pattern in patterns:
            cleaned_text = pattern.sub("", cleaned_text)
        if cleaned_text != original_text:
            text_node.replace_with(cleaned_text.strip())

    return str(soup)
|
||||
|
||||
|
||||
class _ContentSerializer(serializers.Serializer):
|
||||
# pylint: disable=abstract-method
|
||||
"""
|
||||
@@ -244,6 +283,7 @@ class _ContentSerializer(serializers.Serializer):
|
||||
"""
|
||||
if self._rendered_body is None:
|
||||
self._rendered_body = render_body(obj["body"])
|
||||
self._rendered_body = filter_spam_urls_from_html(self._rendered_body)
|
||||
return self._rendered_body
|
||||
|
||||
def get_abuse_flagged(self, obj):
|
||||
|
||||
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
|
||||
import ddt
|
||||
import httpretty
|
||||
from django.test.client import RequestFactory
|
||||
from django.test.utils import override_settings
|
||||
from xmodule.modulestore import ModuleStoreEnum
|
||||
from xmodule.modulestore.django import modulestore
|
||||
from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase
|
||||
@@ -17,7 +18,12 @@ from xmodule.modulestore.tests.factories import CourseFactory
|
||||
from common.djangoapps.student.tests.factories import UserFactory
|
||||
from common.djangoapps.util.testing import UrlResetMixin
|
||||
from lms.djangoapps.discussion.django_comment_client.tests.utils import ForumsEnableMixin
|
||||
from lms.djangoapps.discussion.rest_api.serializers import CommentSerializer, ThreadSerializer, get_context
|
||||
from lms.djangoapps.discussion.rest_api.serializers import (
|
||||
CommentSerializer,
|
||||
ThreadSerializer,
|
||||
filter_spam_urls_from_html,
|
||||
get_context
|
||||
)
|
||||
from lms.djangoapps.discussion.rest_api.tests.utils import (
|
||||
CommentsServiceMockMixin,
|
||||
make_minimal_cs_comment,
|
||||
@@ -1108,3 +1114,23 @@ class CommentSerializerDeserializationTest(ForumsEnableMixin, CommentsServiceMoc
|
||||
)
|
||||
assert not serializer.is_valid()
|
||||
assert serializer.errors == {field: ['This field is not allowed in an update.']}
|
||||
|
||||
|
||||
class FilterSpamTest(SharedModuleStoreTestCase):
    """
    Exercises filter_spam_urls_from_html against a single configured spam domain.
    """
    @override_settings(DISCUSSION_SPAM_URLS=['example.com'])
    def test_filter(self):
        # (input markup, expected output) pairs: a spam link, a bare spam URL
        # and a spam URL whose characters are separated by spaces.
        cases = (
            ('<div><a href="example.com/abc/def">abc</a></div>', '<div>abc</div>'),
            ('<div>example.com/abc/def</div>', '<div></div>'),
            ('<div>e x a m p l e . c o m / a b c / d e f</div>', '<div></div>'),
        )
        for markup, expected in cases:
            self.assertEqual(filter_spam_urls_from_html(markup), expected)
|
||||
|
||||
@@ -5084,6 +5084,11 @@ DISCUSSIONS_MFE_FEEDBACK_URL = None
|
||||
# .. setting_description: Base URL of the exams dashboard micro-frontend for instructors.
|
||||
EXAMS_DASHBOARD_MICROFRONTEND_URL = None
|
||||
|
||||
# .. setting_name: DISCUSSION_SPAM_URLS
|
||||
# .. setting_default: []
|
||||
# .. setting_description: Domains (e.g. 'example.com') whose URLs are removed from discussion content to curb spam.
|
||||
DISCUSSION_SPAM_URLS = []
|
||||
|
||||
# .. toggle_name: ENABLE_AUTHN_RESET_PASSWORD_HIBP_POLICY
|
||||
# .. toggle_implementation: DjangoSetting
|
||||
# .. toggle_default: False
|
||||
|
||||
@@ -394,6 +394,8 @@ DISCUSSIONS_MICROFRONTEND_URL = 'http://localhost:2002'
|
||||
################### FRONTEND APPLICATION DISCUSSIONS FEEDBACK URL###################
|
||||
DISCUSSIONS_MFE_FEEDBACK_URL = None
|
||||
|
||||
DISCUSSION_SPAM_URLS = []
|
||||
|
||||
############## Docker based devstack settings #######################
|
||||
|
||||
FEATURES.update({
|
||||
|
||||
Reference in New Issue
Block a user