From f8c5cecefcb816c90d944149082c536705ba6513 Mon Sep 17 00:00:00 2001
From: Muhammad Adeel Tajamul <77053848+muhammadadeeltajamul@users.noreply.github.com>
Date: Fri, 11 Jul 2025 14:10:33 +0500
Subject: [PATCH] feat: added feature to remove spam urls from discussion content (#37007)

---
 .../discussion/rest_api/serializers.py        | 63 +++++++++++++++++++
 .../rest_api/tests/test_serializers.py        | 40 +++++++++++-
 lms/envs/common.py                            |  5 ++
 lms/envs/devstack.py                          |  2 +
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/lms/djangoapps/discussion/rest_api/serializers.py b/lms/djangoapps/discussion/rest_api/serializers.py
index ff0c656baf..b6587c0f36 100644
--- a/lms/djangoapps/discussion/rest_api/serializers.py
+++ b/lms/djangoapps/discussion/rest_api/serializers.py
@@ -1,6 +1,9 @@
 """
 Discussion API serializers
 """
+import re
+
+from bs4 import BeautifulSoup
 from typing import Dict
 from urllib.parse import urlencode, urlunparse
 
@@ -137,6 +140,65 @@ def _validate_privileged_access(context: Dict) -> bool:
     return course and is_requester_privileged
 
 
+def filter_spam_urls_from_html(html_string):
+    """
+    Remove URLs of configured spam domains from rendered discussion HTML.
+
+    For each domain in ``settings.DISCUSSION_SPAM_URLS``:
+
+    * ``<a>`` tags whose ``href`` matches the domain are replaced by their
+      stripped text content.
+    * Matches inside text nodes are deleted. A match is the domain with an
+      optional scheme, subdomains and path, or the domain written with
+      whitespace between its characters (``e x a m p l e . c o m``).
+
+    The input is deliberately not passed through ``html.unescape`` before
+    parsing: that would turn escaped markup such as ``&lt;script&gt;`` into
+    live tags in the returned HTML. The parser already decodes entities in
+    text nodes, so entity-obfuscated URLs are still matched.
+
+    Arguments:
+        html_string (str): rendered HTML of a thread or comment body.
+
+    Returns:
+        str: the filtered HTML, or ``html_string`` unchanged when no spam
+        domains are configured.
+    """
+    spam_domains = settings.DISCUSSION_SPAM_URLS
+    if not spam_domains:
+        # Nothing to filter; skip the parse/re-serialise round trip.
+        return html_string
+
+    # Separators (whitespace, literal "&nbsp;", NBSP) used to space out a URL.
+    gap = r"(?:\s|&nbsp;|\u00A0)*"
+    patterns = []
+    for domain in spam_domains:
+        escaped = re.escape(domain)
+        domain_pattern = rf"(\w+\.)*{escaped}(?:/\S*)*"
+        patterns.append(re.compile(rf"(https?://)?{domain_pattern}", re.IGNORECASE))
+        spaced_pattern = "".join(rf"{re.escape(char)}{gap}" for char in domain)
+        spaced_pattern += r"(?:\/(?:\s|&nbsp;|\u00A0|\w)*)*"
+        patterns.append(re.compile(spaced_pattern, re.IGNORECASE))
+
+    soup = BeautifulSoup(html_string, "html.parser")
+
+    # Unwrap links that point at a spam domain, keeping only their text.
+    for a_tag in soup.find_all("a", href=True):
+        href = a_tag.get('href')
+        if href and any(p.search(href) for p in patterns):
+            a_tag.replace_with(a_tag.get_text(strip=True))
+
+    # Delete spam URLs written as plain text.
+    for text_node in soup.find_all(string=True):
+        new_text = text_node
+        for p in patterns:
+            new_text = p.sub('', new_text)
+        if new_text != text_node:
+            text_node.replace_with(new_text.strip())
+
+    return str(soup)
+
+
 class _ContentSerializer(serializers.Serializer):
     # pylint: disable=abstract-method
     """
@@ -244,6 +306,7 @@ class _ContentSerializer(serializers.Serializer):
         """
         if self._rendered_body is None:
             self._rendered_body = render_body(obj["body"])
+            self._rendered_body = filter_spam_urls_from_html(self._rendered_body)
         return self._rendered_body
 
     def get_abuse_flagged(self, obj):
diff --git a/lms/djangoapps/discussion/rest_api/tests/test_serializers.py b/lms/djangoapps/discussion/rest_api/tests/test_serializers.py
index 73b195e02f..f7aa2d83ad 100644
--- a/lms/djangoapps/discussion/rest_api/tests/test_serializers.py
+++ b/lms/djangoapps/discussion/rest_api/tests/test_serializers.py
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
 import ddt
 import httpretty
 from django.test.client import RequestFactory
+from django.test.utils import override_settings
 from xmodule.modulestore import ModuleStoreEnum
 from xmodule.modulestore.django import modulestore
 from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase
@@ -17,7 +18,12 @@ from xmodule.modulestore.tests.factories import CourseFactory
 from common.djangoapps.student.tests.factories import UserFactory
 from common.djangoapps.util.testing import UrlResetMixin
 from lms.djangoapps.discussion.django_comment_client.tests.utils import ForumsEnableMixin
-from lms.djangoapps.discussion.rest_api.serializers import CommentSerializer, ThreadSerializer, get_context
+from lms.djangoapps.discussion.rest_api.serializers import (
+    CommentSerializer,
+    ThreadSerializer,
+    filter_spam_urls_from_html,
+    get_context
+)
 from lms.djangoapps.discussion.rest_api.tests.utils import (
     CommentsServiceMockMixin,
     make_minimal_cs_comment,
@@ -1108,3 +1114,35 @@ class CommentSerializerDeserializationTest(ForumsEnableMixin, CommentsServiceMoc
         )
         assert not serializer.is_valid()
         assert serializer.errors == {field: ['This field is not allowed in an update.']}
+
+
+class FilterSpamTest(SharedModuleStoreTestCase):
+    """
+    Tests for filter_spam_urls_from_html
+    """
+    @override_settings(DISCUSSION_SPAM_URLS=['example.com'])
+    def test_filter(self):
+        self.assertEqual(
+            filter_spam_urls_from_html('<div><a href="example.com/abc/def">abc</a></div>'),
+            '<div>abc</div>'
+        )
+        self.assertEqual(
+            filter_spam_urls_from_html('<div>example.com/abc/def</div>'),
+            '<div></div>'
+        )
+        self.assertEqual(
+            filter_spam_urls_from_html('<div>e x a m p l e . c o m / a b c / d e f</div>'),
+            '<div></div>'
+        )
+
+    @override_settings(DISCUSSION_SPAM_URLS=['example.com'])
+    def test_escaped_markup_stays_escaped(self):
+        self.assertEqual(
+            filter_spam_urls_from_html('<div>&lt;b&gt;example.com&lt;/b&gt;</div>'),
+            '<div>&lt;b&gt;&lt;/b&gt;</div>'
+        )
+
+    @override_settings(DISCUSSION_SPAM_URLS=[])
+    def test_no_spam_urls_configured(self):
+        content = '<div>&lt;b&gt;example.com&lt;/b&gt;</div>'
+        self.assertEqual(filter_spam_urls_from_html(content), content)
diff --git a/lms/envs/common.py b/lms/envs/common.py
index 5cea0f583c..b4f6c8b43f 100644
--- a/lms/envs/common.py
+++ b/lms/envs/common.py
@@ -5084,6 +5084,11 @@ DISCUSSIONS_MFE_FEEDBACK_URL = None
 # .. setting_description: Base URL of the exams dashboard micro-frontend for instructors.
 EXAMS_DASHBOARD_MICROFRONTEND_URL = None
 
+# .. setting_name: DISCUSSION_SPAM_URLS
+# .. setting_default: []
+# .. setting_description: Domains (e.g. "example.com") whose URLs are stripped from rendered discussion content.
+DISCUSSION_SPAM_URLS = []
+
 # .. toggle_name: ENABLE_AUTHN_RESET_PASSWORD_HIBP_POLICY
 # .. toggle_implementation: DjangoSetting
 # .. toggle_default: False
diff --git a/lms/envs/devstack.py b/lms/envs/devstack.py
index e024e81edf..4785bdb042 100644
--- a/lms/envs/devstack.py
+++ b/lms/envs/devstack.py
@@ -394,6 +394,8 @@ DISCUSSIONS_MICROFRONTEND_URL = 'http://localhost:2002'
 ################### FRONTEND APPLICATION DISCUSSIONS FEEDBACK URL###################
 DISCUSSIONS_MFE_FEEDBACK_URL = None
 
+DISCUSSION_SPAM_URLS = []
+
 ############## Docker based devstack settings #######################
 
 FEATURES.update({