From f8c5cecefcb816c90d944149082c536705ba6513 Mon Sep 17 00:00:00 2001 From: Muhammad Adeel Tajamul <77053848+muhammadadeeltajamul@users.noreply.github.com> Date: Fri, 11 Jul 2025 14:10:33 +0500 Subject: [PATCH] feat: added feature to remove spam urls from discussion content (#37007) --- .../discussion/rest_api/serializers.py | 40 +++++++++++++++++++ .../rest_api/tests/test_serializers.py | 28 ++++++++++++- lms/envs/common.py | 5 +++ lms/envs/devstack.py | 2 + 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/lms/djangoapps/discussion/rest_api/serializers.py b/lms/djangoapps/discussion/rest_api/serializers.py index ff0c656baf..b6587c0f36 100644 --- a/lms/djangoapps/discussion/rest_api/serializers.py +++ b/lms/djangoapps/discussion/rest_api/serializers.py @@ -1,6 +1,10 @@ """ Discussion API serializers """ +import html +import re + +from bs4 import BeautifulSoup from typing import Dict from urllib.parse import urlencode, urlunparse @@ -137,6 +141,41 @@ def _validate_privileged_access(context: Dict) -> bool: return course and is_requester_privileged +def filter_spam_urls_from_html(html_string): + """ + Filters out spam posts from html + """ + html_string = html.unescape(html_string) + soup = BeautifulSoup(html_string, "html.parser") + patterns = [] + for domain in settings.DISCUSSION_SPAM_URLS: + escaped = domain.replace(".", r"\.") + domain_pattern = rf"(\w+\.)*{escaped}(?:/\S*)*" + patterns.append(re.compile(rf"(https?://)?{domain_pattern}", re.IGNORECASE)) + spaced_parts = list(domain) + spaced_pattern = "".join( + rf"{re.escape(char)}(?:\s| |\u00A0)*" if char != "." else r"\.(?:\s| |\u00A0)*" + for char in spaced_parts + ) + spaced_pattern += r"(?:\/(?:\s| |\u00A0|\w)*)*" + patterns.append(re.compile(spaced_pattern, re.IGNORECASE)) + + for a_tag in soup.find_all("a", href=True): + href = a_tag.get('href') + if href: + if any(p.search(href) for p in patterns): + a_tag.replace_with(a_tag.get_text(strip=True)) + + for text_node in soup.find_all(string=True): + new_text = text_node + for p in patterns: + new_text = p.sub('', new_text) + if new_text != text_node: + text_node.replace_with(new_text.strip()) + + return str(soup) + + class _ContentSerializer(serializers.Serializer): # pylint: disable=abstract-method """ @@ -244,6 +283,7 @@ class _ContentSerializer(serializers.Serializer): """ if self._rendered_body is None: self._rendered_body = render_body(obj["body"]) + self._rendered_body = filter_spam_urls_from_html(self._rendered_body) return self._rendered_body def get_abuse_flagged(self, obj): diff --git a/lms/djangoapps/discussion/rest_api/tests/test_serializers.py b/lms/djangoapps/discussion/rest_api/tests/test_serializers.py index 73b195e02f..f7aa2d83ad 100644 --- a/lms/djangoapps/discussion/rest_api/tests/test_serializers.py +++ b/lms/djangoapps/discussion/rest_api/tests/test_serializers.py @@ -9,6 +9,7 @@ from urllib.parse import urlparse import ddt import httpretty from django.test.client import RequestFactory +from django.test.utils import override_settings from xmodule.modulestore import ModuleStoreEnum from xmodule.modulestore.django import modulestore from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase @@ -17,7 +18,12 @@ from xmodule.modulestore.tests.factories import CourseFactory from common.djangoapps.student.tests.factories import UserFactory from common.djangoapps.util.testing import UrlResetMixin from lms.djangoapps.discussion.django_comment_client.tests.utils import ForumsEnableMixin -from lms.djangoapps.discussion.rest_api.serializers import CommentSerializer, ThreadSerializer, get_context +from lms.djangoapps.discussion.rest_api.serializers import ( + CommentSerializer, + ThreadSerializer, + filter_spam_urls_from_html, + get_context +) from lms.djangoapps.discussion.rest_api.tests.utils import ( CommentsServiceMockMixin, make_minimal_cs_comment, @@ -1108,3 +1114,23 @@ class CommentSerializerDeserializationTest(ForumsEnableMixin, CommentsServiceMoc ) assert not serializer.is_valid() assert serializer.errors == {field: ['This field is not allowed in an update.']} + + +class FilterSpamTest(SharedModuleStoreTestCase): + """ + Tests for the filter_spam method + """ + @override_settings(DISCUSSION_SPAM_URLS=['example.com']) + def test_filter(self): + self.assertEqual( + filter_spam_urls_from_html('
'), + '