From 0a8ec0caab57868dfd100b5ae2d6b97ab0f637ed Mon Sep 17 00:00:00 2001 From: Muhammad Adeel Tajamul <77053848+muhammadadeeltajamul@users.noreply.github.com> Date: Fri, 11 Jul 2025 22:54:05 +0500 Subject: [PATCH] temp: added option to replace spam content with text (#37009) --- lms/djangoapps/discussion/rest_api/serializers.py | 11 +++++++++-- .../discussion/rest_api/tests/test_serializers.py | 6 +++--- lms/envs/common.py | 5 +++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lms/djangoapps/discussion/rest_api/serializers.py b/lms/djangoapps/discussion/rest_api/serializers.py index b6587c0f36..75ef1f3d51 100644 --- a/lms/djangoapps/discussion/rest_api/serializers.py +++ b/lms/djangoapps/discussion/rest_api/serializers.py @@ -144,10 +144,13 @@ def _validate_privileged_access(context: Dict) -> bool: def filter_spam_urls_from_html(html_string): """ Filters out spam posts from html + Returns: + clean_post, is_spam """ html_string = html.unescape(html_string) soup = BeautifulSoup(html_string, "html.parser") patterns = [] + is_spam = False for domain in settings.DISCUSSION_SPAM_URLS: escaped = domain.replace(".", r"\.") domain_pattern = rf"(\w+\.)*{escaped}(?:/\S*)*" @@ -165,6 +168,7 @@ def filter_spam_urls_from_html(html_string): if href: if any(p.search(href) for p in patterns): a_tag.replace_with(a_tag.get_text(strip=True)) + is_spam = True for text_node in soup.find_all(string=True): new_text = text_node @@ -172,8 +176,9 @@ def filter_spam_urls_from_html(html_string): new_text = p.sub('', new_text) if new_text != text_node: text_node.replace_with(new_text.strip()) + is_spam = True - return str(soup) + return str(soup), is_spam class _ContentSerializer(serializers.Serializer): @@ -283,7 +288,9 @@ class _ContentSerializer(serializers.Serializer): """ if self._rendered_body is None: self._rendered_body = render_body(obj["body"]) - self._rendered_body = filter_spam_urls_from_html(self._rendered_body) + self._rendered_body, is_spam = filter_spam_urls_from_html(self._rendered_body) + if is_spam and settings.CONTENT_FOR_SPAM_POSTS: + self._rendered_body = settings.CONTENT_FOR_SPAM_POSTS return self._rendered_body def get_abuse_flagged(self, obj): diff --git a/lms/djangoapps/discussion/rest_api/tests/test_serializers.py b/lms/djangoapps/discussion/rest_api/tests/test_serializers.py index f7aa2d83ad..0333c62d73 100644 --- a/lms/djangoapps/discussion/rest_api/tests/test_serializers.py +++ b/lms/djangoapps/discussion/rest_api/tests/test_serializers.py @@ -1123,14 +1123,14 @@ class FilterSpamTest(SharedModuleStoreTestCase): @override_settings(DISCUSSION_SPAM_URLS=['example.com']) def test_filter(self): self.assertEqual( - filter_spam_urls_from_html('
abc
'), + filter_spam_urls_from_html('
abc
')[0], '
abc
' ) self.assertEqual( - filter_spam_urls_from_html('
example.com/abc/def
'), + filter_spam_urls_from_html('
example.com/abc/def
')[0], '
' ) self.assertEqual( - filter_spam_urls_from_html('
e x a m p l e . c o m / a b c / d e f
'), + filter_spam_urls_from_html('
e x a m p l e . c o m / a b c / d e f
')[0], '
' ) diff --git a/lms/envs/common.py b/lms/envs/common.py index b4f6c8b43f..ba77c23afb 100644 --- a/lms/envs/common.py +++ b/lms/envs/common.py @@ -5089,6 +5089,11 @@ EXAMS_DASHBOARD_MICROFRONTEND_URL = None # .. setting_description: Urls to filter from discussion content to avoid spam DISCUSSION_SPAM_URLS = [] +# .. setting_name: CONTENT_FOR_SPAM_POSTS +# .. setting_default: "" +# .. setting_description: Content to replace spam posts with +CONTENT_FOR_SPAM_POSTS = "" + # .. toggle_name: ENABLE_AUTHN_RESET_PASSWORD_HIBP_POLICY # .. toggle_implementation: DjangoSetting # .. toggle_default: False