From e9c52100ef3a017d0dac808d6b57d7907a4919de Mon Sep 17 00:00:00 2001
From: Muhammad Farhan <129956601+mfarhan943@users.noreply.github.com>
Date: Thu, 20 Mar 2025 19:33:10 +0500
Subject: [PATCH] fix: improve regex to exclude data: urls (#36415)
---
cms/djangoapps/contentstore/tasks.py | 4 +--
.../contentstore/tests/test_tasks.py | 25 +++++++++++++++++++
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
index 1945d29311..c9776c15e6 100644
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -1215,7 +1215,7 @@ def _get_urls(content):
"""
Finds and returns a list of URLs in the given content.
Includes strings following 'href=' and 'src='.
- Excludes strings that are only '#'.
+ Excludes strings that start with '#' (fragment-only links) or with 'data:'.
Arguments:
content (str): entire content of a block
@@ -1223,7 +1223,7 @@ def _get_urls(content):
Returns:
list: urls
"""
- regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
+ regex = r'\s+(?:href|src)=["\'](?!#|data:)([^"\']*)["\']'
url_list = re.findall(regex, content)
return url_list
diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py
index f04f488913..9eae1a518e 100644
--- a/cms/djangoapps/contentstore/tests/test_tasks.py
+++ b/cms/djangoapps/contentstore/tests/test_tasks.py
@@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
expected,
f"Failed for URL: {url}",
)
+
+ def test_get_urls(self):
+ """Test _get_urls function for correct URL extraction."""
+
+ content = '''
+ Link
+
+
+ Home
+ Valid
+
+
+ Another
+
No links here!
+