diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
index 1945d29311..c9776c15e6 100644
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -1215,7 +1215,7 @@ def _get_urls(content):
"""
Finds and returns a list of URLs in the given content.
Includes strings following 'href=' and 'src='.
- Excludes strings that are only '#'.
+ Excludes strings that start with '#' or 'data:'.
Arguments:
content (str): entire content of a block
@@ -1223,7 +1223,7 @@ def _get_urls(content):
Returns:
list: urls
"""
- regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
+ regex = r'\s+(?:href|src)=["\'](?!#|data:)([^"\']*)["\']'
url_list = re.findall(regex, content)
return url_list
diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py
index f04f488913..9eae1a518e 100644
--- a/cms/djangoapps/contentstore/tests/test_tasks.py
+++ b/cms/djangoapps/contentstore/tests/test_tasks.py
@@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
expected,
f"Failed for URL: {url}",
)
+
+ def test_get_urls(self):
+ """Test _get_urls function for correct URL extraction."""
+
+ content = '''
+ Link
+
+
+ Home
+ Valid
+
+
+ Another
+
No links here!
+