NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Context-line indentation and the HTML fixture in test_get_urls were restored
by hand -- confirm against the target files before applying.

diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
index 1945d29311..c9776c15e6 100644
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -1215,7 +1215,7 @@ def _get_urls(content):
     """
     Finds and returns a list of URLs in the given content.
     Includes strings following 'href=' and 'src='.
-    Excludes strings that are only '#'.
+    Excludes strings that start with '#' or 'data:'.
 
     Arguments:
         content (str): entire content of a block
@@ -1223,7 +1223,7 @@ def _get_urls(content):
     Returns:
         list: urls
     """
-    regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
+    regex = r'\s+(?:href|src)=["\'](?!#|data:)([^"\']*)["\']'
     url_list = re.findall(regex, content)
     return url_list
 
diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py
index f04f488913..9eae1a518e 100644
--- a/cms/djangoapps/contentstore/tests/test_tasks.py
+++ b/cms/djangoapps/contentstore/tests/test_tasks.py
@@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
                     expected,
                     f"Failed for URL: {url}",
                 )
+
+    def test_get_urls(self):
+        """Test _get_urls function for correct URL extraction."""
+
+        content = '''
+            <a href="https://example.com">Link</a>
+            <img src="https://images.com/pic.jpg">
+            <link href="https://fonts.googleapis.com/css?family=Roboto">
+            <a href="#">Home</a>
+            <a href="https://validsite.com">Valid</a>
+            <img src="data:image/png;base64,iVBORw0KGgo=">
+            <link href="data:text/css;base64,Ym9keXt9">
+            <a href="https://another-valid.com">Another</a>
+            <p>No links here!</p>
+            <img alt="Just an image without src">
+        '''
+
+        expected = [
+            "https://example.com",
+            "https://images.com/pic.jpg",
+            "https://fonts.googleapis.com/css?family=Roboto",
+            "https://validsite.com",
+            "https://another-valid.com"
+        ]
+        self.assertEqual(_get_urls(content), expected)