From e9c52100ef3a017d0dac808d6b57d7907a4919de Mon Sep 17 00:00:00 2001 From: Muhammad Farhan <129956601+mfarhan943@users.noreply.github.com> Date: Thu, 20 Mar 2025 19:33:10 +0500 Subject: [PATCH] fix: improve regex to exclude data: urls (#36415) --- cms/djangoapps/contentstore/tasks.py | 4 +-- .../contentstore/tests/test_tasks.py | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 1945d29311..c9776c15e6 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1215,7 +1215,7 @@ def _get_urls(content): """ Finds and returns a list of URLs in the given content. Includes strings following 'href=' and 'src='. - Excludes strings that are only '#'. + Excludes strings that start with '#' or 'data:'. Arguments: content (str): entire content of a block @@ -1223,7 +1223,7 @@ def _get_urls(content): Returns: list: urls """ - regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' + regex = r'\s+(?:href|src)=["\'](?!#|data:)([^"\']*)["\']' url_list = re.findall(regex, content) return url_list diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index f04f488913..9eae1a518e 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase): expected, f"Failed for URL: {url}", ) + + def test_get_urls(self): + """Test _get_urls function for correct URL extraction.""" + + content = ''' + Link + + + Home + Valid + + + Another +

No links here!

+ Just an image without src + ''' + + expected = [ + "https://example.com", + "https://images.com/pic.jpg", + "https://fonts.googleapis.com/css?family=Roboto", + "https://validsite.com", + "https://another-valid.com" + ] + self.assertEqual(_get_urls(content), expected)