fix: improve regex to exclude data: urls (#36415)

2025-03-20 19:33:10 +05:00
parent 314e604f4c
commit e9c52100ef
2 changed files with 27 additions and 2 deletions
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -1215,7 +1215,7 @@ def _get_urls(content):
    """
    Finds and returns a list of URLs in the given content.
    Includes strings following 'href=' and 'src='.
-    Excludes strings that are only '#'.
+    Excludes strings that are only '#' or start with 'data:'.

    Arguments:
        content (str): entire content of a block
@@ -1223,7 +1223,7 @@ def _get_urls(content):
    Returns:
        list: urls
    """
-    regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
+    regex = r'\s+(?:href|src)=["\'](?!#|data:)([^"\']*)["\']'
    url_list = re.findall(regex, content)
    return url_list

--- a/cms/djangoapps/contentstore/tests/test_tasks.py
+++ b/cms/djangoapps/contentstore/tests/test_tasks.py
@@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
                expected,
                f"Failed for URL: {url}",
            )
+
+    def test_get_urls(self):
+        """Test _get_urls function for correct URL extraction."""
+
+        content = '''
+            <a href="https://example.com">Link</a>
+            <img src="https://images.com/pic.jpg">
+            <link href="https://fonts.googleapis.com/css?family=Roboto">
+            <a href="#">Home</a>
+            <a href="https://validsite.com">Valid</a>
+            <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...">
+            <a href="data:application/pdf;base64,JVBERi0xLjQK...">
+            <a href="https://another-valid.com">Another</a>
+            <p>No links here!</p>
+            <img alt="Just an image without src">
+        '''
+
+        expected = [
+            "https://example.com",
+            "https://images.com/pic.jpg",
+            "https://fonts.googleapis.com/css?family=Roboto",
+            "https://validsite.com",
+            "https://another-valid.com"
+        ]
+        self.assertEqual(_get_urls(content), expected)