fix: improve regex to exclude data: urls (#36415)

This commit is contained in:
Muhammad Farhan
2025-03-20 19:33:10 +05:00
committed by GitHub
parent 314e604f4c
commit e9c52100ef
2 changed files with 27 additions and 2 deletions

View File

@@ -1215,7 +1215,7 @@ def _get_urls(content):
"""
Finds and returns a list of URLs in the given content.
Includes strings following 'href=' and 'src='.
Excludes strings that are only '#'.
Excludes strings that start with '#' or with 'data:'.
Arguments:
content (str): entire content of a block
@@ -1223,7 +1223,7 @@ def _get_urls(content):
Returns:
list: urls
"""
regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
regex = r'\s+(?:href|src)=["\'](?!#|data:)([^"\']*)["\']'
url_list = re.findall(regex, content)
return url_list

View File

@@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
expected,
f"Failed for URL: {url}",
)
def test_get_urls(self):
    """
    Verify that _get_urls returns only the regular link targets.

    The sample markup mixes http(s) targets with a bare '#' anchor, two
    'data:' URIs and tags that carry no href/src attribute. Only the
    http(s) targets are expected back, in the order they appear.
    """
    # The markup lines sit at column 0 so the literal's text is unchanged.
    sample_markup = '''
<a href="https://example.com">Link</a>
<img src="https://images.com/pic.jpg">
<link href="https://fonts.googleapis.com/css?family=Roboto">
<a href="#">Home</a>
<a href="https://validsite.com">Valid</a>
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...">
<a href="data:application/pdf;base64,JVBERi0xLjQK...">
<a href="https://another-valid.com">Another</a>
<p>No links here!</p>
<img alt="Just an image without src">
'''
    extracted_urls = _get_urls(sample_markup)
    self.assertEqual(
        extracted_urls,
        [
            "https://example.com",
            "https://images.com/pic.jpg",
            "https://fonts.googleapis.com/css?family=Roboto",
            "https://validsite.com",
            "https://another-valid.com",
        ],
    )