fix: improve regex to exclude data: urls (#36415)
This commit is contained in:
def _get_urls(content):
    """
    Finds and returns a list of URLs in the given content.

    Includes the quoted values that follow 'href=' and 'src=' when the
    attribute name is preceded by whitespace.
    Excludes values that start with '#' (a bare '#' as well as in-page
    anchors such as '#top') or with the 'data:' scheme, which is matched
    case-insensitively.

    Arguments:
        content (str): entire content of a block

    Returns:
        list: urls, in order of appearance (duplicates are kept)
    """
    # The negative lookahead is evaluated right after the opening quote, so a
    # value beginning with '#' or 'data:' is never captured.  URI scheme names
    # are case-insensitive, hence the scoped (?i:...) group around 'data:';
    # the rest of the pattern stays case-sensitive as before.
    regex = r'\s+(?:href|src)=["\'](?!#|(?i:data:))([^"\']*)["\']'
    return re.findall(regex, content)
|
||||
|
||||
|
||||
@@ -577,3 +577,28 @@ class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
|
||||
expected,
|
||||
f"Failed for URL: {url}",
|
||||
)
|
||||
|
||||
def test_get_urls(self):
    """Test _get_urls function for correct URL extraction."""

    # Fixture mixes links that must be returned with values that must be
    # skipped: a bare '#', an in-page anchor, two data: URIs, and tags that
    # carry no href/src attribute at all.
    content = '''
        <a href="https://example.com">Link</a>
        <img src="https://images.com/pic.jpg">
        <link href="https://fonts.googleapis.com/css?family=Roboto">
        <a href="#">Home</a>
        <a href="#section">Section</a>
        <a href="https://validsite.com">Valid</a>
        <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...">
        <a href="data:application/pdf;base64,JVBERi0xLjQK...">
        <a href="https://another-valid.com">Another</a>
        <p>No links here!</p>
        <img alt="Just an image without src">
    '''

    # Expected URLs are listed in document order.
    expected = [
        "https://example.com",
        "https://images.com/pic.jpg",
        "https://fonts.googleapis.com/css?family=Roboto",
        "https://validsite.com",
        "https://another-valid.com",
    ]

    self.assertEqual(_get_urls(content), expected)
|
||||
|
||||
Reference in New Issue
Block a user