feat: preserve image alt text while converting html to text (#36333)

Replace each image with its alt text when converting HTML to plain text for indexing.
This commit is contained in:
Navin Karkera
2025-03-06 16:55:45 +00:00
committed by GitHub
parent cc56e41d0c
commit 2613c8ef4e
2 changed files with 21 additions and 0 deletions

View File

@@ -37,6 +37,13 @@ class MLStripper(HTMLParser): # lint-amnesty, pylint: disable=abstract-method
self.reset()
self.fed = []
def handle_starttag(self, tag, attrs):
    """Keep the alt text of ``<img>`` tags so it survives tag stripping.

    Args:
        tag: Name of the start tag being parsed.
        attrs: Sequence of ``(name, value)`` attribute pairs for the tag.

    Every tag other than ``img`` is ignored. For an ``img`` tag, the value
    of each ``alt`` attribute is appended to ``self.fed``.
    """
    if tag != 'img':
        return
    for attr in attrs:
        if len(attr) < 2 or attr[0] != 'alt':
            continue
        alt_text = attr[1]
        # A valueless attribute (``<img alt>``) is reported with a value of
        # None; storing it among the text chunks would break string joining.
        if alt_text is not None:
            self.fed.append(alt_text)
def handle_data(self, data):
    """Store one chunk of text; text arrives in separate chunks."""
    collected = self.fed
    collected.append(data)

View File

@@ -22,6 +22,10 @@ class HelperFunctionTest(unittest.TestCase):
# Sample media URLs; presumably consumed by tests outside this view.
sample_sourceurl = "http://video-js.zencoder.com/oceans-clip.mp4"
sample_youtubeurl = "http://www.youtube.com/watch?v=yxLIu-scR9Y"
# Markup with nested inline tags, passed to html_to_text by a test below.
sample_html = '<p><b>Testing here</b> and not bolded here</p>'
# Markup whose <img> carries an alt attribute ("the alt text").
# pylint: disable=line-too-long
sample_html_with_image_alt = '''<p>Testing here with image: </p><p><img src="/static/image.jpg" alt="the alt text" width="560" height="315" /></p>'''
# Same markup, but the <img> has no alt attribute.
# pylint: disable=line-too-long
sample_html_with_no_image_alt = '''<p>Testing here with image: </p><p><img src="/static/image.jpg" width="560" height="315" /></p>'''
def test_get_instructions(self):
"""
@@ -54,3 +58,13 @@ class HelperFunctionTest(unittest.TestCase):
expectedtext = "Testing here and not bolded here"
result = html_to_text(self.sample_html)
assert expectedtext == result
def test_html_image_with_alt_text(self):
    """An image's alt text appears in the converted text in place of the image."""
    converted = html_to_text(self.sample_html_with_image_alt)
    assert "Testing here with image: the alt text" == converted
def test_html_image_with_no_alt_text(self):
    """An image without alt text contributes nothing to the converted text."""
    converted = html_to_text(self.sample_html_with_no_image_alt)
    assert "Testing here with image: " == converted