feat: preserve image alt text while converting html to text (#36333)
Replace image with its alt text while converting html to plain text for indexing.
This commit is contained in:
@@ -37,6 +37,13 @@ class MLStripper(HTMLParser): # lint-amnesty, pylint: disable=abstract-method
|
||||
self.reset()
|
||||
self.fed = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
    """Keep the alt text of ``<img>`` tags as part of the extracted text.

    Every tag other than ``img`` is ignored. For an image, each usable
    ``alt`` attribute value is appended to ``self.fed`` next to the text
    chunks collected by ``handle_data``.

    Args:
        tag: tag name reported by the parser.
        attrs: iterable of ``(name, value)`` attribute entries.
    """
    if tag != 'img':
        return
    for attr in attrs:
        # Tolerate malformed entries instead of raising on them.
        if len(attr) < 2 or attr[0] != 'alt':
            continue
        alt_text = attr[1]
        # html.parser reports a valueless attribute (``<img alt>``) as
        # ``('alt', None)``; appending None would put a non-string among the
        # text chunks. An empty alt contributes nothing, so skip it as well.
        if alt_text:
            self.fed.append(alt_text)
|
||||
|
||||
def handle_data(self, data):
|
||||
"""takes the data in separate chunks"""
|
||||
self.fed.append(data)
|
||||
|
||||
@@ -22,6 +22,10 @@ class HelperFunctionTest(unittest.TestCase):
|
||||
sample_sourceurl = "http://video-js.zencoder.com/oceans-clip.mp4"
|
||||
sample_youtubeurl = "http://www.youtube.com/watch?v=yxLIu-scR9Y"
|
||||
sample_html = '<p><b>Testing here</b> and not bolded here</p>'
|
||||
# pylint: disable=line-too-long
|
||||
sample_html_with_image_alt = '''<p>Testing here with image: </p><p><img src="/static/image.jpg" alt="the alt text" width="560" height="315" /></p>'''
|
||||
# pylint: disable=line-too-long
|
||||
sample_html_with_no_image_alt = '''<p>Testing here with image: </p><p><img src="/static/image.jpg" width="560" height="315" /></p>'''
|
||||
|
||||
def test_get_instructions(self):
|
||||
"""
|
||||
@@ -54,3 +58,13 @@ class HelperFunctionTest(unittest.TestCase):
|
||||
expectedtext = "Testing here and not bolded here"
|
||||
result = html_to_text(self.sample_html)
|
||||
assert expectedtext == result
|
||||
|
||||
def test_html_image_with_alt_text(self):
    """An image's alt text is kept in the plain-text conversion."""
    converted = html_to_text(self.sample_html_with_image_alt)
    assert converted == "Testing here with image: the alt text"
|
||||
|
||||
def test_html_image_with_no_alt_text(self):
    """An image without an alt attribute adds nothing to the converted text."""
    converted = html_to_text(self.sample_html_with_no_image_alt)
    assert converted == "Testing here with image: "
|
||||
|
||||
Reference in New Issue
Block a user