From 2613c8ef4e95c6131c9375932765c3ed0a294244 Mon Sep 17 00:00:00 2001
From: Navin Karkera
Date: Thu, 6 Mar 2025 16:55:45 +0000
Subject: [PATCH] feat: preserve image alt text while converting html to text (#36333)

Replace image with its alt text while converting html to plain text for indexing.
---
 xmodule/annotator_mixin.py            |  7 +++++++
 xmodule/tests/test_annotator_mixin.py | 14 ++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/xmodule/annotator_mixin.py b/xmodule/annotator_mixin.py
index 8e7617e1ca..b3aaefa31c 100644
--- a/xmodule/annotator_mixin.py
+++ b/xmodule/annotator_mixin.py
@@ -37,6 +37,13 @@ class MLStripper(HTMLParser): # lint-amnesty, pylint: disable=abstract-method
         self.reset()
         self.fed = []
 
+    def handle_starttag(self, tag, attrs):
+        """Collect the alt text of <img> tags; attributes of other tags are ignored."""
+        if tag != 'img':
+            return
+        # HTMLParser reports a valueless alt (<img alt>) as None; only collect real text.
+        self.fed.extend(value for name, value in attrs if name == 'alt' and value)
+
     def handle_data(self, data):
         """takes the data in separate chunks"""
         self.fed.append(data)
diff --git a/xmodule/tests/test_annotator_mixin.py b/xmodule/tests/test_annotator_mixin.py
index baaa7ad5d9..655d18ef34 100644
--- a/xmodule/tests/test_annotator_mixin.py
+++ b/xmodule/tests/test_annotator_mixin.py
@@ -22,6 +22,10 @@ class HelperFunctionTest(unittest.TestCase):
     sample_sourceurl = "http://video-js.zencoder.com/oceans-clip.mp4"
     sample_youtubeurl = "http://www.youtube.com/watch?v=yxLIu-scR9Y"
     sample_html = '

Testing here and not bolded here

' + # pylint: disable=line-too-long + sample_html_with_image_alt = '''

Testing here with image:

the alt text

''' + # pylint: disable=line-too-long + sample_html_with_no_image_alt = '''

Testing here with image:

'''
 
     def test_get_instructions(self):
         """
@@ -54,3 +58,13 @@ class HelperFunctionTest(unittest.TestCase):
         expectedtext = "Testing here and not bolded here"
         result = html_to_text(self.sample_html)
         assert expectedtext == result
+
+    def test_html_image_with_alt_text(self):
+        """The alt text of an image shows up in the converted plain text."""
+        expected = "Testing here with image: the alt text"
+        assert html_to_text(self.sample_html_with_image_alt) == expected
+
+    def test_html_image_with_no_alt_text(self):
+        """An image without alt text adds nothing to the converted plain text."""
+        expected = "Testing here with image: "
+        assert html_to_text(self.sample_html_with_no_image_alt) == expected