[fix] revision of utils.HTMLTextExtractor (#5125)

Related:

- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
This commit is contained in:
Markus Heiser 2025-08-18 16:30:51 +02:00 committed by GitHub
parent b606103352
commit 4fb6105d69
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 41 additions and 65 deletions

View file

@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions'])) return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
class _HTMLTextExtractorException(Exception): class HTMLTextExtractor(HTMLParser):
"""Internal exception raised when the HTML is invalid"""
class _HTMLTextExtractor(HTMLParser):
"""Internal class to extract text from HTML""" """Internal class to extract text from HTML"""
def __init__(self): def __init__(self):
@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
return return
if tag != self.tags[-1]: if tag != self.tags[-1]:
raise _HTMLTextExtractorException() self.result.append(f"</{tag}>")
return
self.tags.pop() self.tags.pop()
@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example' 'Example'
>>> html_to_text(r'regexp: (?<![a-zA-Z]') >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
'regexp: (?<![a-zA-Z]' 'regexp: (?<![a-zA-Z]'
>>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
'Lorem ipsum </i>dolor sit amet</p>'
>>> html_to_text(r'&#x3e &#x3c &#97')
'> < a'
""" """
if not html_str: if not html_str:
return "" return ""
html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split()) html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor() s = HTMLTextExtractor()
try: try:
s.feed(html_str) s.feed(html_str)
s.close() s.close()
except AssertionError: except AssertionError:
s = _HTMLTextExtractor() s = HTMLTextExtractor()
s.feed(escape(html_str, quote=True)) s.feed(escape(html_str, quote=True))
s.close() s.close()
except _HTMLTextExtractorException:
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
return s.get_text() return s.get_text()

View file

@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
self.assertIsNotNone(utils.searxng_useragent()) self.assertIsNotNone(utils.searxng_useragent())
self.assertTrue(utils.searxng_useragent().startswith('SearXNG')) self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
def test_html_to_text(self):
html_str = """
<a href="/testlink" class="link_access_account">
<style>
.toto {
color: red;
}
</style>
<span class="toto">
<span>
<img src="test.jpg" />
</span>
</span>
<span class="titi">
Test text
</span>
<script>value='dummy';</script>
</a>
"""
self.assertIsInstance(utils.html_to_text(html_str), str)
self.assertIsNotNone(utils.html_to_text(html_str))
self.assertEqual(utils.html_to_text(html_str), "Test text")
self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
def test_extract_text(self): def test_extract_text(self):
html_str = """ html_str = """
<a href="/testlink" class="link_access_account"> <a href="/testlink" class="link_access_account">
@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
with self.assertRaises(Exception): with self.assertRaises(Exception):
utils.extract_url([], 'https://example.com') utils.extract_url([], 'https://example.com')
def test_html_to_text_invalid(self):
_html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
def test_ecma_unscape(self): def test_ecma_unscape(self):
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界') self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring
def setUp(self):
super().setUp()
self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access
def test__init__(self):
self.assertEqual(self.html_text_extractor.result, [])
@parameterized.expand( @parameterized.expand(
[ [
('xF', '\x0f'), ('Example <span id="42">#2</span>', 'Example #2'),
('XF', '\x0f'), ('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
('97', 'a'), (r'regexp: (?&lt;![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
(r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
(r'&#x3e &#x3c &#97', '> < a'),
] ]
) )
def test_handle_charref(self, charref: str, expected: str): def test_html_to_text(self, html_str: str, text_str: str):
self.html_text_extractor.handle_charref(charref) self.assertEqual(utils.html_to_text(html_str), text_str)
self.assertIn(expected, self.html_text_extractor.result)
def test_handle_entityref(self): def test_html_to_text_with_a_style_span(self):
entity = 'test' html_str = """
self.html_text_extractor.handle_entityref(entity) <a href="/testlink" class="link_access_account">
self.assertIn(entity, self.html_text_extractor.result) <style>
.toto {
def test_invalid_html(self): color: red;
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>' }
with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access </style>
self.html_text_extractor.feed(text) <span class="toto">
<span>
<img src="test.jpg" />
</span>
</span>
<span class="titi">
Test text
</span>
<script>value='dummy';</script>
</a>
"""
self.assertIsInstance(utils.html_to_text(html_str), str)
self.assertEqual(utils.html_to_text(html_str), "Test text")
class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring