From 4fb6105d699e19321f6799d7fff05313fd4cd4b9 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Mon, 18 Aug 2025 16:30:51 +0200
Subject: [PATCH] [fix] revision of utils.HTMLTextExtractor (#5125)
Related:
- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
---
searx/utils.py | 24 ++++++------
tests/unit/test_utils.py | 82 ++++++++++++++--------------------------
2 files changed, 41 insertions(+), 65 deletions(-)
diff --git a/searx/utils.py b/searx/utils.py
index 54b32484e..dff3eb4f4 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
-class _HTMLTextExtractorException(Exception):
- """Internal exception raised when the HTML is invalid"""
-
-
-class _HTMLTextExtractor(HTMLParser):
+class HTMLTextExtractor(HTMLParser):
"""Internal class to extract text from HTML"""
def __init__(self):
@@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
return
if tag != self.tags[-1]:
- raise _HTMLTextExtractorException()
+ self.result.append(f"{tag}>")
+ return
self.tags.pop()
@@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('Example')
'Example'
- >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+ >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
'regexp: (?<![a-zA-Z]'
+
+ >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
+ 'Lorem ipsum </i>dolor sit amet</p>'
+
+ >>> html_to_text(r'> < a')
+ '> < a'
+
"""
if not html_str:
return ""
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split())
- s = _HTMLTextExtractor()
+ s = HTMLTextExtractor()
try:
s.feed(html_str)
s.close()
except AssertionError:
- s = _HTMLTextExtractor()
+ s = HTMLTextExtractor()
s.feed(escape(html_str, quote=True))
s.close()
- except _HTMLTextExtractorException:
- logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
return s.get_text()
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index ad6ca37a5..01056df74 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
self.assertIsNotNone(utils.searxng_useragent())
self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
- def test_html_to_text(self):
- html_str = """
-
-
-
-
-
-
-
-
- Test text
-
-
-
- """
- self.assertIsInstance(utils.html_to_text(html_str), str)
- self.assertIsNotNone(utils.html_to_text(html_str))
- self.assertEqual(utils.html_to_text(html_str), "Test text")
- self.assertEqual(utils.html_to_text(r"regexp: (?
@@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
with self.assertRaises(Exception):
utils.extract_url([], 'https://example.com')
- def test_html_to_text_invalid(self):
- _html = 'Lorem ipsumdolor sit amet
'
- self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
-
def test_ecma_unscape(self):
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
-
-class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring
-
- def setUp(self):
- super().setUp()
-
- self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access
-
- def test__init__(self):
- self.assertEqual(self.html_text_extractor.result, [])
-
@parameterized.expand(
[
- ('xF', '\x0f'),
- ('XF', '\x0f'),
- ('97', 'a'),
+ ('Example #2', 'Example #2'),
+ ('Example', 'Example'),
+ (r'regexp: (?<![a-zA-Z]', r'regexp: (?Lorem ipsum dolor sit amet', 'Lorem ipsum dolor sit amet'),
+ (r'> < a', '> < a'),
]
)
- def test_handle_charref(self, charref: str, expected: str):
- self.html_text_extractor.handle_charref(charref)
- self.assertIn(expected, self.html_text_extractor.result)
+ def test_html_to_text(self, html_str: str, text_str: str):
+ self.assertEqual(utils.html_to_text(html_str), text_str)
- def test_handle_entityref(self):
- entity = 'test'
- self.html_text_extractor.handle_entityref(entity)
- self.assertIn(entity, self.html_text_extractor.result)
-
- def test_invalid_html(self):
- text = 'Lorem ipsumdolor sit amet
'
- with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access
- self.html_text_extractor.feed(text)
+ def test_html_to_text_with_a_style_span(self):
+ html_str = """
+
+
+
+
+
+
+
+
+ Test text
+
+
+
+ """
+ self.assertIsInstance(utils.html_to_text(html_str), str)
+ self.assertEqual(utils.html_to_text(html_str), "Test text")
class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring