From 4fb6105d699e19321f6799d7fff05313fd4cd4b9 Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarIT.de>
Date: Mon, 18 Aug 2025 16:30:51 +0200
Subject: [PATCH] [fix] revision of utils.HTMLTextExtractor (#5125)

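Stop aborting on mismatched closing tags: ``_HTMLTextExtractorException`` is
removed, and an unexpected end tag is now kept in the output as literal text
(``</tag>``) instead of truncating the result at that point.  The class is
renamed from ``_HTMLTextExtractor`` to ``HTMLTextExtractor`` and the unit
tests are consolidated into a parameterized ``test_html_to_text``.

Related:

- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632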
---
 searx/utils.py           | 24 ++++++------
 tests/unit/test_utils.py | 82 ++++++++++++++--------------------------
 2 files changed, 41 insertions(+), 65 deletions(-)
diff --git a/searx/utils.py b/searx/utils.py
index 54b32484e..dff3eb4f4 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
     return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
 
 
-class _HTMLTextExtractorException(Exception):
-    """Internal exception raised when the HTML is invalid"""
-
-
-class _HTMLTextExtractor(HTMLParser):
+class HTMLTextExtractor(HTMLParser):
     """Internal class to extract text from HTML"""
 
     def __init__(self):
@@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise _HTMLTextExtractorException()
+            self.result.append(f"</{tag}>")
+            return
 
         self.tags.pop()
 
@@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
         >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
         'Example'
 
-        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
         'regexp: (?<![a-zA-Z]'
+
+        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
+        'Lorem ipsum </i>dolor sit amet</p>'
+
+        >>> html_to_text(r'&#x3e &#x3c &#97')
+        '> < a'
+
     """
     if not html_str:
         return ""
     html_str = html_str.replace('\n', ' ').replace('\r', ' ')
     html_str = ' '.join(html_str.split())
-    s = _HTMLTextExtractor()
+    s = HTMLTextExtractor()
     try:
         s.feed(html_str)
         s.close()
     except AssertionError:
-        s = _HTMLTextExtractor()
+        s = HTMLTextExtractor()
         s.feed(escape(html_str, quote=True))
         s.close()
-    except _HTMLTextExtractorException:
-        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
     return s.get_text()
 
 
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index ad6ca37a5..01056df74 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
         self.assertIsNotNone(utils.searxng_useragent())
         self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
 
-    def test_html_to_text(self):
-        html_str = """
-        <a href="/testlink" class="link_access_account">
-            <style>
-                .toto {
-                    color: red;
-                }
-            </style>
-            <span class="toto">
-                <span>
-                    <img src="test.jpg" />
-                </span>
-            </span>
-            <span class="titi">
-                            Test text
-            </span>
-            <script>value='dummy';</script>
-        </a>
-        """
-        self.assertIsInstance(utils.html_to_text(html_str), str)
-        self.assertIsNotNone(utils.html_to_text(html_str))
-        self.assertEqual(utils.html_to_text(html_str), "Test text")
-        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
-
     def test_extract_text(self):
         html_str = """
         <a href="/testlink" class="link_access_account">
@@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
         with self.assertRaises(Exception):
             utils.extract_url([], 'https://example.com')
 
-    def test_html_to_text_invalid(self):
-        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
-
     def test_ecma_unscape(self):
         self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
         self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
         self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
 
-
-class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
-
-    def setUp(self):
-        super().setUp()
-
-        self.html_text_extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
-
-    def test__init__(self):
-        self.assertEqual(self.html_text_extractor.result, [])
-
     @parameterized.expand(
         [
-            ('xF', '\x0f'),
-            ('XF', '\x0f'),
-            ('97', 'a'),
+            ('Example <span id="42">#2</span>', 'Example #2'),
+            ('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
+            (r'regexp: (?&lt;![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
+            (r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
+            (r'&#x3e &#x3c &#97', '> < a'),
         ]
     )
-    def test_handle_charref(self, charref: str, expected: str):
-        self.html_text_extractor.handle_charref(charref)
-        self.assertIn(expected, self.html_text_extractor.result)
+    def test_html_to_text(self, html_str: str, text_str: str):
+        self.assertEqual(utils.html_to_text(html_str), text_str)
 
-    def test_handle_entityref(self):
-        entity = 'test'
-        self.html_text_extractor.handle_entityref(entity)
-        self.assertIn(entity, self.html_text_extractor.result)
-
-    def test_invalid_html(self):
-        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        with self.assertRaises(utils._HTMLTextExtractorException):  # pylint: disable=protected-access
-            self.html_text_extractor.feed(text)
+    def test_html_to_text_with_a_style_span(self):
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
 
 
 class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring