[fix] revision of utils.HTMLTextExtractor (#5125)

Related: https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
2025-09-06 18:28:32 +02:00 · 2025-08-18 16:30:51 +02:00 · 2025-08-18 16:30:51 +02:00 · 4fb6105d69
commit 4fb6105d69
parent b606103352
2 changed files with 41 additions and 65 deletions
--- a/searx/utils.py
+++ b/searx/utils.py
@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
-class _HTMLTextExtractorException(Exception):
+class HTMLTextExtractor(HTMLParser):
    """Internal exception raised when the HTML is invalid"""
 class _HTMLTextExtractor(HTMLParser):
    """Internal class to extract text from HTML"""
    def __init__(self):
@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
            return
        if tag != self.tags[-1]:
-            raise _HTMLTextExtractorException()
+            self.result.append(f"</{tag}>")
            return
        self.tags.pop()
@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'
-        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'
        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
        'Lorem ipsum </i>dolor sit amet</p>'
        >>> html_to_text(r'&#x3e &#x3c &#97')
        '> < a'
    """
    if not html_str:
        return ""
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
-    s = _HTMLTextExtractor()
+    s = HTMLTextExtractor()
    try:
        s.feed(html_str)
        s.close()
    except AssertionError:
-        s = _HTMLTextExtractor()
+        s = HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
        s.close()
    except _HTMLTextExtractorException:
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
    return s.get_text()
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
        self.assertIsNotNone(utils.searxng_useragent())
        self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
    def test_html_to_text(self):
        html_str = """
        <a href="/testlink" class="link_access_account">
            <style>
                .toto {
                    color: red;
                }
            </style>
            <span class="toto">
                <span>
                    <img src="test.jpg" />
                </span>
            </span>
            <span class="titi">
                            Test text
            </span>
            <script>value='dummy';</script>
        </a>
        """
        self.assertIsInstance(utils.html_to_text(html_str), str)
        self.assertIsNotNone(utils.html_to_text(html_str))
        self.assertEqual(utils.html_to_text(html_str), "Test text")
        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
    def test_extract_text(self):
        html_str = """
        <a href="/testlink" class="link_access_account">
@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
        with self.assertRaises(Exception):
            utils.extract_url([], 'https://example.com')
    def test_html_to_text_invalid(self):
        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
    def test_ecma_unscape(self):
        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
 class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
    def setUp(self):
        super().setUp()
        self.html_text_extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
    def test__init__(self):
        self.assertEqual(self.html_text_extractor.result, [])
    @parameterized.expand(
        [
-            ('xF', '\x0f'),
+            ('Example <span id="42">#2</span>', 'Example #2'),
-            ('XF', '\x0f'),
+            ('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
-            ('97', 'a'),
+            (r'regexp: (?&lt;![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
            (r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
            (r'&#x3e &#x3c &#97', '> < a'),
        ]
    )
-    def test_handle_charref(self, charref: str, expected: str):
+    def test_html_to_text(self, html_str: str, text_str: str):
-        self.html_text_extractor.handle_charref(charref)
+        self.assertEqual(utils.html_to_text(html_str), text_str)
        self.assertIn(expected, self.html_text_extractor.result)
-    def test_handle_entityref(self):
+    def test_html_to_text_with_a_style_span(self):
-        entity = 'test'
+        html_str = """
-        self.html_text_extractor.handle_entityref(entity)
+        <a href="/testlink" class="link_access_account">
-        self.assertIn(entity, self.html_text_extractor.result)
+            <style>
-
+                .toto {
-    def test_invalid_html(self):
+                    color: red;
-        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+                }
-        with self.assertRaises(utils._HTMLTextExtractorException):  # pylint: disable=protected-access
+            </style>
-            self.html_text_extractor.feed(text)
+            <span class="toto">
                <span>
                    <img src="test.jpg" />
                </span>
            </span>
            <span class="titi">
                            Test text
            </span>
            <script>value='dummy';</script>
        </a>
        """
        self.assertIsInstance(utils.html_to_text(html_str), str)
        self.assertEqual(utils.html_to_text(html_str), "Test text")
 class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring