diff --git a/searx/utils.py b/searx/utils.py index 54b32484e..dff3eb4f4 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str: return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions'])) -class _HTMLTextExtractorException(Exception): - """Internal exception raised when the HTML is invalid""" - - -class _HTMLTextExtractor(HTMLParser): +class HTMLTextExtractor(HTMLParser): """Internal class to extract text from HTML""" def __init__(self): @@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser): return if tag != self.tags[-1]: - raise _HTMLTextExtractorException() + self.result.append(f"") + return self.tags.pop() @@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str: >>> html_to_text('Example') 'Example' - >>> html_to_text(r'regexp: (?>> html_to_text(r'regexp: (?<![a-zA-Z]') 'regexp: (?>> html_to_text(r'

Lorem ipsum dolor sit amet

') + 'Lorem ipsum dolor sit amet

' + + >>> html_to_text(r'> < a') + '> < a' + """ if not html_str: return "" html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = ' '.join(html_str.split()) - s = _HTMLTextExtractor() + s = HTMLTextExtractor() try: s.feed(html_str) s.close() except AssertionError: - s = _HTMLTextExtractor() + s = HTMLTextExtractor() s.feed(escape(html_str, quote=True)) s.close() - except _HTMLTextExtractorException: - logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str) return s.get_text() diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ad6ca37a5..01056df74 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -28,30 +28,6 @@ class TestUtils(SearxTestCase): self.assertIsNotNone(utils.searxng_useragent()) self.assertTrue(utils.searxng_useragent().startswith('SearXNG')) - def test_html_to_text(self): - html_str = """ - - - - - - - - - Test text - - - - """ - self.assertIsInstance(utils.html_to_text(html_str), str) - self.assertIsNotNone(utils.html_to_text(html_str)) - self.assertEqual(utils.html_to_text(html_str), "Test text") - self.assertEqual(utils.html_to_text(r"regexp: (? @@ -99,46 +75,44 @@ class TestUtils(SearxTestCase): with self.assertRaises(Exception): utils.extract_url([], 'https://example.com') - def test_html_to_text_invalid(self): - _html = '

Lorem ipsumdolor sit amet

' - self.assertEqual(utils.html_to_text(_html), "Lorem ipsum") - def test_ecma_unscape(self): self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界') - -class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring - - def setUp(self): - super().setUp() - - self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access - - def test__init__(self): - self.assertEqual(self.html_text_extractor.result, []) - @parameterized.expand( [ - ('xF', '\x0f'), - ('XF', '\x0f'), - ('97', 'a'), + ('Example #2', 'Example #2'), + ('Example', 'Example'), + (r'regexp: (?<![a-zA-Z]', r'regexp: (?Lorem ipsum dolor sit amet

', 'Lorem ipsum dolor sit amet

'), + (r'> < a', '> < a'), ] ) - def test_handle_charref(self, charref: str, expected: str): - self.html_text_extractor.handle_charref(charref) - self.assertIn(expected, self.html_text_extractor.result) + def test_html_to_text(self, html_str: str, text_str: str): + self.assertEqual(utils.html_to_text(html_str), text_str) - def test_handle_entityref(self): - entity = 'test' - self.html_text_extractor.handle_entityref(entity) - self.assertIn(entity, self.html_text_extractor.result) - - def test_invalid_html(self): - text = '

Lorem ipsumdolor sit amet

' - with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access - self.html_text_extractor.feed(text) + def test_html_to_text_with_a_style_span(self): + html_str = """ + + """ + self.assertIsInstance(utils.html_to_text(html_str), str) + self.assertEqual(utils.html_to_text(html_str), "Test text") class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring