Mirror of https://github.com/searxng/searxng.git (synced 2025-09-06 18:28:32 +02:00)
[fix] revision of utils.HTMLTextExtractor (#5125)

Related:
- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
This commit is contained in:
parent b606103352
commit 4fb6105d69
2 changed files with 41 additions and 65 deletions
|
@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
|
||||||
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
|
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
|
||||||
|
|
||||||
|
|
||||||
class _HTMLTextExtractorException(Exception):
|
class HTMLTextExtractor(HTMLParser):
|
||||||
"""Internal exception raised when the HTML is invalid"""
|
|
||||||
|
|
||||||
|
|
||||||
class _HTMLTextExtractor(HTMLParser):
|
|
||||||
"""Internal class to extract text from HTML"""
|
"""Internal class to extract text from HTML"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
if tag != self.tags[-1]:
|
if tag != self.tags[-1]:
|
||||||
raise _HTMLTextExtractorException()
|
self.result.append(f"</{tag}>")
|
||||||
|
return
|
||||||
|
|
||||||
self.tags.pop()
|
self.tags.pop()
|
||||||
|
|
||||||
|
@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
|
||||||
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
|
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
|
||||||
'Example'
|
'Example'
|
||||||
|
|
||||||
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
|
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
|
||||||
'regexp: (?<![a-zA-Z]'
|
'regexp: (?<![a-zA-Z]'
|
||||||
|
|
||||||
|
>>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
|
||||||
|
'Lorem ipsum </i>dolor sit amet</p>'
|
||||||
|
|
||||||
|
>>> html_to_text(r'> < a')
|
||||||
|
'> < a'
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if not html_str:
|
if not html_str:
|
||||||
return ""
|
return ""
|
||||||
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
|
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
|
||||||
html_str = ' '.join(html_str.split())
|
html_str = ' '.join(html_str.split())
|
||||||
s = _HTMLTextExtractor()
|
s = HTMLTextExtractor()
|
||||||
try:
|
try:
|
||||||
s.feed(html_str)
|
s.feed(html_str)
|
||||||
s.close()
|
s.close()
|
||||||
except AssertionError:
|
except AssertionError:
|
||||||
s = _HTMLTextExtractor()
|
s = HTMLTextExtractor()
|
||||||
s.feed(escape(html_str, quote=True))
|
s.feed(escape(html_str, quote=True))
|
||||||
s.close()
|
s.close()
|
||||||
except _HTMLTextExtractorException:
|
|
||||||
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
|
|
||||||
return s.get_text()
|
return s.get_text()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
|
||||||
self.assertIsNotNone(utils.searxng_useragent())
|
self.assertIsNotNone(utils.searxng_useragent())
|
||||||
self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
|
self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
|
||||||
|
|
||||||
def test_html_to_text(self):
|
|
||||||
html_str = """
|
|
||||||
<a href="/testlink" class="link_access_account">
|
|
||||||
<style>
|
|
||||||
.toto {
|
|
||||||
color: red;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<span class="toto">
|
|
||||||
<span>
|
|
||||||
<img src="test.jpg" />
|
|
||||||
</span>
|
|
||||||
</span>
|
|
||||||
<span class="titi">
|
|
||||||
Test text
|
|
||||||
</span>
|
|
||||||
<script>value='dummy';</script>
|
|
||||||
</a>
|
|
||||||
"""
|
|
||||||
self.assertIsInstance(utils.html_to_text(html_str), str)
|
|
||||||
self.assertIsNotNone(utils.html_to_text(html_str))
|
|
||||||
self.assertEqual(utils.html_to_text(html_str), "Test text")
|
|
||||||
self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
|
|
||||||
|
|
||||||
def test_extract_text(self):
|
def test_extract_text(self):
|
||||||
html_str = """
|
html_str = """
|
||||||
<a href="/testlink" class="link_access_account">
|
<a href="/testlink" class="link_access_account">
|
||||||
|
@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
|
||||||
with self.assertRaises(Exception):
|
with self.assertRaises(Exception):
|
||||||
utils.extract_url([], 'https://example.com')
|
utils.extract_url([], 'https://example.com')
|
||||||
|
|
||||||
def test_html_to_text_invalid(self):
|
|
||||||
_html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
|
||||||
self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
|
|
||||||
|
|
||||||
def test_ecma_unscape(self):
|
def test_ecma_unscape(self):
|
||||||
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
|
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
|
||||||
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
|
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
|
||||||
self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
|
self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
|
||||||
|
|
||||||
|
|
||||||
class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
super().setUp()
|
|
||||||
|
|
||||||
self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access
|
|
||||||
|
|
||||||
def test__init__(self):
|
|
||||||
self.assertEqual(self.html_text_extractor.result, [])
|
|
||||||
|
|
||||||
@parameterized.expand(
|
@parameterized.expand(
|
||||||
[
|
[
|
||||||
('xF', '\x0f'),
|
('Example <span id="42">#2</span>', 'Example #2'),
|
||||||
('XF', '\x0f'),
|
('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
|
||||||
('97', 'a'),
|
(r'regexp: (?<![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
|
||||||
|
(r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
|
||||||
|
(r'> < a', '> < a'),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_handle_charref(self, charref: str, expected: str):
|
def test_html_to_text(self, html_str: str, text_str: str):
|
||||||
self.html_text_extractor.handle_charref(charref)
|
self.assertEqual(utils.html_to_text(html_str), text_str)
|
||||||
self.assertIn(expected, self.html_text_extractor.result)
|
|
||||||
|
|
||||||
def test_handle_entityref(self):
|
def test_html_to_text_with_a_style_span(self):
|
||||||
entity = 'test'
|
html_str = """
|
||||||
self.html_text_extractor.handle_entityref(entity)
|
<a href="/testlink" class="link_access_account">
|
||||||
self.assertIn(entity, self.html_text_extractor.result)
|
<style>
|
||||||
|
.toto {
|
||||||
def test_invalid_html(self):
|
color: red;
|
||||||
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
}
|
||||||
with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access
|
</style>
|
||||||
self.html_text_extractor.feed(text)
|
<span class="toto">
|
||||||
|
<span>
|
||||||
|
<img src="test.jpg" />
|
||||||
|
</span>
|
||||||
|
</span>
|
||||||
|
<span class="titi">
|
||||||
|
Test text
|
||||||
|
</span>
|
||||||
|
<script>value='dummy';</script>
|
||||||
|
</a>
|
||||||
|
"""
|
||||||
|
self.assertIsInstance(utils.html_to_text(html_str), str)
|
||||||
|
self.assertEqual(utils.html_to_text(html_str), "Test text")
|
||||||
|
|
||||||
|
|
||||||
class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring
|
class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue