[fix] revision of utils.HTMLTextExtractor (#5125)

Related:

- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
This commit is contained in:
Markus Heiser 2025-08-18 16:30:51 +02:00 committed by GitHub
parent b606103352
commit 4fb6105d69
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 41 additions and 65 deletions

View file

@@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
class _HTMLTextExtractorException(Exception):
"""Internal exception raised when the HTML is invalid"""
class _HTMLTextExtractor(HTMLParser):
class HTMLTextExtractor(HTMLParser):
"""Internal class to extract text from HTML"""
def __init__(self):
@@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
return
if tag != self.tags[-1]:
raise _HTMLTextExtractorException()
self.result.append(f"</{tag}>")
return
self.tags.pop()
@@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example'
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
>>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
'regexp: (?<![a-zA-Z]'
>>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
'Lorem ipsum </i>dolor sit amet</p>'
>>> html_to_text(r'&#x3e &#x3c &#97')
'> < a'
"""
if not html_str:
return ""
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor()
s = HTMLTextExtractor()
try:
s.feed(html_str)
s.close()
except AssertionError:
s = _HTMLTextExtractor()
s = HTMLTextExtractor()
s.feed(escape(html_str, quote=True))
s.close()
except _HTMLTextExtractorException:
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
return s.get_text()