mirror of
https://github.com/searxng/searxng.git
synced 2025-09-06 02:08:32 +02:00
[fix] revision of utils.HTMLTextExtractor (#5125)
Related: https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
This commit is contained in:
parent
b606103352
commit
4fb6105d69
2 changed files with 41 additions and 65 deletions
|
@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
|
|||
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
|
||||
|
||||
|
||||
class _HTMLTextExtractorException(Exception):
|
||||
"""Internal exception raised when the HTML is invalid"""
|
||||
|
||||
|
||||
class _HTMLTextExtractor(HTMLParser):
|
||||
class HTMLTextExtractor(HTMLParser):
|
||||
"""Internal class to extract text from HTML"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
|
|||
return
|
||||
|
||||
if tag != self.tags[-1]:
|
||||
raise _HTMLTextExtractorException()
|
||||
self.result.append(f"</{tag}>")
|
||||
return
|
||||
|
||||
self.tags.pop()
|
||||
|
||||
|
@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
|
|||
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
|
||||
'Example'
|
||||
|
||||
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
|
||||
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
|
||||
'regexp: (?<![a-zA-Z]'
|
||||
|
||||
>>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
|
||||
'Lorem ipsum </i>dolor sit amet</p>'
|
||||
|
||||
>>> html_to_text(r'> < a')
|
||||
'> < a'
|
||||
|
||||
"""
|
||||
if not html_str:
|
||||
return ""
|
||||
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
|
||||
html_str = ' '.join(html_str.split())
|
||||
s = _HTMLTextExtractor()
|
||||
s = HTMLTextExtractor()
|
||||
try:
|
||||
s.feed(html_str)
|
||||
s.close()
|
||||
except AssertionError:
|
||||
s = _HTMLTextExtractor()
|
||||
s = HTMLTextExtractor()
|
||||
s.feed(escape(html_str, quote=True))
|
||||
s.close()
|
||||
except _HTMLTextExtractorException:
|
||||
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
|
||||
return s.get_text()
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue