[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception

Close #2188
This commit is contained in:
Alexandre Flament 2020-09-11 10:23:56 +02:00
parent ae07f4a211
commit 6deb85072a
2 changed files with 18 additions and 2 deletions

View file

@ -77,6 +77,10 @@ def highlight_content(content, query):
return content
class HTMLTextExtractorException(Exception):
    """Signal that HTMLTextExtractor was fed malformed HTML.

    Raised by HTMLTextExtractor when a tag being closed differs from the
    last entry in its ``tags`` stack.  html_to_text() catches this
    exception, logs the offending markup at debug level, and still returns
    whatever the extractor's get_text() yields.
    """
    pass
class HTMLTextExtractor(HTMLParser):
def __init__(self):
@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
return
if tag != self.tags[-1]:
raise Exception("invalid html")
raise HTMLTextExtractorException()
self.tags.pop()
@ -128,7 +132,10 @@ def html_to_text(html):
html = html.replace('\n', ' ')
html = ' '.join(html.split())
s = HTMLTextExtractor()
s.feed(html)
try:
s.feed(html)
except HTMLTextExtractorException:
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
return s.get_text()