mirror of
https://github.com/searxng/searxng.git
synced 2025-07-13 00:09:18 +02:00
[fix] highlighting only html
This commit is contained in:
parent
04c408389d
commit
7b4ec5c5e9
3 changed files with 35 additions and 28 deletions
|
@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
|
|||
import csv
|
||||
import codecs
|
||||
import cStringIO
|
||||
import re
|
||||
|
||||
def highlight_content(content, query):
    """Wrap occurrences of the search query in ``content`` with ``<b>`` tags.

    If the whole query occurs in ``content`` (case-insensitively), every
    occurrence of the whole query is highlighted.  Otherwise the query is
    split on whitespace and each chunk is highlighted on its own; a
    single-character chunk is only matched when it is surrounded by
    non-word characters, and those surrounding characters end up inside
    the ``<b>`` tag as well.

    :param content: text to highlight.
    :param query: search query, either a UTF-8 encoded byte string or an
        already-decoded text string.
    :returns: ``None`` when ``content`` is empty, ``content`` unchanged when
        it contains a ``<`` (treated as HTML) or when the query yields
        nothing to search for, otherwise the highlighted text.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Only byte strings need decoding; calling .decode() on text fails on
    # Python 3 and triggers an implicit ASCII encode on Python 2.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    if query and content.lower().find(query.lower()) > -1:
        # The full query is present: highlight it as a single unit.
        regex_parts = [re.escape(query)]
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                # A lone character would match inside almost every word, so
                # require non-word characters on both sides.
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))

    # Without any part the pattern would be "()", which matches the empty
    # string everywhere and would inject "<b></b>" between all characters.
    if not regex_parts:
        return content

    query_regex = u'({0})'.format('|'.join(regex_parts))
    return re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
|
||||
|
||||
class HTMLTextExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue