[fix] highlight only non-HTML content

This commit is contained in:
asciimoo 2014-01-10 23:38:08 +01:00
parent 04c408389d
commit 7b4ec5c5e9
3 changed files with 35 additions and 28 deletions

View file

@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
import csv
import codecs
import cStringIO
import re
def highlight_content(content, query):
    """Wrap occurrences of ``query`` inside ``content`` in ``<b>`` tags.

    Matching is case-insensitive and Unicode-aware.  When the whole query
    occurs in the content it is highlighted as a single phrase; otherwise
    each whitespace-separated chunk of the query is highlighted on its own.

    :param content: text to highlight; may be ``None`` or empty.
    :param query: search query, either a UTF-8 encoded byte string or
        already-decoded text.
    :returns: ``None`` when ``content`` is falsy; ``content`` unchanged when
        it contains a ``<`` character or when the query has no
        non-whitespace characters; otherwise the highlighted text.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content

    # Decode only raw byte strings so that already-decoded text is accepted
    # as well (text objects have no ``decode`` method on Python 3).
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # A blank query would yield the empty pattern ``()``, which matches at
    # every position and would insert ``<b></b>`` between all characters.
    if not query.strip():
        return content

    if query.lower() in content.lower():
        # The full query is present: highlight it as one phrase.
        query_regex = u'({0})'.format(re.escape(query))
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                # A single character only counts when it is surrounded by
                # non-word characters, so letters inside words are not
                # highlighted.  NOTE: the surrounding non-word characters
                # are part of the match and end up inside the <b> tag.
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(re.escape(chunk))
        query_regex = u'({0})'.format(u'|'.join(regex_parts))

    return re.sub(query_regex, r'<b>\1</b>', content, flags=re.I | re.U)
class HTMLTextExtractor(HTMLParser):
def __init__(self):