mirror of
https://github.com/searxng/searxng.git
synced 2025-07-12 15:59:21 +02:00
[mod] https rewrite pluginification
This commit is contained in:
parent
146928a749
commit
d2a636f75d
41 changed files with 29 additions and 26 deletions
227
searx/plugins/https_rewrite.py
Normal file
227
searx/plugins/https_rewrite.py
Normal file
|
@ -0,0 +1,227 @@
|
|||
'''
|
||||
searx is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
searx is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
||||
|
||||
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
|
||||
'''
|
||||
|
||||
import re
|
||||
from urlparse import urlparse
|
||||
from lxml import etree
|
||||
from os import listdir, environ
|
||||
from os.path import isfile, isdir, join
|
||||
from searx.plugins import logger
|
||||
from flask.ext.babel import gettext
|
||||
from searx import searx_dir
|
||||
|
||||
|
||||
# Plugin metadata.
name = "HTTPS rewrite"
description = gettext('Rewrite HTTP links to HTTPS if possible')
default_on = True

# Directory containing the XML rulesets.  It can be overridden with the
# SEARX_HTTPS_REWRITE_PATH environment variable; the value must be read from
# the same variable that is tested, otherwise setting it raises a KeyError.
if 'SEARX_HTTPS_REWRITE_PATH' in environ:
    rules_path = environ['SEARX_HTTPS_REWRITE_PATH']
else:
    rules_path = join(searx_dir, 'plugins/https_rules')

# Plugin specific child of the shared plugin logger.
logger = logger.getChild("https_rewrite")

# https://gitweb.torproject.org/\
# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules

# HTTPS rewrite rules: list of (target_hosts, rules, exclusions) tuples,
# filled by load_https_rules()
https_rules = []
|
||||
|
||||
|
||||
# load single ruleset from a xml file
|
||||
# matches JavaScript style group references ("$1") in a replacement string
_JS_GROUP_REFERENCE = re.compile(r'\$(\d+)')


def _js_replacement_to_python(replacement):
    # convert "$1" group references into Python's unambiguous "\g<1>" form
    return _JS_GROUP_REFERENCE.sub(r'\\g<\1>', replacement)


def load_single_https_ruleset(rules_path):
    """Load a single HTTPS rewrite ruleset from an XML file.

    The file is expected to contain a ``ruleset`` root element with
    ``target`` (attribute ``host``), ``rule`` (attributes ``from`` and
    ``to``) and ``exclusion`` (attribute ``pattern``) children.

    :param rules_path: path of the XML file to load.
    :returns: a tuple ``(target_hosts, rules, exclusions)`` where
        ``target_hosts`` is a compiled regex matching the hosts the ruleset
        applies to, ``rules`` is a list of ``(compiled_regex, replacement)``
        pairs and ``exclusions`` is a list of compiled regexes.  An empty
        tuple is returned when the file cannot be parsed, the root element
        is not ``ruleset``, the ruleset is marked ``default_off`` or
        ``platform`` specific, it declares no usable target, or one of its
        exclusion / target patterns cannot be compiled.
    """
    # load and parse xml-file
    try:
        tree = etree.parse(rules_path, etree.XMLParser())
    except (etree.LxmlError, IOError, OSError) as e:
        logger.warning("unable to parse ruleset '%s': %s", rules_path, e)
        return ()

    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if rule does only work for specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse children of the ruleset
    for child in root:
        # this child defines a target
        if child.tag == 'target':
            host = child.attrib.get('host')
            if not host:
                continue

            # convert host-rule to valid regex: dots are literal,
            # '*' is a wildcard
            hosts.append(host.replace('.', r'\.').replace('*', '.*'))

        # this child defines a rule
        elif child.tag == 'rule':
            rule_from = child.attrib.get('from')
            rule_to = child.attrib.get('to')
            if not rule_from or not rule_to:
                continue

            # The 'from' pattern is used as-is: rewriting its '$' characters
            # would destroy end-of-string anchors.  Only the replacement
            # uses JavaScript group references that Python does not know.
            try:
                rule_rgx = re.compile(rule_from, re.I | re.U)
            except re.error as e:
                logger.warning("skipping invalid rule '%s' in '%s': %s",
                               rule_from, rules_path, e)
                continue

            rules.append((rule_rgx, _js_replacement_to_python(rule_to)))

        # this child defines an exclusion
        elif child.tag == 'exclusion':
            pattern = child.attrib.get('pattern')
            if not pattern:
                continue

            try:
                exclusions.append(re.compile(pattern))
            except re.error as e:
                # An exclusion that cannot be compiled cannot be honoured,
                # so drop the whole ruleset instead of rewriting URLs it
                # was meant to protect.
                logger.warning("invalid exclusion '%s' in '%s': %s",
                               pattern, rules_path, e)
                return ()

    # without any target the host regex would be '^()', which matches
    # every host
    if not hosts:
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error as e:
        logger.warning("invalid target host in '%s': %s", rules_path, e)
        return ()

    return (target_hosts, rules, exclusions)
|
||||
|
||||
|
||||
# load all https rewrite rules
|
||||
def load_https_rules(rules_path):
    """Load all XML rulesets of a directory into ``https_rules``.

    Every regular file in *rules_path* whose name ends with ``.xml`` is
    passed to ``load_single_https_ruleset``; each non-empty result is
    appended to the module level ``https_rules`` list.

    :param rules_path: directory containing the XML ruleset files.
    :returns: ``None``.  If the directory does not exist an error is logged
        and nothing is loaded.
    """
    # check if directory exists
    if not isdir(rules_path):
        logger.error("directory not found: '%s'", rules_path)
        return

    # listdir() returns names in arbitrary order; sort them so that the
    # order of https_rules (and therefore which ruleset is tried first)
    # is the same on every run
    for filename in sorted(listdir(rules_path)):
        ruleset_file = join(rules_path, filename)
        if not filename.endswith('.xml') or not isfile(ruleset_file):
            continue

        # calculate rewrite-rules, skip files that yield no ruleset
        ruleset = load_single_https_ruleset(ruleset_file)
        if ruleset:
            https_rules.append(ruleset)

    logger.info('%d rules loaded', len(https_rules))
|
||||
|
||||
|
||||
def https_url_rewrite(result):
    """Rewrite ``result['url']`` using the loaded HTTPS rewrite rules.

    Only the first ruleset in ``https_rules`` whose target regex matches the
    network location of ``result['parsed_url']`` is used.  If one of its
    exclusions matches the URL nothing is rewritten.  Otherwise each rule is
    applied in turn; a rewritten URL is accepted only when the last two
    labels of its hostname equal those of the original hostname.

    :param result: mapping with the keys ``'url'`` (string) and
        ``'parsed_url'`` (object exposing ``netloc`` and ``hostname``);
        ``result['url']`` is modified in place.
    :returns: the same *result* object.
    """
    # NOTE(review): result['parsed_url'] is not refreshed after a rewrite and
    # keeps describing the original URL -- confirm callers expect that.
    for target, rules, exclusions in https_rules:
        # check if target regex match with url
        if not target.match(result['parsed_url'].netloc):
            continue

        # skip https rewrite if any exclusion matches the url
        if any(exclusion.match(result['url']) for exclusion in exclusions):
            break

        # process rules
        for rule_rgx, replacement in rules:
            try:
                new_result_url = rule_rgx.sub(replacement, result['url'])
            except (re.error, IndexError):
                # broken replacement template: give up on this ruleset
                break

            # continue if nothing was rewritten
            if new_result_url == result['url']:
                continue

            old_hostname = result['parsed_url'].hostname
            new_hostname = urlparse(new_result_url).hostname

            # a URL without hostname cannot be verified, reject the rewrite
            if not old_hostname or not new_hostname:
                continue

            # get domainname from result
            # TODO, does only work correct with TLD's like
            #  asdf.com, not for asdf.com.de
            # TODO, using publicsuffix instead of this rewrite rule
            old_result_domainname = '.'.join(old_hostname.split('.')[-2:])
            new_result_domainname = '.'.join(new_hostname.split('.')[-2:])

            # check if rewritten hostname is the same,
            # to protect against wrong or malicious rewrite rules
            if old_result_domainname == new_result_domainname:
                result['url'] = new_result_url

        # target has matched, do not search over the other rules
        break
    return result
|
||||
|
||||
|
||||
def on_result(request, ctx):
    """Plugin hook: try an HTTPS rewrite for plain ``http`` results.

    :param request: not used by this plugin.
    :param ctx: mapping whose ``'result'`` entry is the result to process.
    :returns: always ``True`` (presumably tells the plugin framework to keep
        the result -- confirm against the caller).
    """
    result = ctx['result']

    # results without a parsed URL cannot be rewritten, leave them alone
    try:
        parsed_url = result['parsed_url']
    except KeyError:
        return True

    if parsed_url.scheme == 'http':
        https_url_rewrite(result)
    return True
|
Loading…
Add table
Add a link
Reference in a new issue