[mod] https rewrite pluginification

2025-07-12 15:59:21 +02:00 · 2015-04-13 00:30:12 +02:00 · 2015-04-13 00:30:12 +02:00 · d2a636f75d
commit d2a636f75d
parent 146928a749
41 changed files with 29 additions and 26 deletions
--- a/searx/plugins/https_rewrite.py
+++ b/searx/plugins/https_rewrite.py
@ -0,0 +1,227 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
+'''
+
+import re
+from urlparse import urlparse
+from lxml import etree
+from os import listdir, environ
+from os.path import isfile, isdir, join
+from searx.plugins import logger
+from flask.ext.babel import gettext
+from searx import searx_dir
+
+
+# Plugin metadata (presumably read by the searx plugin loader -- confirm).
+name = "HTTPS rewrite"
+description = gettext('Rewrite HTTP links to HTTPS if possible')
+default_on = True
+
+# Directory holding the ruleset XML files.  It can be overridden through the
+# SEARX_HTTPS_REWRITE_PATH environment variable; otherwise the rules shipped
+# below the searx directory are used.
+rules_path = environ.get('SEARX_HTTPS_REWRITE_PATH',
+                         join(searx_dir, 'plugins/https_rules'))
+
+logger = logger.getChild("https_rewrite")
+
+# Upstream rules directory of HTTPS Everywhere:
+# https://gitweb.torproject.org/\
+# pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
+
+# Loaded HTTPS rewrite rules: a list of
+# (target_hosts_regex, [(from_regex, to_template), ...], [exclusion_regex, ...])
+# tuples, filled by load_https_rules()
+https_rules = []
+
+
+def load_single_https_ruleset(rules_path):
+    """Load a single ruleset from an XML file.
+
+    :param rules_path: path of the ruleset XML file to parse.
+    :returns: a ``(target_hosts, rules, exclusions)`` tuple, where
+        ``target_hosts`` is a compiled regex built from all ``<target>``
+        hosts, ``rules`` is a list of ``(compiled_from_regex, to_template)``
+        pairs and ``exclusions`` is a list of compiled regexes.  An empty
+        tuple is returned when the file cannot be parsed, when the root
+        element is not ``<ruleset>``, when the ruleset carries a
+        ``default_off`` or ``platform`` attribute, when it declares no
+        usable target host, or when one of its patterns does not compile.
+    """
+    # load and parse xml-file
+    try:
+        tree = etree.parse(rules_path, etree.XMLParser())
+    except Exception as e:
+        # best effort: one broken file must not prevent loading the others
+        logger.warning("could not parse ruleset '%s': %s", rules_path, e)
+        return ()
+
+    # get root node
+    root = tree.getroot()
+
+    # check if root is a node with the name ruleset
+    # TODO improve parsing
+    if root.tag != 'ruleset':
+        return ()
+
+    # check if rule is deactivated by default
+    if root.attrib.get('default_off'):
+        return ()
+
+    # check if rule does only work for specific platforms
+    if root.attrib.get('platform'):
+        return ()
+
+    hosts = []
+    rules = []
+    exclusions = []
+
+    # parse children of the ruleset
+    for child in root:
+        # this child defines a target
+        if child.tag == 'target':
+            host = child.attrib.get('host')
+            if not host:
+                continue
+
+            # escape the literal host name, then turn the '*' wildcard
+            # into its regex equivalent
+            hosts.append(re.escape(host).replace('\\*', '.*'))
+
+        # this child defines a rule
+        elif child.tag == 'rule':
+            rule_from = child.attrib.get('from')
+            rule_to = child.attrib.get('to')
+            if not rule_from or not rule_to:
+                continue
+
+            # javascript replacement strings reference groups as $1,
+            # python expects \1; '$' inside the "from" pattern is an
+            # ordinary anchor in both dialects and is left untouched
+            rule_to = re.sub(r'\$(\d)', r'\\\1', rule_to)
+
+            try:
+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
+            except Exception as e:
+                # pattern is not a valid python regex, skip this rule
+                logger.debug("invalid rule pattern in '%s': %s",
+                             rules_path, e)
+                continue
+
+        # this child defines an exclusion
+        elif child.tag == 'exclusion':
+            pattern = child.attrib.get('pattern')
+            if not pattern:
+                continue
+
+            try:
+                exclusions.append(re.compile(pattern))
+            except Exception as e:
+                # without this exclusion the ruleset would rewrite more
+                # urls than intended, so drop the whole ruleset
+                logger.debug("invalid exclusion pattern in '%s': %s",
+                             rules_path, e)
+                return ()
+
+    # without any host the regex below would be '^()', which matches every
+    # netloc and would shadow all rulesets loaded after this one
+    if not hosts:
+        return ()
+
+    # convert list of possible hosts to a simple regex
+    # TODO compress regex to improve performance
+    # NOTE(review): the pattern is only anchored at the start, so a target
+    # 'example.com' also matches a netloc like 'example.com.other.org'
+    try:
+        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
+    except Exception as e:
+        logger.debug("invalid target hosts in '%s': %s", rules_path, e)
+        return ()
+
+    # return ruleset
+    return (target_hosts, rules, exclusions)
+
+
+def load_https_rules(rules_path):
+    """Load all https rewrite rules found directly inside *rules_path*.
+
+    Every regular file whose name ends in ``.xml`` is handed to
+    ``load_single_https_ruleset``; each non-empty result is appended to the
+    module-level ``https_rules`` list.  When *rules_path* is not a directory
+    an error is logged and nothing is loaded.  After loading, the number of
+    entries in ``https_rules`` is logged.
+
+    NOTE(review): nothing visible in this file calls this function --
+    presumably it is invoked from elsewhere; confirm.
+    """
+    # bail out early when the rule directory is missing
+    if not isdir(rules_path):
+        logger.error("directory not found: '" + rules_path + "'")
+        return
+
+    # collect the xml files stored in the https rule directory
+    ruleset_files = []
+    for entry in listdir(rules_path):
+        candidate = join(rules_path, entry)
+        if isfile(candidate) and entry.endswith('.xml'):
+            ruleset_files.append(candidate)
+
+    # parse each file and keep only the rulesets that could be loaded
+    for ruleset_file in ruleset_files:
+        loaded = load_single_https_ruleset(ruleset_file)
+        if loaded:
+            https_rules.append(loaded)
+
+    logger.info('{n} rules loaded'.format(n=len(https_rules)))
+
+
+def _base_domain(hostname):
+    """Return the last two dot-separated labels of *hostname*, or None."""
+    # TODO, does only work correct with TLD's like
+    #  asdf.com, not for asdf.com.de
+    # TODO, using publicsuffix instead of this rewrite rule
+    if not hostname:
+        return None
+    return '.'.join(hostname.split('.')[-2:])
+
+
+def https_url_rewrite(result):
+    """Rewrite ``result['url']`` using the first matching ruleset.
+
+    Only the first entry of ``https_rules`` whose target regex matches the
+    netloc of ``result['parsed_url']`` is used.  If one of its exclusions
+    matches the url, nothing is rewritten.  Otherwise its rules are applied
+    one after another; a rewritten url is accepted only when its base domain
+    (last two host labels) equals the base domain of the original host.
+
+    :param result: dict providing ``url`` and ``parsed_url``; ``url`` is
+        updated in place.
+    :returns: the same ``result`` object.
+    """
+    netloc = result['parsed_url'].netloc
+
+    for target, rules, exclusions in https_rules:
+        # check if target regex matches the url's netloc
+        if not target.match(netloc):
+            continue
+
+        # exclusion patterns may match anywhere in the url, hence search();
+        # a hit disables the rewrite for this result
+        if any(exclusion.search(result['url']) for exclusion in exclusions):
+            break
+
+        old_domainname = _base_domain(result['parsed_url'].hostname)
+
+        # process rules
+        for rule_from, rule_to in rules:
+            try:
+                new_result_url = rule_from.sub(rule_to, result['url'])
+            except Exception as e:
+                # e.g. the template references a group that does not exist;
+                # give up on the remaining rules of this ruleset
+                logger.debug("https rewrite rule failed: %s", e)
+                break
+
+            # continue if nothing was rewritten
+            if result['url'] == new_result_url:
+                continue
+
+            new_domainname = _base_domain(urlparse(new_result_url).hostname)
+
+            # check if the rewritten domain is the same, to protect against
+            # wrong or malicious rewrite rules; urls without a hostname are
+            # never accepted
+            if old_domainname and old_domainname == new_domainname:
+                # NOTE(review): result['parsed_url'] is left untouched and
+                # keeps describing the original url -- confirm callers
+                # expect that
+                result['url'] = new_result_url
+
+        # target has matched, do not search over the other rules
+        break
+    return result
+
+
+def on_result(request, ctx):
+    """Result hook: try the HTTPS rewrite on plain-http results.
+
+    Reads the result from ``ctx['result']`` and, when the scheme of its
+    ``parsed_url`` is ``http``, passes it to ``https_url_rewrite`` which may
+    modify it in place.  *request* is not used.  Always returns True.
+    """
+    entry = ctx['result']
+    is_plain_http = entry['parsed_url'].scheme == 'http'
+    if is_plain_http:
+        https_url_rewrite(entry)
+    return True