From ce8929cabe27c7cf0bfb21b47786c7442ffb3712 Mon Sep 17 00:00:00 2001 From: Ivan Gabaldon Date: Sat, 9 Aug 2025 23:03:30 +0200 Subject: [PATCH] [mod] limiter: trusted proxies (#4911) Replaces `x_for` functionality with `trusted_proxies`. This allows defining which IP / ranges to trust extracting the client IP address from X-Forwarded-For and X-Real-IP headers. We don't know if the proxy chain will give us the proper client address (REMOTE_ADDR in the WSGI environment), so we rely on reading the headers of the proxy before SearXNG (if there is one, in that case it must be added to trusted_proxies) hoping it has done the proper checks. In case a proxy in the chain does not check the client address correctly, integrity is compromised and this should be fixed by whoever manages the proxy, not us. Closes: - https://github.com/searxng/searxng/issues/4940 - https://github.com/searxng/searxng/issues/4939 - https://github.com/searxng/searxng/issues/4907 - https://github.com/searxng/searxng/issues/3632 - https://github.com/searxng/searxng/issues/3191 - https://github.com/searxng/searxng/issues/1237 Related: - https://github.com/searxng/searxng-docker/issues/386 - https://github.com/inetol-infrastructure/searxng-container/issues/81 --- searx/botdetection/__init__.py | 23 +-- searx/botdetection/_helpers.py | 99 +++--------- searx/botdetection/config.py | 33 ++-- searx/botdetection/http_accept.py | 5 +- searx/botdetection/http_accept_encoding.py | 5 +- searx/botdetection/http_accept_language.py | 5 +- searx/botdetection/http_connection.py | 5 +- searx/botdetection/http_sec_fetch.py | 4 +- searx/botdetection/http_user_agent.py | 5 +- searx/botdetection/ip_limit.py | 7 +- searx/botdetection/ip_lists.py | 16 +- searx/botdetection/link_token.py | 42 ++--- searx/botdetection/trusted_proxies.py | 175 +++++++++++++++++++++ searx/botdetection/valkeydb.py | 22 +++ searx/compat.py | 35 +++++ searx/flaskfix.py | 5 +- searx/limiter.py | 17 +- searx/limiter.toml | 21 ++- 
searx/plugins/self_info.py | 9 +- searx/plugins/tor_check.py | 4 +- searx/valkeydb.py | 7 +- searx/webapp.py | 3 +- tests/__init__.py | 2 +- tests/unit/test_plugin_self_info.py | 88 +++++++++-- 24 files changed, 453 insertions(+), 184 deletions(-) create mode 100644 searx/botdetection/trusted_proxies.py create mode 100644 searx/botdetection/valkeydb.py diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index 4079d97a9..e686e3de9 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -4,19 +4,22 @@ Implementations used for bot detection. """ +from __future__ import annotations + +__all__ = ["init", "dump_request", "get_network", "too_many_requests", "ProxyFix"] + + +import valkey from ._helpers import dump_request -from ._helpers import get_real_ip from ._helpers import get_network from ._helpers import too_many_requests - -__all__ = ['dump_request', 'get_network', 'get_real_ip', 'too_many_requests'] - -valkey_client = None -cfg = None +from . import config +from . 
import valkeydb +from .trusted_proxies import ProxyFix -def init(_cfg, _valkey_client): - global valkey_client, cfg # pylint: disable=global-statement - valkey_client = _valkey_client - cfg = _cfg +def init(cfg: config.Config, valkey_client: valkey.Valkey | None): + config.set_global_cfg(cfg) + if valkey_client: + valkeydb.set_valkey_client(valkey_client) diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py index 7b57ae694..72af693c1 100644 --- a/searx/botdetection/_helpers.py +++ b/searx/botdetection/_helpers.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=missing-module-docstring, invalid-name from __future__ import annotations +import typing as t + +__all__ = ["log_error_only_once", "dump_request", "get_network", "logger", "too_many_requests"] from ipaddress import ( IPv4Network, @@ -8,20 +11,19 @@ from ipaddress import ( IPv4Address, IPv6Address, ip_network, - ip_address, ) import flask import werkzeug from searx import logger -from searx.extended_types import SXNG_Request -from . import config +if t.TYPE_CHECKING: + from . import config logger = logger.getChild('botdetection') -def dump_request(request: SXNG_Request): +def dump_request(request: flask.Request): return ( request.path + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') @@ -52,86 +54,33 @@ def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkz def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network: - """Returns the (client) network of whether the real_ip is part of.""" + """Returns the (client) network of whether the ``real_ip`` is part of. + The ``ipv4_prefix`` and ``ipv6_prefix`` define the number of leading bits in + an address that are compared to determine whether or not an address is part + of a (client) network. + + .. 
code:: toml + + [botdetection] + + ipv4_prefix = 32 + ipv6_prefix = 48 + + """ + + prefix: int = cfg["botdetection.ipv4_prefix"] if real_ip.version == 6: - prefix = cfg['real_ip.ipv6_prefix'] - else: - prefix = cfg['real_ip.ipv4_prefix'] + prefix: int = cfg["botdetection.ipv6_prefix"] network = ip_network(f"{real_ip}/{prefix}", strict=False) # logger.debug("get_network(): %s", network.compressed) return network -_logged_errors = [] +_logged_errors: list[str] = [] -def _log_error_only_once(err_msg): +def log_error_only_once(err_msg: str): if err_msg not in _logged_errors: logger.error(err_msg) _logged_errors.append(err_msg) - - -def get_real_ip(request: SXNG_Request) -> str: - """Returns real IP of the request. Since not all proxies set all the HTTP - headers and incoming headers can be faked it may happen that the IP cannot - be determined correctly. - - .. sidebar:: :py:obj:`flask.Request.remote_addr` - - SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). - - This function tries to get the remote IP in the order listed below, - additional some tests are done and if inconsistencies or errors are - detected, they are logged. - - The remote IP of the request is taken from (first match): - - - X-Forwarded-For_ header - - `X-real-IP header `__ - - :py:obj:`flask.Request.remote_addr` - - .. _ProxyFix: - https://werkzeug.palletsprojects.com/middleware/proxy_fix/ - - .. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - - """ - - forwarded_for = request.headers.get("X-Forwarded-For") - real_ip = request.headers.get('X-Real-IP') - remote_addr = request.remote_addr - # logger.debug( - # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr - # ) - - if not forwarded_for: - _log_error_only_once("X-Forwarded-For header is not set!") - else: - from . 
import cfg # pylint: disable=import-outside-toplevel, cyclic-import - - forwarded_for = [x.strip() for x in forwarded_for.split(',')] - x_for: int = cfg['real_ip.x_for'] # type: ignore - forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] - - if not real_ip: - _log_error_only_once("X-Real-IP header is not set!") - - if forwarded_for and real_ip and forwarded_for != real_ip: - logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) - - if forwarded_for and remote_addr and forwarded_for != remote_addr: - logger.warning( - "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for - ) - - if real_ip and remote_addr and real_ip != remote_addr: - logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) - - request_ip = ip_address(forwarded_for or real_ip or remote_addr or '0.0.0.0') - if request_ip.version == 6 and request_ip.ipv4_mapped: - request_ip = request_ip.ipv4_mapped - - # logger.debug("get_real_ip() -> %s", request_ip) - return str(request_ip) diff --git a/searx/botdetection/config.py b/searx/botdetection/config.py index 5b73afe1c..6b35df84f 100644 --- a/searx/botdetection/config.py +++ b/searx/botdetection/config.py @@ -7,19 +7,32 @@ structured dictionaries. The configuration schema is defined in a dictionary structure and the configuration data is given in a dictionary structure. 
""" from __future__ import annotations -from typing import Any +import typing import copy -import typing import logging import pathlib from ..compat import tomllib -__all__ = ['Config', 'UNSET', 'SchemaIssue'] +__all__ = ['Config', 'UNSET', 'SchemaIssue', 'set_global_cfg', 'get_global_cfg'] log = logging.getLogger(__name__) +CFG: Config | None = None +"""Global config of the botdetection.""" + + +def set_global_cfg(cfg: Config): + global CFG # pylint: disable=global-statement + CFG = cfg + + +def get_global_cfg() -> Config: + if CFG is None: + raise ValueError("Botdetection's config is not yet initialized.") + return CFG + class FALSE: """Class of ``False`` singleton""" @@ -57,7 +70,7 @@ class Config: UNSET = UNSET @classmethod - def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config: + def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict[str, str]) -> Config: # init schema @@ -80,7 +93,7 @@ class Config: cfg.update(upd_cfg) return cfg - def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]): + def __init__(self, cfg_schema: dict[str, typing.Any], deprecated: dict[str, str]): """Constructor of class Config. :param cfg_schema: Schema of the configuration @@ -93,10 +106,10 @@ class Config: self.deprecated = deprecated self.cfg = copy.deepcopy(cfg_schema) - def __getitem__(self, key: str) -> Any: + def __getitem__(self, key: str) -> typing.Any: return self.get(key) - def validate(self, cfg: dict): + def validate(self, cfg: dict[str, typing.Any]): """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`. 
Validation is done by :py:obj:`validate`.""" @@ -111,7 +124,7 @@ class Config: """Returns default value of field ``name`` in ``self.cfg_schema``.""" return value(name, self.cfg_schema) - def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any: + def get(self, name: str, default: typing.Any = UNSET, replace: bool = True) -> typing.Any: """Returns the value to which ``name`` points in the configuration. If there is no such ``name`` in the config and the ``default`` is @@ -214,8 +227,8 @@ def value(name: str, data_dict: dict): def validate( - schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str] -) -> typing.Tuple[bool, list]: + schema_dict: dict[str, typing.Any], data_dict: dict[str, typing.Any], deprecated: dict[str, str] +) -> tuple[bool, list[str]]: """Deep validation of dictionary in ``data_dict`` against dictionary in ``schema_dict``. Argument deprecated is a dictionary that maps deprecated configuration names to a messages:: diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index f64991d50..4543e7217 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -20,8 +20,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . import config from ._helpers import too_many_requests @@ -29,7 +28,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 0975cc85e..3cc65ee17 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -21,8 +21,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . 
import config from ._helpers import too_many_requests @@ -30,7 +29,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 1287e5453..6e7480728 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -18,8 +18,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . import config from ._helpers import too_many_requests @@ -27,7 +26,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: if request.headers.get('Accept-Language', '').strip() == '': diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index eed15f989..6adcd4b39 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -18,8 +18,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . 
import config from ._helpers import too_many_requests @@ -27,7 +26,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_sec_fetch.py b/searx/botdetection/http_sec_fetch.py index f64ee4b2c..edead3bfa 100644 --- a/searx/botdetection/http_sec_fetch.py +++ b/searx/botdetection/http_sec_fetch.py @@ -32,8 +32,6 @@ import re import flask import werkzeug -from searx.extended_types import SXNG_Request - from . import config from ._helpers import logger @@ -78,7 +76,7 @@ def is_browser_supported(user_agent: str) -> bool: def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 57d5bfee7..9b28660fe 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -20,8 +20,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . import config from ._helpers import too_many_requests @@ -56,7 +55,7 @@ def regexp_user_agent(): def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 93af8b7c5..2b216baf7 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -45,12 +45,11 @@ from ipaddress import ( import flask import werkzeug -from searx.extended_types import SXNG_Request -from searx import valkeydb from searx.valkeylib import incr_sliding_window, drop_counter from . import link_token from . import config +from . 
import valkeydb from ._helpers import ( too_many_requests, logger, @@ -92,12 +91,12 @@ SUSPICIOUS_IP_MAX = 3 def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, ) -> werkzeug.Response | None: # pylint: disable=too-many-return-statements - valkey_client = valkeydb.client() + valkey_client = valkeydb.get_valkey_client() if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) diff --git a/searx/botdetection/ip_lists.py b/searx/botdetection/ip_lists.py index 2ad1c62d0..77628b577 100644 --- a/searx/botdetection/ip_lists.py +++ b/searx/botdetection/ip_lists.py @@ -4,21 +4,22 @@ Method ``ip_lists`` ------------------- -The ``ip_lists`` method implements IP :py:obj:`block- ` and -:py:obj:`pass-lists `. +The ``ip_lists`` method implements :py:obj:`block-list ` and +:py:obj:`pass-list `. .. code:: toml [botdetection.ip_lists] pass_ip = [ - '167.235.158.251', # IPv4 of check.searx.space - '192.168.0.0/16', # IPv4 private network - 'fe80::/10' # IPv6 linklocal + '167.235.158.251', # IPv4 of check.searx.space + '192.168.0.0/16', # IPv4 private network + 'fe80::/10', # IPv6 linklocal ] + block_ip = [ - '93.184.216.34', # IPv4 of example.org - '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class + '93.184.216.34', # IPv4 of example.org + '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class ] """ @@ -72,7 +73,6 @@ def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bo def ip_is_subnet_of_member_in_list( real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config ) -> Tuple[bool, str]: - for net in cfg.get(list_name, default=[]): try: net = ip_network(net, strict=False) diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 600796380..9e815e194 100644 --- a/searx/botdetection/link_token.py 
+++ b/searx/botdetection/link_token.py @@ -43,17 +43,18 @@ from ipaddress import ( import string import random +import flask -from searx import logger -from searx import valkeydb from searx.valkeylib import secret_hash -from searx.extended_types import SXNG_Request from ._helpers import ( get_network, - get_real_ip, + logger, ) +from . import config +from . import valkeydb + TOKEN_LIVE_TIME = 600 """Lifetime (sec) of limiter's CSS token.""" @@ -69,17 +70,14 @@ TOKEN_KEY = 'SearXNG_limiter.token' logger = logger.getChild('botdetection.link_token') -def is_suspicious(network: IPv4Network | IPv6Network, request: SXNG_Request, renew: bool = False): +def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): """Checks whether a valid ping is exists for this (client) network, if not this request is rated as *suspicious*. If a valid ping exists and argument ``renew`` is ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. """ - valkey_client = valkeydb.client() - if not valkey_client: - return False - + valkey_client = valkeydb.get_valkey_client() ping_key = get_ping_key(network, request) if not valkey_client.get(ping_key): logger.info("missing ping (IP: %s) / request: %s", network.compressed, ping_key) @@ -92,28 +90,29 @@ def is_suspicious(network: IPv4Network | IPv6Network, request: SXNG_Request, ren return False -def ping(request: SXNG_Request, token: str): +def ping(request: flask.Request, token: str): """This function is called by a request to URL ``/client.css``. If ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. """ - from . 
import valkey_client, cfg # pylint: disable=import-outside-toplevel, cyclic-import + valkey_client = valkeydb.get_valkey_client() + cfg = config.get_global_cfg() - if not valkey_client: - return if not token_is_valid(token): return - real_ip = ip_address(get_real_ip(request)) + real_ip = ip_address(request.remote_addr) # type: ignore network = get_network(real_ip, cfg) ping_key = get_ping_key(network, request) - logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) + logger.debug( + "store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip.compressed, ping_key + ) valkey_client.set(ping_key, 1, ex=PING_LIVE_TIME) -def get_ping_key(network: IPv4Network | IPv6Network, request: SXNG_Request) -> str: +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: """Generates a hashed key that fits (more or less) to a *WEB-browser session* in a network.""" return ( @@ -134,20 +133,23 @@ def token_is_valid(token) -> bool: def get_token() -> str: """Returns current token. If there is no currently active token a new token - is generated randomly and stored in the valkey DB. + is generated randomly and stored in the Valkey DB. Without a + database connection, string "12345678" is returned. 
- :py:obj:`TOKEN_LIVE_TIME` - :py:obj:`TOKEN_KEY` """ - valkey_client = valkeydb.client() - if not valkey_client: + try: + valkey_client = valkeydb.get_valkey_client() + except ValueError: # This function is also called when limiter is inactive / no valkey DB # (see render function in webapp.py) return '12345678' + token = valkey_client.get(TOKEN_KEY) if token: - token = token.decode('UTF-8') + token = token.decode('UTF-8') # type: ignore else: token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) valkey_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) diff --git a/searx/botdetection/trusted_proxies.py b/searx/botdetection/trusted_proxies.py new file mode 100644 index 000000000..7191f0eb2 --- /dev/null +++ b/searx/botdetection/trusted_proxies.py @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Implementation of a middleware to determine the real IP of an HTTP request +(:py:obj:`flask.request.remote_addr`) behind a proxy chain.""" +# pylint: disable=too-many-branches + +from __future__ import annotations +import typing as t + +from collections import abc +from ipaddress import IPv4Address, IPv6Address, ip_address, ip_network, IPv4Network, IPv6Network +from werkzeug.http import parse_list_header + +from . import config +from ._helpers import log_error_only_once, logger + +if t.TYPE_CHECKING: + from _typeshed.wsgi import StartResponse + from _typeshed.wsgi import WSGIApplication + from _typeshed.wsgi import WSGIEnvironment + + +class ProxyFix: + """A middleware like the ProxyFix_ class, where the `x_for` argument is + replaced by a method that determines the number of trusted proxies via + the `botdetection.trusted_proxies` setting. + + .. sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). 
+ + The remote IP (:py:obj:`flask.Request.remote_addr`) of the request is taken + from (first match): + + - X-Forwarded-For_: If the header is set, the first untrusted IP that comes + before the IPs that are still part of the ``botdetection.trusted_proxies`` + is used. + + - `X-Real-IP `__: + If X-Forwarded-For_ is not set, `X-Real-IP` is used + (``botdetection.trusted_proxies`` is ignored). + + If none of the headers is set, the REMOTE_ADDR_ from the WSGI layer is used. + If (for whatever reason) no IP can be determined, an error message is + displayed and ``100::`` is used instead (:rfc:`6666`). + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + .. _REMOTE_ADDR: + https://wsgi.readthedocs.io/en/latest/proposals-2.0.html#making-some-keys-required + + """ + + def __init__(self, wsgi_app: WSGIApplication) -> None: + self.wsgi_app = wsgi_app + + def trusted_proxies(self) -> list[IPv4Network | IPv6Network]: + cfg = config.get_global_cfg() + proxy_list: list[str] = cfg.get("botdetection.trusted_proxies", default=[]) + return [ip_network(net, strict=False) for net in proxy_list] + + def trusted_remote_addr( + self, + x_forwarded_for: list[IPv4Address | IPv6Address], + trusted_proxies: list[IPv4Network | IPv6Network], + ) -> str: + # always rtl + for addr in reversed(x_forwarded_for): + trust: bool = False + + for net in trusted_proxies: + if addr.version == net.version and addr in net: + logger.debug("trust proxy %s (member of %s)", addr, net) + trust = True + break + + # client address + if not trust: + return addr.compressed + + # fallback to first address + return x_forwarded_for[0].compressed + + def __call__(self, environ: WSGIEnvironment, start_response: StartResponse) -> abc.Iterable[bytes]: + # pylint: disable=too-many-statements + + trusted_proxies = self.trusted_proxies() + + # We do not rely on the REMOTE_ADDR from the WSGI 
environment / the + # variable is first removed from the WSGI environment and explicitly set + # in this function! + + orig_remote_addr: str | None = environ.pop("REMOTE_ADDR") + + # Validate the IPs involved in this game and delete all invalid ones + # from the WSGI environment. + + if orig_remote_addr: + try: + addr = ip_address(orig_remote_addr) + if addr.version == 6 and addr.ipv4_mapped: + addr = addr.ipv4_mapped + orig_remote_addr = addr.compressed + except ValueError as exc: + logger.error("REMOTE_ADDR: %s / discard REMOTE_ADDR from WSGI environment", exc) + orig_remote_addr = None + + x_real_ip: str | None = environ.get("HTTP_X_REAL_IP") + if x_real_ip: + try: + addr = ip_address(x_real_ip) + if addr.version == 6 and addr.ipv4_mapped: + addr = addr.ipv4_mapped + x_real_ip = addr.compressed + except ValueError as exc: + logger.error("X-Real-IP: %s / discard HTTP_X_REAL_IP from WSGI environment", exc) + environ.pop("HTTP_X_REAL_IP") + x_real_ip = None + + x_forwarded_for: list[IPv4Address | IPv6Address] = [] + if environ.get("HTTP_X_FORWARDED_FOR"): + for x_for_ip in parse_list_header(str(environ.get("HTTP_X_FORWARDED_FOR"))): + try: + addr = ip_address(x_for_ip) + except ValueError as exc: + logger.error("X-Forwarded-For: %s / discard HTTP_X_FORWARDED_FOR from WSGI environment", exc) + environ.pop("HTTP_X_FORWARDED_FOR") + x_forwarded_for = [] + break + + if addr.version == 6 and addr.ipv4_mapped: + addr = addr.ipv4_mapped + x_forwarded_for.append(addr) + + # log questionable WSGI environments + + if not x_forwarded_for and not x_real_ip: + log_error_only_once("X-Forwarded-For nor X-Real-IP header is set!") + + if x_forwarded_for and not trusted_proxies: + log_error_only_once("missing botdetection.trusted_proxies config") + # without trusted_proxies, this variable is useless for determining + # the real IP + x_forwarded_for = [] + + # securing the WSGI environment variables that are adjusted + + environ.update({"botdetection.trusted_proxies.orig": 
{"REMOTE_ADDR": orig_remote_addr}}) + + # determine *the real IP* + + if x_forwarded_for: + environ["REMOTE_ADDR"] = self.trusted_remote_addr(x_forwarded_for, trusted_proxies) + + elif x_real_ip: + environ["REMOTE_ADDR"] = x_real_ip + + elif orig_remote_addr: + environ["REMOTE_ADDR"] = orig_remote_addr + + else: + logger.error("No remote IP could be determined, use black-hole address: 100::") + environ["REMOTE_ADDR"] = "100::" + + try: + _ = ip_address(environ["REMOTE_ADDR"]) + except ValueError as exc: + logger.error("REMOTE_ADDR: %s, use black-hole address: 100::", exc) + environ["REMOTE_ADDR"] = "100::" + + logger.debug("final REMOTE_ADDR is: %s", environ["REMOTE_ADDR"]) + return self.wsgi_app(environ, start_response) diff --git a/searx/botdetection/valkeydb.py b/searx/botdetection/valkeydb.py new file mode 100644 index 000000000..3b8699786 --- /dev/null +++ b/searx/botdetection/valkeydb.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Providing a Valkey database for the botdetection methods.""" + +from __future__ import annotations + +import valkey + +__all__ = ["set_valkey_client", "get_valkey_client"] + +CLIENT: valkey.Valkey | None = None +"""Global Valkey DB connection (Valkey client object).""" + + +def set_valkey_client(valkey_client: valkey.Valkey): + global CLIENT # pylint: disable=global-statement + CLIENT = valkey_client + + +def get_valkey_client() -> valkey.Valkey: + if CLIENT is None: + raise ValueError("No connection to the Valkey database has been established.") + return CLIENT diff --git a/searx/compat.py b/searx/compat.py index 035726469..2f45eb0e4 100644 --- a/searx/compat.py +++ b/searx/compat.py @@ -8,6 +8,8 @@ __all__ = [ ] import sys +import warnings + # TOML (lib) compatibility # ------------------------ @@ -16,3 +18,36 @@ if sys.version_info >= (3, 11): import tomllib else: import tomli as tomllib + + +# limiter backward compatibility +# ------------------------------ + +LIMITER_CFG_DEPRECATED = { + "real_ip": 
"limiter: config section 'real_ip' is deprecated", + "real_ip.x_for": "real_ip.x_for has been replaced by botdetection.trusted_proxies", + "real_ip.ipv4_prefix": "real_ip.ipv4_prefix has been replaced by botdetection.ipv4_prefix", + "real_ip.ipv6_prefix": "real_ip.ipv6_prefix has been replaced by botdetection.ipv6_prefix'", +} + + +def limiter_fix_cfg(cfg, cfg_file): + + kwargs = { + "category": DeprecationWarning, + "filename": str(cfg_file), + "lineno": 0, + "module": "searx.limiter", + } + + for opt, msg in LIMITER_CFG_DEPRECATED.items(): + try: + val = cfg.get(opt) + except KeyError: + continue + + warnings.warn_explicit(msg, **kwargs) + if opt == "real_ip.ipv4_prefix": + cfg.set("botdetection.ipv4_prefix", val) + if opt == "real_ip.ipv6_prefix": + cfg.set("botdetection.ipv6_prefix", val) diff --git a/searx/flaskfix.py b/searx/flaskfix.py index f2a54bdfc..4282824a3 100644 --- a/searx/flaskfix.py +++ b/searx/flaskfix.py @@ -3,7 +3,6 @@ from urllib.parse import urlparse -from werkzeug.middleware.proxy_fix import ProxyFix from werkzeug.serving import WSGIRequestHandler from searx import settings @@ -73,5 +72,5 @@ class ReverseProxyPathFix: def patch_application(app): # serve pages with HTTP/1.1 WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server']['http_protocol_version']) - # patch app to handle non root url-s behind proxy & wsgi - app.wsgi_app = ReverseProxyPathFix(ProxyFix(app.wsgi_app)) + # patch app to handle non root url-s behind proxy + app.wsgi_app = ReverseProxyPathFix(app.wsgi_app) diff --git a/searx/limiter.py b/searx/limiter.py index 99bc338d1..2b889157a 100644 --- a/searx/limiter.py +++ b/searx/limiter.py @@ -93,13 +93,14 @@ Implementation """ from __future__ import annotations +from ipaddress import ip_address import sys from pathlib import Path -from ipaddress import ip_address import flask import werkzeug +import searx.compat from searx import ( logger, valkeydb, @@ -116,7 +117,6 @@ from searx.botdetection import ( ip_limit, 
ip_lists, get_network, - get_real_ip, dump_request, ) @@ -124,25 +124,24 @@ from searx.botdetection import ( # coherency, the logger is "limiter" logger = logger.getChild('limiter') -CFG: config.Config = None # type: ignore +CFG: config.Config | None = None # type: ignore _INSTALLED = False LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" """Base configuration (schema) of the botdetection.""" -CFG_DEPRECATED = { - # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." -} - def get_cfg() -> config.Config: + """Returns SearXNG's global limiter configuration.""" global CFG # pylint: disable=global-statement if CFG is None: from . import settings_loader # pylint: disable=import-outside-toplevel cfg_file = (settings_loader.get_user_cfg_folder() or Path("/etc/searxng")) / "limiter.toml" - CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, cfg_file, CFG_DEPRECATED) + CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, cfg_file, searx.compat.LIMITER_CFG_DEPRECATED) + searx.compat.limiter_fix_cfg(CFG, cfg_file) + return CFG @@ -150,7 +149,7 @@ def filter_request(request: SXNG_Request) -> werkzeug.Response | None: # pylint: disable=too-many-return-statements cfg = get_cfg() - real_ip = ip_address(get_real_ip(request)) + real_ip = ip_address(request.remote_addr) network = get_network(real_ip, cfg) if request.path == '/healthz': diff --git a/searx/limiter.toml b/searx/limiter.toml index b64a7bf28..0b40bf81f 100644 --- a/searx/limiter.toml +++ b/searx/limiter.toml @@ -1,8 +1,4 @@ -[real_ip] - -# Number of values to trust for X-Forwarded-For. - -x_for = 1 +[botdetection] # The prefix defines the number of leading bits in an address that are compared # to determine whether or not an address is part of a (client) network. @@ -10,6 +6,19 @@ x_for = 1 ipv4_prefix = 32 ipv6_prefix = 48 +# If the request IP is in trusted_proxies list, the client IP address is +# extracted from the X-Forwarded-For and X-Real-IP headers. 
This should be +# used if SearXNG is behind a reverse proxy or load balancer. + +trusted_proxies = [ + '127.0.0.0/8', + '::1', + # '192.168.0.0/16', + # '172.16.0.0/12', + # '10.0.0.0/8', + # 'fd00::/8', +] + [botdetection.ip_limit] # To get unlimited access in a local network, by default link-local addresses @@ -37,4 +46,4 @@ pass_ip = [ # Activate passlist of (hardcoded) IPs from the SearXNG organization, # e.g. `check.searx.space`. -pass_searxng_org = true \ No newline at end of file +pass_searxng_org = true diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py index ef035e683..1c51049a5 100644 --- a/searx/plugins/self_info.py +++ b/searx/plugins/self_info.py @@ -4,9 +4,10 @@ from __future__ import annotations import typing import re +from ipaddress import ip_address + from flask_babel import gettext -from searx.botdetection._helpers import get_real_ip from searx.result_types import EngineResults from . import Plugin, PluginInfo @@ -48,8 +49,10 @@ class SXNGPlugin(Plugin): if search.search_query.pageno > 1: return results - if self.ip_regex.search(search.search_query.query): - results.add(results.types.Answer(answer=gettext("Your IP is: ") + get_real_ip(request))) + if self.ip_regex.search(search.search_query.query) and request.remote_addr: + results.add( + results.types.Answer(answer=gettext("Your IP is: ") + ip_address(request.remote_addr).compressed) + ) if self.ua_regex.match(search.search_query.query): results.add(results.types.Answer(answer=gettext("Your user-agent is: ") + str(request.user_agent))) diff --git a/searx/plugins/tor_check.py b/searx/plugins/tor_check.py index 3338ff2ed..93506ff5a 100644 --- a/searx/plugins/tor_check.py +++ b/searx/plugins/tor_check.py @@ -5,6 +5,7 @@ user searches for ``tor-check``. It fetches the tor exit node list from user's IP address is in it. 
""" from __future__ import annotations +from ipaddress import ip_address import typing import re @@ -14,7 +15,6 @@ from httpx import HTTPError from searx.network import get from searx.plugins import Plugin, PluginInfo from searx.result_types import EngineResults -from searx.botdetection import get_real_ip if typing.TYPE_CHECKING: from searx.search import SearchWithPlugins @@ -66,7 +66,7 @@ class SXNGPlugin(Plugin): results.add(results.types.Answer(answer=f"{msg} {url_exit_list}")) return results - real_ip = get_real_ip(request) + real_ip = ip_address(address=str(request.remote_addr)).compressed if real_ip in node_list: msg = gettext("You are using Tor and it looks like you have the external IP address") diff --git a/searx/valkeydb.py b/searx/valkeydb.py index 2817c6d0a..3a7be1fd9 100644 --- a/searx/valkeydb.py +++ b/searx/valkeydb.py @@ -17,6 +17,7 @@ A valkey DB connect can be tested by:: >>> """ +from __future__ import annotations import os import pwd @@ -26,12 +27,12 @@ import warnings import valkey from searx import get_setting - -_CLIENT = None +_CLIENT: valkey.Valkey | None = None logger = logging.getLogger(__name__) -def client() -> valkey.Valkey: +def client() -> valkey.Valkey | None: + """Returns SearXNG's global Valkey DB connector (Valkey client object).""" return _CLIENT diff --git a/searx/webapp.py b/searx/webapp.py index 906ec93e4..4179c32b0 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -57,7 +57,7 @@ from searx import ( from searx import infopage from searx import limiter -from searx.botdetection import link_token +from searx.botdetection import link_token, ProxyFix from searx.data import ENGINE_DESCRIPTIONS from searx.result_types import Answer @@ -1391,6 +1391,7 @@ def static_headers(headers: Headers, _path: str, _url: str) -> None: headers[header] = str(value) +app.wsgi_app = ProxyFix(app.wsgi_app) app.wsgi_app = WhiteNoise( app.wsgi_app, root=settings['ui']['static_path'], diff --git a/tests/__init__.py b/tests/__init__.py index 
55a002196..9c176aedf 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -89,4 +89,4 @@ class SearxTestCase(aiounittest.AsyncTestCase): # pylint: disable=attribute-defined-outside-init self.app = searx.webapp.app - self.client = searx.webapp.app.test_client() + self.client = self.app.test_client() diff --git a/tests/unit/test_plugin_self_info.py b/tests/unit/test_plugin_self_info.py index 4a2e6c416..f67a4f25f 100644 --- a/tests/unit/test_plugin_self_info.py +++ b/tests/unit/test_plugin_self_info.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring,disable=missing-class-docstring,invalid-name +# pylint: disable=missing-module-docstring,disable=missing-class-docstring,invalid-name,line-too-long from parameterized.parameterized import parameterized @@ -35,19 +35,85 @@ class PluginIPSelfInfo(SearxTestCase): def test_plugin_store_init(self): self.assertEqual(1, len(self.storage)) - def test_pageno_1_2(self): + def test_IPv4_X_Forwarded_For(self): + headers = {"X-Forwarded-For": "1.2.3.4, 127.0.0.1"} + answer = gettext("Your IP is: ") + "1.2.3.4" - with self.app.test_request_context(): - sxng_request.preferences = self.pref - sxng_request.remote_addr = "127.0.0.1" - sxng_request.headers = {"X-Forwarded-For": "1.2.3.4, 127.0.0.1", "X-Real-IP": "127.0.0.1"} # type: ignore - answer = Answer(answer=gettext("Your IP is: ") + "127.0.0.1") + result = self.client.post("/search", data={"q": "ip"}, headers=headers) + self.assertIn(answer, str(result.data)) + result = self.client.post("/search", data={"q": "ip", "pageno": "2"}, headers=headers) + self.assertNotIn(answer, str(result.data)) - search = do_post_search("ip", self.storage, pageno=1) - self.assertIn(answer, search.result_container.answers) + def test_IPv6_X_Forwarded_For(self): + headers = { + "X-Forwarded-For": "fd0f:a306:f289:0000:0000:0000:ffff:bbbb, ::1, 127.0.0.1", + "X-Real-IP": "fd0f:a306:f289:0000:0000:0000:ffff:aaaa", + } + # value from X-Forwarded-For 
should win + answer = gettext("Your IP is: ") + "fd0f:a306:f289::ffff:bbbb" + result = self.client.post("/search", data={"q": "ip"}, headers=headers) + self.assertIn(answer, str(result.data)) - search = do_post_search("ip", self.storage, pageno=2) - self.assertEqual(list(search.result_container.answers), []) + def test_IPv6_X_Forwarded_For_all_trusted(self): + headers = { + "X-Forwarded-For": "127.0.0.1, 127.0.0.2, 127.0.0.3, ::1", + } + # value from X-Forwarded-For should win + answer = gettext("Your IP is: ") + "127.0.0.1" + result = self.client.post("/search", data={"q": "ip"}, headers=headers) + self.assertIn(answer, str(result.data)) + + def test_IPv6_X_Real_IP(self): + headers = { + "X-Real-IP": "fd0f:a306:f289:0000:0000:0000:ffff:aaaa", + } + # X-Forwarded-For is not set, X-Real-IP should win + answer = gettext("Your IP is: ") + "fd0f:a306:f289::ffff:aaaa" + result = self.client.post("/search", data={"q": "ip"}, headers=headers) + self.assertIn(answer, str(result.data)) + + def test_REMOTE_ADDR_is_invalid(self): + # X-Forwarded-For and X-Real-IP are unset and REMOTE_ADDR is invalid + answer = gettext("Your IP is: ") + "100::" + headers = {} + environ_overrides = {"REMOTE_ADDR": "1.2.3.4.5"} + with self.assertLogs("searx.botdetection", level="ERROR") as ctx: + result = self.client.post("/search", data={"q": "ip"}, headers=headers, environ_overrides=environ_overrides) + self.assertIn(answer, str(result.data)) + self.assertIn( + "ERROR:searx.botdetection:REMOTE_ADDR: '1.2.3.4.5' does not appear to be an IPv4 or IPv6 address / discard REMOTE_ADDR from WSGI environment", + ctx.output, + ) + + def test_X_Real_IP_is_invalid(self): + # when a client fakes an X-Real-IP header with an invalid IP (1.2.3.4.5) in it + answer = gettext("Your IP is: ") + "96.7.128.186" + headers = {"X-Real-IP": "1.2.3.4.5", "X-Forwarded-For": "96.7.128.186, 127.0.0.1"} + environ_overrides = {"REMOTE_ADDR": "127.0.0.1"} + + with self.assertLogs("searx.botdetection", level="ERROR") as ctx: + result
= self.client.post("/search", data={"q": "ip"}, headers=headers, environ_overrides=environ_overrides) + self.assertIn(answer, str(result.data)) + self.assertIn( + "ERROR:searx.botdetection:X-Real-IP: '1.2.3.4.5' does not appear to be an IPv4 or IPv6 address / discard HTTP_X_REAL_IP from WSGI environment", + ctx.output, + ) + + def test_X_Forwarded_For_is_invalid(self): + # when a client fakes an X-Forwarded-For header with an invalid IP + # (1.2.3.4.5) in it and the proxy sets an X-Real-IP + answer = gettext("Your IP is: ") + "96.7.128.186" + headers = { + "X-Forwarded-For": "1.2.3.4, 1.2.3.4.5, 127.0.0.1", + "X-Real-IP": "96.7.128.186", + } + with self.assertLogs("searx.botdetection", level="ERROR") as ctx: + result = self.client.post("/search", data={"q": "ip"}, headers=headers) + self.assertIn(answer, str(result.data)) + self.assertIn( + "ERROR:searx.botdetection:X-Forwarded-For: '1.2.3.4.5' does not appear to be an IPv4 or IPv6 address / discard HTTP_X_FORWARDED_FOR from WSGI environment", + ctx.output, + ) @parameterized.expand( [