[mod] limiter -> botdetection: modularization and documentation

In order to be able to meet the outstanding requirements, the implementation is
modularized and supplemented with documentation.

This patch does not contain functional change, except it fixes issue #2455

----

Aktivate limiter in the settings.yml and simulate a bot request by::

    curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \
         -H 'Accept: text/html'
         -H 'User-Agent: xyz' \
         -H 'Accept-Encoding: gzip' \
         'http://127.0.0.1:8888/search?q=foo'

In the LOG:

    DEBUG   searx.botdetection.link_token : missing ping for this request: .....

Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two time
before you get a "Too Many Requests" response.

Closes: https://github.com/searxng/searxng/issues/2455
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-05-23 18:16:37 +02:00
parent 5226044c13
commit 1ec325adcc
15 changed files with 541 additions and 161 deletions

View file

@ -0,0 +1,26 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _botdetection src:
Bot detection methods
---------------------
The methods implemented in this python package are use by the :ref:`limiter src`.
"""
import flask
def dump_request(request: flask.Request):
return (
"%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path)
+ " || form: %s" % request.form
+ " || Accept: %s" % request.headers.get('Accept')
+ " || Accept-Language: %s" % request.headers.get('Accept-Language')
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+ " || Content-Type: %s" % request.headers.get('Content-Type')
+ " || Content-Length: %s" % request.headers.get('Content-Length')
+ " || Connection: %s" % request.headers.get('Connection')
+ " || User-Agent: %s" % request.headers.get('User-Agent')
)

View file

@ -0,0 +1,24 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept``
----------------------
The ``http_accept`` method evaluates a request as the request of a bot if the
Accept_ header ..
- did not contain ``text/html``
.. _Accept:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
"""
from typing import Optional, Tuple
import flask
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
if 'text/html' not in request.accept_mimetypes:
return 429, "bot detected, HTTP header Accept did not contain text/html"
return None

View file

@ -0,0 +1,26 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept_encoding``
-------------------------------
The ``http_accept_encoding`` method evaluates a request as the request of a
bot if the Accept-Encoding_ header ..
- did not contain ``gzip`` AND ``deflate`` (if both values are missed)
- did not contain ``text/html``
.. _Accept-Encoding:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
"""
from typing import Optional, Tuple
import flask
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list):
return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"
return None

View file

@ -0,0 +1,23 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept_language``
-------------------------------
The ``http_accept_language`` method evaluates a request as the request of a bot
if the Accept-Language_ header is unset.
.. _Accept-Language:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
"""
from typing import Optional, Tuple
import flask
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
if request.headers.get('Accept-Language', '').strip() == '':
return 429, "bot detected, missing HTTP header Accept-Language"
return None

View file

@ -0,0 +1,23 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_connection``
--------------------------
The ``http_connection`` method evaluates a request as the request of a bot if
the Connection_ header is set to ``close``.
.. _Connection:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
"""
from typing import Optional, Tuple
import flask
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
if request.headers.get('Connection', '').strip() == 'close':
return 429, "bot detected, HTTP header 'Connection=close'"
return None

View file

@ -0,0 +1,54 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_user_agent``
--------------------------
The ``http_user_agent`` method evaluates a request as the request of a bot if
the User-Agent_ header is unset or matches the regular expression
:py:obj:`USER_AGENT`.
.. _User-Agent:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
"""
from typing import Optional, Tuple
import re
import flask
USER_AGENT = (
r'('
+ r'unknown'
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+ r'|ZmEu|BLEXBot|bitlybot'
# unmaintained Farside instances
+ r'|'
+ re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
# other bots and client to block
+ '|.*PetalBot.*'
+ r')'
)
"""Regular expression that matches to User-Agent_ from known *bots*"""
_regexp = None
def regexp_user_agent():
global _regexp # pylint: disable=global-statement
if not _regexp:
_regexp = re.compile(USER_AGENT)
return _regexp
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
user_agent = request.headers.get('User-Agent', 'unknown')
if regexp_user_agent().match(user_agent):
return (
429,
f"bot detected, HTTP header User-Agent: {user_agent}",
)
return None

View file

@ -0,0 +1,90 @@
"""
Method ``ip_limit``
-------------------
The ``ip_limit`` method counts request from an IP in *sliding windows*. If
there are to many requests in a sliding window, the request is evaluated as a
bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_
header. To take privacy only the hash value of an IP is stored in the redis DB
and at least for a maximum of 10 minutes.
The :py:obj:`link_token` method is used to investigate whether a request is
*suspicious*. If the :py:obj:`link_token` method is activated and a request is
*suspicious* the request rates are reduced:
- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
from typing import Optional, Tuple
import flask
from searx import redisdb
from searx import logger
from searx.redislib import incr_sliding_window
from . import link_token
logger = logger.getChild('botdetection.ip_limit')
BURST_WINDOW = 20
"""Time (sec) before sliding window for *burst* requests expires."""
BURST_MAX = 15
"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""
BURST_MAX_SUSPICIOUS = 2
"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""
LONG_WINDOW = 600
"""Time (sec) before the longer sliding window expires."""
LONG_MAX = 150
"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""
LONG_MAX_SUSPICIOUS = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""
API_WONDOW = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""
API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WONDOW`"""
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
redis_client = redisdb.client()
x_forwarded_for = request.headers.get('X-Forwarded-For', '')
if not x_forwarded_for:
logger.error("missing HTTP header X-Forwarded-For")
if request.args.get('format', 'html') != 'html':
c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW)
if c > API_MAX:
return 429, "BLOCK %s: API limit exceeded"
suspicious = link_token.is_suspicious(request)
if suspicious:
c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
if c > BURST_MAX_SUSPICIOUS:
return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS"
c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
if c > LONG_MAX_SUSPICIOUS:
return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS"
else:
c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
if c > BURST_MAX:
return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX"
c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
if c > LONG_MAX:
return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX"
return None

View file

@ -0,0 +1,79 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _limiter src:
Limiter
=======
.. sidebar:: info
The limiter requires a :ref:`Redis <settings redis>` database.
Bot protection / IP rate limitation. The intention of rate limitation is to
limit suspicious requests from an IP. The motivation behind this is the fact
that SearXNG passes through requests from bots and is thus classified as a bot
itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked
by the search engine (the origin) in some other way.
To avoid blocking, the requests from bots to SearXNG must also be blocked, this
is the task of the limiter. To perform this task, the limiter uses the methods
from the :py:obj:`searx.botdetection`.
To enable the limiter activate:
.. code:: yaml
server:
...
limiter: true # rate limit the number of request on the instance, block some bots
and set the redis-url connection. Check the value, it depends on your redis DB
(see :ref:`settings redis`), by example:
.. code:: yaml
redis:
url: unix:///usr/local/searxng-redis/run/redis.sock?db=0
"""
from typing import Optional, Tuple
import flask
from searx.botdetection import (
http_accept,
http_accept_encoding,
http_accept_language,
http_connection,
http_user_agent,
ip_limit,
)
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
if request.path == '/healthz':
return None
for func in [
http_user_agent,
]:
val = func.filter_request(request)
if val is not None:
return val
if request.path == '/search':
for func in [
http_accept,
http_accept_encoding,
http_accept_language,
http_connection,
http_user_agent,
ip_limit,
]:
val = func.filter_request(request)
if val is not None:
return val
return None

View file

@ -0,0 +1,126 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``link_token``
---------------------
The ``link_token`` method evaluates a request as :py:obj:`suspicious
<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
client. By adding a random component (the token) in the URL a bot can not send
a ping by request a static URL.
.. note::
This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
To get in use of this method a flask URL route needs to be added:
.. code:: python
@app.route('/client<token>.css', methods=['GET', 'POST'])
def client_token(token=None):
link_token.ping(request, token)
return Response('', mimetype='text/css')
And in the HTML template from flask a stylesheet link is needed (the value of
``link_token`` comes from :py:obj:`get_token`):
.. code:: html
<link rel="stylesheet"
href="{{ url_for('client_token', token=link_token) }}"
type="text/css" />
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
import string
import random
import flask
from searx import logger
from searx import redisdb
from searx.redislib import secret_hash
TOKEN_LIVE_TIME = 600
"""Livetime (sec) of limiter's CSS token."""
PING_KEY = 'SearXNG_limiter.ping'
TOKEN_KEY = 'SearXNG_limiter.token'
logger = logger.getChild('botdetection.link_token')
def is_suspicious(request: flask.Request):
"""Checks if there is a valid ping for this request, if not this request is
rated as *suspicious*"""
redis_client = redisdb.client()
if not redis_client:
return False
ping_key = get_ping_key(request)
if not redis_client.get(ping_key):
logger.warning(
"missing ping (IP: %s) / request: %s",
request.headers.get('X-Forwarded-For', ''),
ping_key,
)
return True
logger.debug("found ping for this request: %s", ping_key)
return False
def ping(request: flask.Request, token: str):
"""This function is called by a request to URL ``/client<token>.css``"""
redis_client = redisdb.client()
if not redis_client:
return
if not token_is_valid(token):
return
ping_key = get_ping_key(request)
logger.debug("store ping for: %s", ping_key)
redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME)
def get_ping_key(request: flask.Request):
"""Generates a hashed key that fits (more or less) to a request. At least
X-Forwarded-For_ is needed to be able to assign the request to an IP.
"""
return secret_hash(
PING_KEY
+ request.headers.get('X-Forwarded-For', '')
+ request.headers.get('Accept-Language', '')
+ request.headers.get('User-Agent', '')
)
def token_is_valid(token) -> bool:
valid = token == get_token()
logger.debug("token is valid --> %s", valid)
return valid
def get_token() -> str:
"""Returns current token. If there is no currently active token a new token
is generated randomly and stored in the redis DB.
- :py:obj:`TOKEN_LIVE_TIME`
- :py:obj:`TOKEN_KEY`
"""
redis_client = redisdb.client()
if not redis_client:
# This function is also called when limiter is inactive / no redis DB
# (see render function in webapp.py)
return '12345678'
token = redis_client.get(TOKEN_KEY)
if token:
token = token.decode('UTF-8')
else:
token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))
redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME)
return token