[mod] limiter -> botdetection: modularization and documentation

In order to be able to meet the outstanding requirements, the implementation is
modularized and supplemented with documentation.

This patch contains no functional changes, except that it fixes issue #2455.

----

Activate the limiter in ``settings.yml`` and simulate a bot request with::

    curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \
         -H 'Accept: text/html' \
         -H 'User-Agent: xyz' \
         -H 'Accept-Encoding: gzip' \
         'http://127.0.0.1:8888/search?q=foo'

In the log::

    DEBUG   searx.botdetection.link_token : missing ping for this request: .....

Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two times
before you get a "Too Many Requests" response.

Closes: https://github.com/searxng/searxng/issues/2455
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-05-23 18:16:37 +02:00
parent 5226044c13
commit 1ec325adcc
15 changed files with 541 additions and 161 deletions

View file

@ -1,165 +1,42 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pyright: basic
"""Some bot protection / rate limitation, see :ref:`limiter src`.

To monitor rate limits and protect privacy, the IP addresses are stored
as hashes so the limiter plugin knows who to block.  A redis database is
needed to store the hash values.

Enable the plugin in ``settings.yml``:

- ``server.limiter: true``
- ``redis.url: ...`` check the value, see :ref:`settings redis`
"""
import re
import string
import random
from flask import request
import flask
from searx import redisdb
from searx.plugins import logger
from searx.redislib import incr_sliding_window, secret_hash
from searx.botdetection import limiter
from searx.botdetection import dump_request
# Plugin metadata, defined as module-level attributes of this plugin module.
name = "Request limiter"
description = "Limit the number of request"
default_on = False
preference_section = 'service'
# Rebind ``logger`` to a child (named 'limiter') of the logger imported from
# ``searx.plugins``.
logger = logger.getChild('limiter')
# User-Agent patterns rejected by ``is_accepted_request``.  The pattern is
# applied with ``.match()``, i.e. it is anchored at the start of the header
# value; only the PetalBot alternative carries a leading ``.*``.
block_user_agent = re.compile(
r'('
+ r'unknown'
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+ r'|ZmEu|BLEXBot|bitlybot'
# unmaintained Farside instances
+ r'|'
+ re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
+ '|.*PetalBot.*'
+ r')'
)
# Prefix of the per-client ping marker: the User-Agent and X-Forwarded-For
# header values are appended and the result is passed through ``secret_hash``
# before it is used as a redis key.
PING_KEY = 'SearXNG_limiter.ping'
# Redis key under which the current token is stored (see ``get_token``).
TOKEN_KEY = 'SearXNG_limiter.token'
def ping():
    """Remember that the client of the current request has sent a ping.

    A marker with a lifetime of 600 seconds is written to redis.  Its key is
    the secret hash of ``PING_KEY`` followed by the ``User-Agent`` header
    (``'unknown'`` when absent) and the ``X-Forwarded-For`` header (empty
    string when absent).
    """
    store = redisdb.client()
    headers = request.headers
    marker = PING_KEY + headers.get('User-Agent', 'unknown') + headers.get('X-Forwarded-For', '')
    store.set(secret_hash(marker), 1, ex=600)
def get_token():
    """Return the current limiter token.

    The token is read from redis (key ``TOKEN_KEY``).  If no token is stored,
    a new one made of 8 lowercase letters / digits is generated, stored with a
    lifetime of 600 seconds and returned.

    When no redis client is available the constant ``'12345678'`` is returned.

    :returns: the token as ``str``
    """
    # ``secrets`` instead of ``random``: the token must not be predictable.
    import secrets  # pylint: disable=import-outside-toplevel

    redis_client = redisdb.client()
    if not redis_client:
        # This function is also called when limiter is inactive / no redis DB
        # (see render function in webapp.py)
        return '12345678'

    stored = redis_client.get(TOKEN_KEY)
    if stored:
        # redis hands the value back as bytes
        return stored.decode('UTF-8')

    alphabet = string.ascii_lowercase + string.digits
    token = ''.join(secrets.choice(alphabet) for _ in range(8))
    redis_client.set(TOKEN_KEY, token, ex=600)
    return token
def token_is_valid(token):
    """Tell whether *token* equals the token returned by ``get_token``.

    The result of the comparison is written to the debug log and returned.
    """
    expected = get_token()
    is_match = token == expected
    logger.debug("token is valid --> %s", is_match)
    return is_match
def is_accepted_request() -> bool:
    """Decide whether the current flask request is accepted.

    - ``/healthz`` is always accepted.
    - Requests whose ``User-Agent`` matches ``block_user_agent`` are rejected.
    - For ``/search`` the request counters (sliding windows of 20 seconds and
      10 minutes, keyed by ``X-Forwarded-For``) are incremented and compared
      with their limits; the limits are higher when a ping marker exists for
      this client.  Afterwards some HTTP headers are inspected and, for
      formats other than ``html``, an hourly API counter is applied.

    :returns: ``True`` if the request is accepted, ``False`` if it has to be
        blocked.
    """
    # pylint: disable=too-many-return-statements
    # NOTE(review): the nesting below is reconstructed; the header checks are
    # assumed to apply to '/search' only -- confirm against the original file.
    redis_client = redisdb.client()
    user_agent = request.headers.get('User-Agent', 'unknown')
    x_forwarded_for = request.headers.get('X-Forwarded-For', '')

    if request.path == '/healthz':
        return True

    if block_user_agent.match(user_agent):
        logger.debug(
            "BLOCK %s: %s --> detected User-Agent: %s", x_forwarded_for, request.path, user_agent
        )
        return False

    if request.path == '/search':

        c_burst_max = 2
        c_10min_max = 10

        # clients that sent a ping get more generous limits
        ping_key = PING_KEY + user_agent + x_forwarded_for
        if redis_client.get(secret_hash(ping_key)):
            logger.debug('got a ping')
            c_burst_max = 15
            c_10min_max = 150
        else:
            logger.debug('missing a ping')

        c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
        c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
        if c_burst > c_burst_max or c_10min > c_10min_max:
            logger.debug("BLOCK %s: too many request", x_forwarded_for)
            return False

        # Compare the stripped header itself with the empty string; comparing
        # its ``len()`` with '' is always False and never blocked anything.
        if request.headers.get('Accept-Language', '').strip() == '':
            logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for)
            return False

        if request.headers.get('Connection') == 'close':
            logger.debug("BLOCK %s: got Connection=close", x_forwarded_for)
            return False

        accept_encoding_list = [enc.strip() for enc in request.headers.get('Accept-Encoding', '').split(',')]
        if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
            logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for)
            return False

        if 'text/html' not in request.accept_mimetypes:
            # this check is about the Accept header, not Accept-Encoding
            logger.debug("BLOCK %s: Accept misses text/html", x_forwarded_for)
            return False

        if request.args.get('format', 'html') != 'html':
            c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
            if c > 4:
                logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for)
                return False

    logger.debug(
        "OK %s: '%s'"
        " || form: %s"
        " || Accept: %s"
        " || Accept-Language: %s"
        " || Accept-Encoding: %s"
        " || Content-Type: %s"
        " || Content-Length: %s"
        " || Connection: %s"
        " || User-Agent: %s",
        x_forwarded_for,
        request.path,
        request.form,
        request.headers.get('Accept', ''),
        request.headers.get('Accept-Language', ''),
        request.headers.get('Accept-Encoding', ''),
        request.headers.get('Content-Type', ''),
        request.headers.get('Content-Length', ''),
        request.headers.get('Connection', ''),
        user_agent,
    )
    return True
def pre_request():
    """See :ref:`flask.Flask.before_request`

    The current request is passed to ``limiter.filter_request``.  If that
    returns a ``(http_status, msg)`` tuple, the block is logged as an error
    and ``('Too Many Requests', http_status)`` is returned; otherwise the
    request is dumped to the debug log and ``None`` is returned.
    """
    val = limiter.filter_request(flask.request)
    if val is not None:
        http_status, msg = val
        client_ip = flask.request.headers.get('X-Forwarded-For', '<unknown>')
        logger.error("BLOCK (IP %s): %s", client_ip, msg)
        return 'Too Many Requests', http_status

    logger.debug("OK: %s", dump_request(flask.request))
    return None
def init(app: flask.Flask, settings) -> bool:
    """Install the limiter as ``before_request`` handler of *app*.

    :param app: the application ``pre_request`` is registered on
    :param settings: mapping, ``settings['server']['limiter']`` switches the
        limiter on or off
    :returns: ``False`` if the limiter is switched off or no redis client is
        available (the latter is logged as an error), ``True`` once
        ``pre_request`` has been registered.
    """
    if not settings['server']['limiter']:
        return False
    if not redisdb.client():
        logger.error("The limiter requires Redis")
        return False
    app.before_request(pre_request)
    return True