From c2bf60455abc2cfeb9ebe95b070d2961250a795f Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 17 May 2025 18:44:43 +0200
Subject: [PATCH 1/2] [enh] network: add impersonate parameter

see
* https://github.com/lexiforest/curl_cffi
* https://github.com/vgavro/httpx-curl-cffi
---
 requirements.txt         |  1 +
 searx/network/client.py  | 38 ++++++++++++++++++++++++++++++++++++--
 searx/network/network.py |  7 ++++++-
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b4006c48e..c3f3864ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ python-dateutil==2.9.0.post0
 pyyaml==6.0.2
 httpx[http2]==0.28.1
 httpx-socks[asyncio]==0.10.0
+httpx_curl_cffi==0.1.3 ; python_version >= "3.10" and (platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "x86_64")
 Brotli==1.1.0
 uvloop==0.21.0
 setproctitle==1.3.6
diff --git a/searx/network/client.py b/searx/network/client.py
index f35ba2d6e..20b744462 100644
--- a/searx/network/client.py
+++ b/searx/network/client.py
@@ -11,8 +11,16 @@ from typing import Any, Dict
 import httpx
 from httpx_socks import AsyncProxyTransport
 from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError
+
 import uvloop
 
+try:
+    from httpx_curl_cffi import AsyncCurlTransport, CurlOpt, CurlHttpVersion
+except ImportError:
+    AsyncCurlTransport = None
+    CurlOpt = None
+    CurlHttpVersion = None
+
 from searx import logger
 
 
@@ -152,6 +160,7 @@ def get_transport(verify, http2, local_address, proxy_url, limit, retries):
 
 def new_client(
     # pylint: disable=too-many-arguments
+    impersonate,
     enable_http,
     verify,
     enable_http2,
@@ -169,12 +178,27 @@ def new_client(
         max_keepalive_connections=max_keepalive_connections,
         keepalive_expiry=keepalive_expiry,
     )
+    if impersonate and (AsyncCurlTransport is None or CurlOpt is None):
+        raise ValueError("impersonate requires the AMD64 or ARM64 architecture")
+
     # See https://www.python-httpx.org/advanced/#routing
     mounts = {}
     for pattern, proxy_url in proxies.items():
         if not enable_http and pattern.startswith('http://'):
             continue
-        if proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://'):
+        if impersonate and AsyncCurlTransport is not None and CurlOpt is not None and CurlHttpVersion is not None:
+            mounts[pattern] = AsyncCurlTransport(
+                impersonate=impersonate,
+                default_headers=True,
+                # required for parallel requests, see curl_cffi issues below
+                curl_options={CurlOpt.FRESH_CONNECT: True},
+                http_version=CurlHttpVersion.V3 if enable_http2 else CurlHttpVersion.V1_1,
+                proxy=proxy_url,
+                local_address=local_address,
+            )
+        elif (
+            proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://')
+        ):
             mounts[pattern] = get_transport_for_socks_proxy(
                 verify, enable_http2, local_address, proxy_url, limit, retries
             )
@@ -184,7 +208,17 @@ def new_client(
     if not enable_http:
         mounts['http://'] = AsyncHTTPTransportNoHttp()
 
-    transport = get_transport(verify, enable_http2, local_address, None, limit, retries)
+    if impersonate and AsyncCurlTransport is not None and CurlOpt is not None and CurlHttpVersion is not None:
+        transport = AsyncCurlTransport(
+            impersonate=impersonate,
+            default_headers=True,
+            # required for parallel requests, see curl_cffi issues below
+            curl_options={CurlOpt.FRESH_CONNECT: True},
+            http_version=CurlHttpVersion.V3 if enable_http2 else CurlHttpVersion.V1_1,
+            local_address=local_address,
+        )
+    else:
+        transport = get_transport(verify, enable_http2, local_address, None, limit, retries)
 
     event_hooks = None
     if hook_log_response:
diff --git a/searx/network/network.py b/searx/network/network.py
index 178ebcbf2..89eb7a623 100644
--- a/searx/network/network.py
+++ b/searx/network/network.py
@@ -53,6 +53,7 @@ class Network:
         'max_redirects',
         'retries',
         'retry_on_http_error',
+        'impersonate',
         '_local_addresses_cycle',
         '_proxies_cycle',
         '_clients',
@@ -63,6 +64,7 @@ class Network:
 
     def __init__(
         # pylint: disable=too-many-arguments
+        # pylint: disable=too-many-positional-arguments
         self,
         enable_http=True,
         verify=True,
@@ -77,6 +79,7 @@ class Network:
         retry_on_http_error=None,
         max_redirects=30,
         logger_name=None,
+        impersonate=None,
     ):
 
         self.enable_http = enable_http
@@ -91,6 +94,7 @@ class Network:
         self.retries = retries
         self.retry_on_http_error = retry_on_http_error
         self.max_redirects = max_redirects
+        self.impersonate = impersonate
         self._local_addresses_cycle = self.get_ipaddress_cycle()
         self._proxies_cycle = self.get_proxy_cycles()
         self._clients = {}
@@ -185,10 +189,11 @@ class Network:
         max_redirects = self.max_redirects if max_redirects is None else max_redirects
         local_address = next(self._local_addresses_cycle)
         proxies = next(self._proxies_cycle)  # is a tuple so it can be part of the key
-        key = (verify, max_redirects, local_address, proxies)
+        key = (verify, max_redirects, local_address, proxies, self.impersonate)
         hook_log_response = self.log_response if sxng_debug else None
        if key not in self._clients or self._clients[key].is_closed:
             client = new_client(
+                self.impersonate,
                 self.enable_http,
                 verify,
                 self.enable_http2,

From c3273bc04bb5242ea25234e30f6d284edda0e78d Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 17 May 2025 19:13:35 +0200
Subject: [PATCH 2/2] [mod] qwant engine: impersonate chrome

---
 searx/engines/qwant.py | 6 ++++++
 searx/settings.yml     | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 4a8311199..945150e83 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -148,6 +148,12 @@ def request(query, params):
         args['offset'] = (params['pageno'] - 1) * args['count']
 
     params['url'] = url + urlencode(args)
+    params['headers'] = {
+        "Accept": "application/json",
+        "Accept-Language": q_locale.replace("_", "-"),
+        "Referer": "https://www.qwant.com/",
+        "Origin": "https://www.qwant.com",
+    }
 
     return params
 
diff --git a/searx/settings.yml b/searx/settings.yml
index 64affcadc..046ec424d 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1711,6 +1711,8 @@ engines:
     disabled: true
     additional_tests:
       rosebud: *test_rosebud
+    network:
+      impersonate: chrome
 
   - name: qwant news
     qwant_categ: news
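
Note (not part of the patches): the snippet below is a minimal, standalone sketch of the transport that new_client() builds above, handy for checking that httpx_curl_cffi works on the target platform. It assumes httpx_curl_cffi 0.1.3 is installed (Python >= 3.10 on x86_64/aarch64, per requirements.txt); the keyword arguments mirror the ones used in the patch, while the target URL and the "chrome" profile are only examples.

import asyncio

import httpx
from httpx_curl_cffi import AsyncCurlTransport, CurlOpt, CurlHttpVersion


async def main():
    # Same options new_client() passes when an engine sets network.impersonate
    transport = AsyncCurlTransport(
        impersonate="chrome",                        # browser fingerprint to imitate
        default_headers=True,                        # send the matching default browser headers
        curl_options={CurlOpt.FRESH_CONNECT: True},  # fresh connection per request (parallel requests)
        http_version=CurlHttpVersion.V1_1,           # the patch picks V3 instead when HTTP/2 is enabled
    )
    async with httpx.AsyncClient(transport=transport) as client:
        response = await client.get("https://www.qwant.com/")  # example URL only
        print(response.status_code)


asyncio.run(main())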