[fix] engine & network issues / documentation and type annotations

This patch fixes some quirks and issues related to the engines and the network.
Each engine has its own network and this network was broken for the following
engines[1]:

- archlinux
- bing
- dailymotion
- duckduckgo
- google
- peertube
- startpage
- wikipedia

Since the files have been touched anyway, the type annotations of the engine
modules have also been completed so that error messages from the type checker are
no longer reported.

Related and (partially) fixed issues:

- [1] https://github.com/searxng/searxng/issues/762#issuecomment-1605323861
- [2] https://github.com/searxng/searxng/issues/2513
- [3] https://github.com/searxng/searxng/issues/2515

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-06-25 12:37:31 +02:00 committed by Markus Heiser
parent 2e4a435134
commit e8706fb738
13 changed files with 204 additions and 122 deletions

View file

@ -91,8 +91,8 @@ import dateutil.parser
import lxml.html
import babel
from searx import network
from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits
@ -211,25 +211,25 @@ def get_sc_code(searxng_locale, params):
get_sc_url = base_url + '/?sc=%s' % (sc_code)
logger.debug("query new sc time-stamp ... %s", get_sc_url)
logger.debug("headers: %s", headers)
resp = network.get(get_sc_url, headers=headers)
resp = get(get_sc_url, headers=headers)
# ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore
raise SearxEngineCaptchaException(
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
)
dom = lxml.html.fromstring(resp.text)
dom = lxml.html.fromstring(resp.text) # type: ignore
try:
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
except IndexError as exc:
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
raise SearxEngineCaptchaException(
message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, # type: ignore
) from exc
sc_code_ts = time()
@ -350,7 +350,7 @@ def _response_cat_web(dom):
title = extract_text(link)
if eval_xpath(result, content_xpath):
content = extract_text(eval_xpath(result, content_xpath))
content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore
else:
content = ''
@ -374,7 +374,7 @@ def _response_cat_web(dom):
date_string = content[0 : date_pos - 5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore
# fix content string
content = content[date_pos:]
@ -399,12 +399,12 @@ def fetch_traits(engine_traits: EngineTraits):
'User-Agent': gen_useragent(),
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
}
resp = network.get('https://www.startpage.com/do/settings', headers=headers)
resp = get('https://www.startpage.com/do/settings', headers=headers)
if not resp.ok:
if not resp.ok: # type: ignore
print("ERROR: response from Startpage is not OK.")
dom = lxml.html.fromstring(resp.text)
dom = lxml.html.fromstring(resp.text) # type: ignore
# regions
@ -443,8 +443,10 @@ def fetch_traits(engine_traits: EngineTraits):
# get the native name of every language known by babel
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
native_name = babel.Locale(lang_code).get_language_name().lower()
for lang_code in filter(
lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers() # type: ignore
):
native_name = babel.Locale(lang_code).get_language_name().lower() # type: ignore
# add native name exactly as it is
catalog_engine2code[native_name] = lang_code
@ -478,7 +480,7 @@ def fetch_traits(engine_traits: EngineTraits):
eng_tag = option.get('value')
if eng_tag in skip_eng_tags:
continue
name = extract_text(option).lower()
name = extract_text(option).lower() # type: ignore
sxng_tag = catalog_engine2code.get(eng_tag)
if sxng_tag is None: