searxng/searx/results.py
Markus Heiser 57b9673efb [mod] addition of various type hints / tbc
- pyright configuration [1]_
- stub files: types-lxml [2]_
- addition of various type hints
- enable use of new type system features on older Python versions [3]_
- ``.tool-versions`` - set Python to the lowest version we support (3.10.18) [4]_:
  Older versions typically lack some typing features found in newer Python
  versions.  Therefore, for local type checking (before commit), it is necessary
  to use the older Python interpreter.

.. [1] https://docs.basedpyright.com/v1.20.0/configuration/config-files/
.. [2] https://pypi.org/project/types-lxml/
.. [3] https://typing-extensions.readthedocs.io/en/latest/#
.. [4] https://mise.jdx.dev/configuration.html#tool-versions

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Format: reST
2025-09-03 13:37:36 +02:00


# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, missing-class-docstring

import typing as t
import warnings

from collections import defaultdict
from threading import RLock

from searx import logger as log
import searx.engines
from searx.metrics import histogram_observe, counter_add
from searx.result_types import Result, LegacyResult, MainResult
from searx.result_types.answer import AnswerSet, BaseAnswer


def calculate_score(
    result: MainResult | LegacyResult,
    priority: MainResult.PriorityType,
) -> float:
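    """Calculate the score of a result: the weights of all engines reporting
    the result are multiplied and scaled by the number of positions at which
    the result was reported.  With priority ``low`` the score stays ``0``,
    with ``high`` every position adds the full weight; otherwise each
    position adds ``weight / position``.

    A worked example (hypothetical values): a result reported at positions 1
    and 2 by two engines of weight ``1.0`` has ``weight = 1.0 * 1.0 * 2 = 2.0``
    and, with default priority, a score of ``2.0/1 + 2.0/2 = 3.0``.
    """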
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(searx.engines.engines.get(result_engine), 'weight'):
            weight *= float(searx.engines.engines[result_engine].weight)

    weight *= len(result['positions'])
    score = 0

    for position in result['positions']:
        if priority == 'low':
            continue
        if priority == 'high':
            score += weight
        else:
            score += weight / position

    return score


class Timing(t.NamedTuple):
    engine: str
    total: float
    load: float


class UnresponsiveEngine(t.NamedTuple):
    engine: str
    error_type: str
    suspended: bool


class ResultContainer:
    """In the result container, the results are collected, sorted and
    duplicates are merged.
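
    A minimal usage sketch (``"example engine"`` and ``results_from_engine``
    are hypothetical)::

        container = ResultContainer()
        container.extend("example engine", results_from_engine)
        container.close()
        for res in container.get_ordered_results():
            print(res.url)
    """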

    # pylint: disable=too-many-statements

    main_results_map: dict[int, MainResult | LegacyResult]
    infoboxes: list[LegacyResult]
    suggestions: set[str]
    answers: AnswerSet
    corrections: set[str]

    def __init__(self):
        self.main_results_map = {}
        self.infoboxes = []
        self.suggestions = set()
        self.answers = AnswerSet()
        self.corrections = set()

        self._number_of_results: list[int] = []
        self.engine_data: dict[str, dict[str, str]] = defaultdict(dict)
        self._closed: bool = False
        self.paging: bool = False
        self.unresponsive_engines: set[UnresponsiveEngine] = set()
        self.timings: list[Timing] = []
        self.redirect_url: str | None = None
        self.on_result: t.Callable[[Result | LegacyResult], bool] = lambda _: True
        self._lock: RLock = RLock()
        self._main_results_sorted: list[MainResult | LegacyResult] = None  # type: ignore

    def extend(
        self, engine_name: str | None, results: list[Result | LegacyResult]
    ):  # pylint: disable=too-many-branches
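        """Extend the container with the results reported by engine
        *engine_name*: typed :py:obj:`Result` items are dispatched by their
        type (answers, main results, ..); legacy result dicts are dispatched
        by their keys (``suggestion``, ``answer``, ``correction``,
        ``infobox``, ``number_of_results``, ``engine_data``); anything else
        is merged into the main results."""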
        if self._closed:
            log.debug("container is closed, ignoring results: %s", results)
            return

        main_count = 0

        for result in list(results):

            if isinstance(result, Result):
                result.engine = result.engine or engine_name
                result.normalize_result_fields()

                if not self.on_result(result):
                    continue

                if isinstance(result, BaseAnswer):
                    self.answers.add(result)
                elif isinstance(result, MainResult):
                    main_count += 1
                    self._merge_main_result(result, main_count)
                else:
                    # more types need to be implemented in the future ..
                    raise NotImplementedError(f"no handler implemented to process the result of type {result}")

            else:
                result["engine"] = result.get("engine") or engine_name or ""
                result = LegacyResult(result)  # for backward compatibility, will be removed one day
                result.normalize_result_fields()

                if "suggestion" in result:
                    if self.on_result(result):
                        self.suggestions.add(result["suggestion"])
                    continue

                if "answer" in result:
                    if self.on_result(result):
                        warnings.warn(
                            f"answer results from engine {result.engine}"
                            " are without typification / migrate to Answer class.",
                            DeprecationWarning,
                        )
                        self.answers.add(result)  # type: ignore
                    continue

                if "correction" in result:
                    if self.on_result(result):
                        self.corrections.add(result["correction"])
                    continue

                if "infobox" in result:
                    if self.on_result(result):
                        self._merge_infobox(result)
                    continue

                if "number_of_results" in result:
                    if self.on_result(result):
                        self._number_of_results.append(result["number_of_results"])
                    continue

                if "engine_data" in result:
                    if self.on_result(result):
                        if result.engine:
                            self.engine_data[result.engine][result["key"]] = result["engine_data"]
                    continue

                if self.on_result(result):
                    main_count += 1
                    self._merge_main_result(result, main_count)

        if engine_name in searx.engines.engines:
            eng = searx.engines.engines[engine_name]
            histogram_observe(main_count, "engine", eng.name, "result", "count")
            if not self.paging and eng.paging:
                self.paging = True

    def _merge_infobox(self, new_infobox: LegacyResult):
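        """Add *new_infobox* to :py:obj:`ResultContainer.infoboxes`; if an
        infobox with the same ``id`` already exists, the two are merged by
        :py:obj:`merge_two_infoboxes`."""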
        add_infobox = True
        new_id = getattr(new_infobox, "id", None)

        if new_id is not None:
            with self._lock:
                for existing_infobox in self.infoboxes:
                    if new_id == getattr(existing_infobox, "id", None):
                        merge_two_infoboxes(existing_infobox, new_infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(new_infobox)

    def _merge_main_result(self, result: MainResult | LegacyResult, position: int):
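        """Merge *result* into the map of main results: a result whose hash
        is already known is merged with the existing entry, otherwise it is
        added under its hash; in both cases *position* is recorded."""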
        result_hash = hash(result)

        with self._lock:
            merged = self.main_results_map.get(result_hash)
            if not merged:
                # if there is no duplicate in the merged results, append result
                result.positions = [position]
                self.main_results_map[result_hash] = result
                return
            merge_two_main_results(merged, result)
            # add the new position
            merged.positions.append(position)

    def close(self):
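        """Close the container: no further results are accepted and the score
        of each main result is calculated (:py:obj:`calculate_score`)."""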
        self._closed = True

        for result in self.main_results_map.values():
            result.score = calculate_score(result, result.priority)
            for eng_name in result.engines:
                counter_add(result.score, 'engine', eng_name, 'score')

    def get_ordered_results(self) -> list[MainResult | LegacyResult]:
        """Returns a sorted list of results to be displayed in the main result
        area (:ref:`result types`).
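
        The ordering is a two-step process: results are first sorted by their
        ``score`` (descending); a second pass then groups results that share
        the same category and template, inserting a result into a group at
        most ``max_distance`` positions above and limiting each group to
        ``max_count`` results.
        """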
        if not self._closed:
            self.close()

        if self._main_results_sorted:
            return self._main_results_sorted
        # first pass, sort results by "score" (descending)
        results = sorted(self.main_results_map.values(), key=lambda x: x.score, reverse=True)

        # second pass: group results by category and template
        gresults: list[MainResult | LegacyResult] = []
        categoryPositions: dict[str, t.Any] = {}
        max_count = 8
        max_distance = 20

        for res in results:
            # do we need to handle more than one category per engine?
            engine = searx.engines.engines.get(res.engine or "")
            if engine:
                res.category = engine.categories[0] if len(engine.categories) > 0 else ""

            category = f"{res.category}:{res.template}:{'img_src' if (res.thumbnail or res.img_src) else ''}"
            grp = categoryPositions.get(category)

            # group with previous results using the same category, if the
            # group can accept more results and is not too far from the
            # current position
            if (grp is not None) and (grp["count"] > 0) and (len(gresults) - grp["index"] < max_distance):
                # group this result with the previous results of the same
                # category
                index = grp["index"]
                gresults.insert(index, res)

                # update every index after the current one (including the
                # current one)
                for item in categoryPositions.values():
                    v = item["index"]
                    if v >= index:
                        item["index"] = v + 1

                # update this category
                grp["count"] -= 1
            else:
                gresults.append(res)
                # update categoryPositions
                categoryPositions[category] = {"index": len(gresults), "count": max_count}

        self._main_results_sorted = gresults
        return self._main_results_sorted

    @property
    def number_of_results(self) -> int:
        """Returns the average of the "number of results" values reported by
        the engines, or zero if this average is smaller than the number of
        results actually in the container."""
        if not self._closed:
            log.error("call to ResultContainer.number_of_results before ResultContainer.close")
            return 0

        with self._lock:
            resultnum_sum = sum(self._number_of_results)
            if not resultnum_sum or not self._number_of_results:
                return 0

            average = int(resultnum_sum / len(self._number_of_results))
            if average < len(self.get_ordered_results()):
                average = 0
            return average

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
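        """Register engine *engine_name* as unresponsive with the given
        *error_type*; engines with ``display_error_messages`` disabled are
        not recorded."""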
        with self._lock:
            if self._closed:
                log.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if searx.engines.engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
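        """Record the total *engine_time* and the *page_load_time* of engine
        *engine_name*."""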
        with self._lock:
            if self._closed:
                log.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self) -> list[Timing]:
        with self._lock:
            if not self._closed:
                log.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings


def merge_two_infoboxes(origin: LegacyResult, other: LegacyResult):
    """Merges the values from ``other`` into ``origin``."""
    # pylint: disable=too-many-branches
    weight1 = getattr(searx.engines.engines[origin.engine], "weight", 1)
    weight2 = getattr(searx.engines.engines[other.engine], "weight", 1)

    if weight2 > weight1:
        origin.engine = other.engine

    origin.engines |= other.engines

    if other.urls:
        url_items = origin.get("urls", [])

        for url2 in other.urls:
            unique_url = True
            entity_url2 = url2.get("entity")

            for url1 in origin.get("urls", []):
                if (entity_url2 is not None and entity_url2 == url1.get("entity")) or (
                    url1.get("url") == url2.get("url")
                ):
                    unique_url = False
                    break

            if unique_url:
                url_items.append(url2)

        origin.urls = url_items

    if other.img_src:
        if not origin.img_src:
            origin.img_src = other.img_src
        elif weight2 > weight1:
            origin.img_src = other.img_src

    if other.attributes:
        if not origin.attributes:
            origin.attributes = other.attributes
        else:
            attr_names_1: set[str] = set()

            for attr in origin.attributes:
                label = attr.get("label")
                if label:
                    attr_names_1.add(label)
                entity = attr.get("entity")
                if entity:
                    attr_names_1.add(entity)

            for attr in other.attributes:
                if attr.get("label") not in attr_names_1 and attr.get('entity') not in attr_names_1:
                    origin.attributes.append(attr)

    if other.content:
        if not origin.content:
            origin.content = other.content
        elif len(other.content) > len(origin.content):
            origin.content = other.content


def merge_two_main_results(origin: MainResult | LegacyResult, other: MainResult | LegacyResult):
    """Merges the values from ``other`` into ``origin``."""
    # use content with more text
    if len(other.content) > len(origin.content):
        origin.content = other.content

    # use title with more text
    if len(other.title) > len(origin.title):
        origin.title = other.title

    # merge all result's parameters not found in origin
    if isinstance(other, MainResult) and isinstance(origin, MainResult):
        origin.defaults_from(other)
    elif isinstance(other, LegacyResult) and isinstance(origin, LegacyResult):
        origin.defaults_from(other)

    # add engine to list of result-engines
    origin.engines.add(other.engine or "")

    # use https, ftps, .. if possible
    if origin.parsed_url and not origin.parsed_url.scheme.endswith("s"):
        if other.parsed_url and other.parsed_url.scheme.endswith("s"):
            origin.parsed_url = origin.parsed_url._replace(scheme=other.parsed_url.scheme)
            origin.url = origin.parsed_url.geturl()