diff --git a/docs/dev/engines/online/openalex.rst b/docs/dev/engines/online/openalex.rst new file mode 100644 index 000000000..f04543653 --- /dev/null +++ b/docs/dev/engines/online/openalex.rst @@ -0,0 +1,100 @@ +.. _openalex engine: + +========= +OpenAlex +========= + +Overview +======== + +The OpenAlex engine integrates the `OpenAlex`_ Works API to return scientific paper +results using the :origin:`paper.html ` +template. It is an "online" JSON engine that uses the official public API and does +not require an API key. + +.. _OpenAlex: https://openalex.org +.. _OpenAlex API overview: https://docs.openalex.org/how-to-use-the-api/api-overview + +Key features +------------ + +- Uses the official Works endpoint (JSON) +- Paging support via ``page`` and ``per-page`` +- Relevance sorting (``sort=relevance_score:desc``) +- Language filter support (maps SearXNG language to ``filter=language:``) +- Maps fields commonly used in scholarly results: title, authors, abstract + (reconstructed from inverted index), journal/venue, publisher, DOI, tags + (concepts), PDF/HTML links, pages, volume, issue, published date, and a short + citations comment +- Supports OpenAlex "polite pool" by adding a ``mailto`` parameter + + +Configuration +============= + +Minimal example for :origin:`settings.yml `: + +.. code:: yaml + + - name: openalex + engine: openalex + shortcut: oa + categories: science, scientific publications + timeout: 5.0 + # Recommended by OpenAlex: join the polite pool with an email address + mailto: "[email protected]" + +Notes +----- + +- The ``mailto`` key is optional but recommended by OpenAlex for better service. +- Language is inherited from the user's UI language; when it is not ``all``, the + engine adds ``filter=language:`` (e.g. ``language:fr``). If OpenAlex has + few results for that language, you may see fewer items. +- Results typically include a main link. When the primary landing page from + OpenAlex is a DOI resolver, the engine will use that stable link. When an open + access link is available, it is exposed via the ``PDF`` and/or ``HTML`` links + in the result footer. + + +What is returned +================ + +Each result uses the ``paper.html`` template and may include: + +- ``title`` and ``content`` (abstract; reconstructed from the inverted index) +- ``authors`` (display names) +- ``journal`` (host venue display name) and ``publisher`` +- ``doi`` (normalized to the plain DOI, without the ``https://doi.org/`` prefix) +- ``tags`` (OpenAlex concepts display names) +- ``pdf_url`` (Open access PDF if available) and ``html_url`` (landing page) +- ``publishedDate`` (parsed from ``publication_date``) +- ``pages``, ``volume``, ``number`` (issue) +- ``type`` and a brief ``comments`` string with citation count + + +Rate limits & polite pool +========================= + +OpenAlex offers a free public API with generous daily limits. For extra courtesy +and improved service quality, include a contact email in each request via +``mailto``. You can set it directly in the engine configuration as shown above. +See: `OpenAlex API overview`_. + + +Troubleshooting +=============== + +- Few or no results in a non-English UI language: + Ensure the selected language has sufficient coverage at OpenAlex, or set the + UI language to English and retry. +- Preference changes fail while testing locally: + Make sure your ``server.secret_key`` and ``server.base_url`` are set in your + instance settings so signed cookies work; see :ref:`settings server`. + + +Implementation +=============== + +.. automodule:: searx.engines.openalex + :members: diff --git a/searx/engines/openalex.py b/searx/engines/openalex.py new file mode 100644 index 000000000..c7bb6839b --- /dev/null +++ b/searx/engines/openalex.py @@ -0,0 +1,205 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# pylint: disable=missing-module-docstring +# +# Engine is documented in: docs/dev/engines/online/openalex.rst + +from __future__ import annotations + +import typing as t +from datetime import datetime +from urllib.parse import urlencode +from searx.result_types import EngineResults +from searx.extended_types import SXNG_Response + +# about +about = { + "website": "https://openalex.org/", + "wikidata_id": "Q110718454", + "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview", + "use_official_api": True, + "require_api_key": False, + "results": "JSON", +} + + +# engine dependent config +categories = ["science", "scientific publications"] +paging = True +search_url = "https://api.openalex.org/works" + +# Optional: include your email for OpenAlex polite pool. Can be set from settings.yml +# engines: - name: openalex; engine: openalex; mailto: "[email protected]" +mailto = "" + + +def request(query: str, params: dict[str, t.Any]) -> None: + # Build OpenAlex query using search parameter and paging + args = { + "search": query, + "page": params["pageno"], + # keep result size moderate; OpenAlex default is 25 + "per-page": 10, + # relevance sorting works only with `search` + "sort": "relevance_score:desc", + } + + # Language filter (expects ISO639-1 like 'fr', 'en') + language = params.get("language") + filters: list[str] = [] + if isinstance(language, str) and language != "all": + iso2 = language.split("-")[0].split("_")[0] + if len(iso2) == 2: + filters.append(f"language:{iso2}") + + if filters: + args["filter"] = ",".join(filters) + + # include mailto if configured for polite pool (engine module setting) + if isinstance(mailto, str) and mailto != "": + args["mailto"] = mailto + + params["url"] = f"{search_url}?{urlencode(args)}" + + +def response(resp: SXNG_Response) -> EngineResults: + data = resp.json() + res = EngineResults() + + for item in data.get("results", []): + url, html_url, pdf_url = _extract_links(item) + title: str = item.get("title", "") + content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or "" + authors = _extract_authors(item) + journal, publisher, pages, volume, number, published_date = _extract_biblio(item) + doi = _doi_to_plain(item.get("doi")) + tags = _extract_tags(item) or None + comments = _extract_comments(item) + + res.add( + res.types.LegacyResult( + template="paper.html", + url=url, + title=title, + content=content, + journal=journal, + publisher=publisher, + doi=doi, + tags=tags, + authors=authors, + pdf_url=pdf_url, + html_url=html_url, + publishedDate=published_date, + pages=pages, + volume=volume, + number=number, + type=item.get("type"), + comments=comments, + ) + ) + + return res + + +def _stringify_pages(biblio: dict[str, t.Any]) -> str | None: + first_page = biblio.get("first_page") + last_page = biblio.get("last_page") + if first_page and last_page: + return f"{first_page}-{last_page}" + if first_page: + return str(first_page) + if last_page: + return str(last_page) + return None + + +def _parse_date(value: str | None) -> datetime | None: + if not value: + return None + # OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD + for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"): + try: + return datetime.strptime(value, fmt) + except ValueError: + continue + return None + + +def _doi_to_plain(doi_value: str | None) -> str | None: + if not doi_value: + return None + # OpenAlex `doi` field is commonly a full URL like https://doi.org/10.1234/abcd + return doi_value.removeprefix("https://doi.org/") + + +def _reconstruct_abstract( + abstract_inverted_index: dict[str, list[int]] | None, +) -> str | None: + # The abstract is returned as an inverted index {token: [positions...]} + # Reconstruct by placing tokens at their positions and joining with spaces. + if not abstract_inverted_index: + return None + position_to_token: dict[int, str] = {} + max_index = -1 + for token, positions in abstract_inverted_index.items(): + for pos in positions: + position_to_token[pos] = token + max_index = max(max_index, pos) + if max_index < 0: + return None + ordered_tokens = [position_to_token.get(i, "") for i in range(0, max_index + 1)] + # collapse multiple empty tokens + text = " ".join(t for t in ordered_tokens if t != "") + return text if text != "" else None + + +def _extract_links(item: dict[str, t.Any]) -> tuple[str, str | None, str | None]: + primary_location = item.get("primary_location", {}) + landing_page_url: str | None = primary_location.get("landing_page_url") + work_url: str = item.get("id", "") + url: str = landing_page_url or work_url + open_access = item.get("open_access", {}) + pdf_url: str | None = primary_location.get("pdf_url") or open_access.get("oa_url") + html_url: str | None = landing_page_url + return url, html_url, pdf_url + + +def _extract_authors(item: dict[str, t.Any]) -> list[str]: + authors: list[str] = [] + for auth in item.get("authorships", []): + if not auth: + continue + author_obj = auth.get("author", {}) + display_name = author_obj.get("display_name") + if isinstance(display_name, str) and display_name != "": + authors.append(display_name) + return authors + + +def _extract_tags(item: dict[str, t.Any]) -> list[str]: + tags: list[str] = [] + for c in item.get("concepts", []): + name = (c or {}).get("display_name") + if isinstance(name, str) and name != "": + tags.append(name) + return tags + + +def _extract_biblio( + item: dict[str, t.Any], +) -> tuple[str | None, str | None, str | None, str | None, str | None, datetime | None]: + host_venue = item.get("host_venue", {}) + biblio = item.get("biblio", {}) + journal: str | None = host_venue.get("display_name") + publisher: str | None = host_venue.get("publisher") + pages = _stringify_pages(biblio) + volume = biblio.get("volume") + number = biblio.get("issue") + published_date = _parse_date(item.get("publication_date")) + return journal, publisher, pages, volume, number, published_date + + +def _extract_comments(item: dict[str, t.Any]) -> str | None: + cited_by_count = item.get("cited_by_count") + if isinstance(cited_by_count, int): + return f"{cited_by_count} citations" + return None diff --git a/searx/settings.yml b/searx/settings.yml index a3455815d..ca5e27df7 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1495,6 +1495,15 @@ engines: require_api_key: false results: JSON + - name: openalex + engine: openalex + shortcut: oa + # https://docs.searxng.org/dev/engines/online/openalex.html + # Recommended by OpenAlex: join the polite pool with an email address + # mailto: "[email protected]" + timeout: 5.0 + disabled: true + - name: openclipart engine: openclipart shortcut: ocl