[feat] engines: add OpenAlex Works engine (#5102)

- Adds a new engine `searx/engines/openalex.py` that integrates the OpenAlex
  Works API to return scientific paper results using the `paper.html` template.
- Uses the official API (no authentication required); supports the OpenAlex polite pool via a `mailto` setting.
- Adds developer docs at `docs/dev/engines/online/openalex.rst`.

OpenAlex API reference: https://docs.openalex.org/how-to-use-the-api/api-overview
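
For reference, the engine boils down to a GET on the Works endpoint. A minimal sketch of an equivalent direct call with `requests` (the parameter names mirror the engine's `request()` below; the email is a placeholder):

```python
import requests

# Query the OpenAlex Works endpoint the same way the engine does
resp = requests.get(
    "https://api.openalex.org/works",
    params={
        "search": "graphene",            # full-text relevance search
        "page": 1,                       # 1-based page number
        "per-page": 10,                  # the engine asks for 10 results per page
        "sort": "relevance_score:desc",  # only valid together with `search`
        "mailto": "[email protected]",  # optional: joins the polite pool
    },
    timeout=10,
)
resp.raise_for_status()
for work in resp.json()["results"]:
    print(work["display_name"], work.get("doi"))
```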

searx/engines/openalex.py (new file, 205 lines)
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
#
# Engine is documented in: docs/dev/engines/online/openalex.rst

from __future__ import annotations

import typing as t
from datetime import datetime
from urllib.parse import urlencode

from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response

# about
about = {
    "website": "https://openalex.org/",
    "wikidata_id": "Q110718454",
    "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}

# engine dependent config
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.openalex.org/works"

# Optional: include your email for the OpenAlex polite pool. Can be set in
# settings.yml:
#
#   - name: openalex
#     engine: openalex
#     mailto: "[email protected]"
mailto = ""


def request(query: str, params: dict[str, t.Any]) -> None:
    # Build the OpenAlex query using the `search` parameter and paging
    args = {
        "search": query,
        "page": params["pageno"],
        # keep result size moderate; OpenAlex default is 25
        "per-page": 10,
        # relevance sorting works only together with `search`
        "sort": "relevance_score:desc",
    }

    # Language filter (expects ISO 639-1 codes like 'fr', 'en')
    language = params.get("language")
    filters: list[str] = []
    if isinstance(language, str) and language != "all":
        iso2 = language.split("-")[0].split("_")[0]
        if len(iso2) == 2:
            filters.append(f"language:{iso2}")
    if filters:
        args["filter"] = ",".join(filters)

    # include mailto if configured for the polite pool (engine module setting)
    if isinstance(mailto, str) and mailto != "":
        args["mailto"] = mailto

    params["url"] = f"{search_url}?{urlencode(args)}"


def response(resp: SXNG_Response) -> EngineResults:
    data = resp.json()
    res = EngineResults()

    for item in data.get("results", []):
        url, html_url, pdf_url = _extract_links(item)
        # `title` can be present but null in the API response
        title: str = item.get("title") or ""
        content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
        authors = _extract_authors(item)
        journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
        doi = _doi_to_plain(item.get("doi"))
        tags = _extract_tags(item) or None
        comments = _extract_comments(item)

        res.add(
            res.types.LegacyResult(
                template="paper.html",
                url=url,
                title=title,
                content=content,
                journal=journal,
                publisher=publisher,
                doi=doi,
                tags=tags,
                authors=authors,
                pdf_url=pdf_url,
                html_url=html_url,
                publishedDate=published_date,
                pages=pages,
                volume=volume,
                number=number,
                type=item.get("type"),
                comments=comments,
            )
        )

    return res
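
# For orientation, the subset of a Works item consumed above (abridged sketch,
# not the full schema; every field may be null in live responses):
#
#   {
#     "id": "https://openalex.org/W...",
#     "title": "...",
#     "doi": "https://doi.org/10.../...",
#     "type": "article",
#     "publication_date": "2021-03-15",
#     "cited_by_count": 42,
#     "abstract_inverted_index": {"token": [0, 7], ...},
#     "primary_location": {"landing_page_url": "...", "pdf_url": "...", "source": {...}},
#     "open_access": {"oa_url": "..."},
#     "authorships": [{"author": {"display_name": "..."}}],
#     "concepts": [{"display_name": "..."}],
#     "biblio": {"volume": "...", "issue": "...", "first_page": "...", "last_page": "..."},
#   }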


def _stringify_pages(biblio: dict[str, t.Any]) -> str | None:
    first_page = biblio.get("first_page")
    last_page = biblio.get("last_page")
    if first_page and last_page:
        return f"{first_page}-{last_page}"
    if first_page:
        return str(first_page)
    if last_page:
        return str(last_page)
    return None
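
# _stringify_pages example: {"first_page": "353", "last_page": "371"} -> "353-371"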


def _parse_date(value: str | None) -> datetime | None:
    if not value:
        return None
    # OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD
    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None
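
# _parse_date example: "2021-03-15" -> datetime(2021, 3, 15); "2021" -> datetime(2021, 1, 1)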


def _doi_to_plain(doi_value: str | None) -> str | None:
    if not doi_value:
        return None
    # OpenAlex's `doi` field is commonly a full URL like https://doi.org/10.1234/abcd
    return doi_value.removeprefix("https://doi.org/")
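
# _doi_to_plain example: "https://doi.org/10.1234/abcd" -> "10.1234/abcd"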


def _reconstruct_abstract(
    abstract_inverted_index: dict[str, list[int]] | None,
) -> str | None:
    # The abstract is returned as an inverted index {token: [positions...]}.
    # Reconstruct it by placing tokens at their positions and joining with spaces.
    if not abstract_inverted_index:
        return None

    position_to_token: dict[int, str] = {}
    max_index = -1
    for token, positions in abstract_inverted_index.items():
        for pos in positions:
            position_to_token[pos] = token
            max_index = max(max_index, pos)
    if max_index < 0:
        return None

    ordered_tokens = [position_to_token.get(i, "") for i in range(max_index + 1)]
    # skip positions with no token so we don't emit double spaces
    text = " ".join(tok for tok in ordered_tokens if tok != "")
    return text if text != "" else None
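
# _reconstruct_abstract example:
#   {"Deep": [0], "learning": [1, 3], "beats": [2]} -> "Deep learning beats learning"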


def _extract_links(item: dict[str, t.Any]) -> tuple[str, str | None, str | None]:
    # `primary_location` and `open_access` can be null in the API response
    primary_location = item.get("primary_location") or {}
    landing_page_url: str | None = primary_location.get("landing_page_url")
    work_url: str = item.get("id", "")
    url: str = landing_page_url or work_url

    open_access = item.get("open_access") or {}
    pdf_url: str | None = primary_location.get("pdf_url") or open_access.get("oa_url")
    html_url: str | None = landing_page_url
    return url, html_url, pdf_url


def _extract_authors(item: dict[str, t.Any]) -> list[str]:
    authors: list[str] = []
    for auth in item.get("authorships") or []:
        if not auth:
            continue
        # `author` itself can be null inside an authorship
        author_obj = auth.get("author") or {}
        display_name = author_obj.get("display_name")
        if isinstance(display_name, str) and display_name != "":
            authors.append(display_name)
    return authors


def _extract_tags(item: dict[str, t.Any]) -> list[str]:
    tags: list[str] = []
    for c in item.get("concepts") or []:
        name = (c or {}).get("display_name")
        if isinstance(name, str) and name != "":
            tags.append(name)
    return tags


def _extract_biblio(
    item: dict[str, t.Any],
) -> tuple[str | None, str | None, str | None, str | None, str | None, datetime | None]:
    # `host_venue` has been deprecated in the OpenAlex schema in favour of
    # `primary_location.source`, so fall back to it when the legacy field is
    # absent (the dehydrated source carries `host_organization_name` instead
    # of `publisher`).
    source = item.get("host_venue") or (item.get("primary_location") or {}).get("source") or {}
    biblio = item.get("biblio") or {}
    journal: str | None = source.get("display_name")
    publisher: str | None = source.get("publisher") or source.get("host_organization_name")
    pages = _stringify_pages(biblio)
    volume = biblio.get("volume")
    number = biblio.get("issue")
    published_date = _parse_date(item.get("publication_date"))
    return journal, publisher, pages, volume, number, published_date


def _extract_comments(item: dict[str, t.Any]) -> str | None:
    cited_by_count = item.get("cited_by_count")
    if isinstance(cited_by_count, int):
        return f"{cited_by_count} citations"
    return None
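
A quick sanity check of the pure helpers, runnable from a SearXNG checkout (the sample values are made up):

```python
from datetime import datetime

from searx.engines.openalex import _doi_to_plain, _parse_date, _reconstruct_abstract

assert _doi_to_plain("https://doi.org/10.1234/abcd") == "10.1234/abcd"
assert _parse_date("2021-03") == datetime(2021, 3, 1)
assert _reconstruct_abstract({"Deep": [0], "learning": [1, 3], "beats": [2]}) == "Deep learning beats learning"
assert _reconstruct_abstract(None) is None
```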