[mod] typification of SearXNG: add new result type Code

This patch adds a new result type: Code - Python class: searx/result_types/code.py - Jinja template: searx/templates/simple/result_templates/code.html - CSS (less) client/simple/src/less/result_types/code.less Signed-off-by: Markus Heiser <markus.heiser@darmarIT.de>
2025-09-02 08:18:35 +02:00 · 2025-08-21 17:57:58 +02:00 · 2025-08-21 17:57:58 +02:00 · 9ac9c8c4f5
commit 9ac9c8c4f5
parent b8085d27ac
10 changed files with 306 additions and 163 deletions
--- a/docs/conf.py
+++ b/docs/conf.py
@ -150,6 +150,7 @@ intersphinx_mapping = {
    "linuxdoc" : ("https://return42.github.io/linuxdoc/", None),
    "sphinx" : ("https://www.sphinx-doc.org/en/master/", None),
    "valkey": ('https://valkey-py.readthedocs.io/en/stable/', None),
+    "pygments": ("https://pygments.org/", None),
 }

 issues_github_path = "searxng/searxng"
--- a/docs/dev/result_types/main/code.rst
+++ b/docs/dev/result_types/main/code.rst
@ -0,0 +1,7 @@
+.. _result_types.code:
+
+============
+Code Results
+============
+
+.. automodule:: searx.result_types.code
--- a/docs/dev/result_types/main_result.rst
+++ b/docs/dev/result_types/main_result.rst
@ -15,6 +15,7 @@ following types have been implemented so far ..

   main/mainresult
   main/keyvalue
+   main/code

 The :ref:`LegacyResult <LegacyResult>` is used internally for the results that
 have not yet been typed.  The templates can be used as orientation until the
@ -27,6 +28,5 @@ final typing is complete.
 - :ref:`template map`
 - :ref:`template paper`
 - :ref:`template packages`
- :ref:`template code`
 - :ref:`template files`
 - :ref:`template products`
--- a/docs/dev/templates.rst
+++ b/docs/dev/templates.rst
@ -469,33 +469,6 @@ links : :py:class:`dict`
  Additional links in the form of ``{'link_name': 'http://example.com'}``


-.. _template code:
-
-``code.html``
-------------
-
-Displays result fields from:
-
- :ref:`macro result_header` and
- :ref:`macro result_sub_header`
-
-Additional fields used in the :origin:`code.html
-<searx/templates/simple/result_templates/code.html>`:
-
-content :  :py:class:`str`
-  Description of the code fragment.
-
-codelines : ``[line1, line2, ...]``
-  Lines of the code fragment.
-
-code_language : :py:class:`str`
-  Name of the code language, the value is passed to
-  :py:obj:`pygments.lexers.get_lexer_by_name`.
-
-repository : :py:class:`str`
-  URL of the repository of the code fragment.
-
-
 .. _template files:

 ``files.html``
--- a/searx/engines/github_code.py
+++ b/searx/engines/github_code.py
@ -68,10 +68,8 @@ code blocks in a single file might be returned from the API).
 from __future__ import annotations

 import typing as t
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode

-from pygments.lexers import guess_lexer_for_filename
-from pygments.util import ClassNotFound
 from searx.result_types import EngineResults
 from searx.extended_types import SXNG_Response
 from searx.network import raise_for_httperror
@ -162,26 +160,10 @@ def request(query: str, params: dict[str, t.Any]) -> None:
    params['raise_for_httperror'] = False


-def get_code_language_name(filename: str, code_snippet: str) -> str | None:
-    """Returns a code language name by pulling information from the filename if
-    possible otherwise by scanning the passed code snippet. In case there is any
-    parsing error just default to no syntax highlighting."""
-    try:
-        lexer = guess_lexer_for_filename(filename, _text=code_snippet)
-        if lexer is None:
-            return None
-        code_name_aliases = lexer.aliases
-        if len(code_name_aliases) == 0:
-            return None
-        return code_name_aliases[0]
-    except ClassNotFound:
-        return None
-
-
 def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
    """
    Iterate over multiple possible matches, for each extract a code fragment.
-    GitHub additionally sends context for _word_ highlights; pygments supports
+    Github additionally sends context for _word_ highlights; pygments supports
    highlighting lines, as such we calculate which lines to highlight while
    traversing the text.
    """
@ -231,18 +213,18 @@ def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[i


 def response(resp: SXNG_Response) -> EngineResults:
-    results = EngineResults()
+    res = EngineResults()

    if resp.status_code == 422:
        # on a invalid search term the status code 422 "Unprocessable Content"
        # is returned / e.g. search term is "user: foo" instead "user:foo"
-        return results
+        return res
    # raise for other errors
    raise_for_httperror(resp)

    for item in resp.json().get('items', []):
-        repo = item['repository']
-        text_matches = item['text_matches']
+        repo: dict[str, str] = item['repository']  # pyright: ignore[reportAny]
+        text_matches: list[dict[str, str]] = item['text_matches']  # pyright: ignore[reportAny]
        # ensure picking only the code contents in the blob
        code_matches = [
            match for match in text_matches if match["object_type"] == "FileContent" and match["property"] == "content"
@ -251,22 +233,18 @@ def response(resp: SXNG_Response) -> EngineResults:
        if not ghc_highlight_matching_lines:
            highlighted_lines_index: set[int] = set()

-        code_snippet = "\n".join(lines)
+        res.add(
+            res.types.Code(
+                url=item["html_url"],  # pyright: ignore[reportAny]
+                title=f"{repo['full_name']} · {item['name']}",
+                filename=f"{item['path']}",
+                content=repo['description'],
+                repository=repo['html_url'],
+                codelines=[(i + 1, line) for (i, line) in enumerate(lines)],
+                hl_lines=highlighted_lines_index,
+                strip_whitespace=ghc_strip_whitespace,
+                strip_new_lines=ghc_strip_new_lines,
+            )
+        )

-        kwargs: dict[str, t.Any] = {
-            'template': 'code.html',
-            'url': item['html_url'],
-            'title': f"{repo['full_name']} · {item['path']}",
-            'content': repo['description'],
-            'repository': repo['html_url'],
-            'codelines': [(i + 1, line) for (i, line) in enumerate(lines)],
-            'hl_lines': highlighted_lines_index,
-            'code_language': get_code_language_name(filename=item['name'], code_snippet=code_snippet),
-            # important to set for highlighing
-            'strip_whitespace': ghc_strip_whitespace,
-            'strip_new_lines': ghc_strip_new_lines,
-            'parsed_url': urlparse(item['html_url']),
-        }
-        results.add(results.types.LegacyResult(**kwargs))
-
-    return results
+    return res
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@ -1,79 +1,62 @@
-# SPDX-License-Identifier: AGPL-3.0-or-later
-"""Searchcode (IT)
+"""Searchcode (IT)"""

-"""
+from __future__ import annotations
+
+import typing as t

-from json import loads
 from urllib.parse import urlencode

+from searx.result_types import EngineResults
+from searx.extended_types import SXNG_Response
+
 # about
 about = {
-    "website": 'https://searchcode.com/',
+    "website": "https://searchcode.com/",
    "wikidata_id": None,
-    "official_api_documentation": 'https://searchcode.com/api/',
+    "official_api_documentation": "https://searchcode.com/api/",
    "use_official_api": True,
    "require_api_key": False,
-    "results": 'JSON',
+    "results": "JSON",
 }

 # engine dependent config
-categories = ['it']
-search_api = 'https://searchcode.com/api/codesearch_I/?'
-
-# special code-endings which are not recognised by the file ending
-code_endings = {'cs': 'c#', 'h': 'c', 'hpp': 'cpp', 'cxx': 'cpp'}
+categories = ["it"]
+search_api = "https://searchcode.com/api/codesearch_I/?"

 # paging is broken in searchcode.com's API .. not sure it will ever been fixed
 # paging = True


-def request(query, params):
-    args = urlencode(
-        {
-            'q': query,
-            # paging is broken in searchcode.com's API
-            # 'p': params['pageno'] - 1,
-            # 'per_page': 10,
-        }
-    )
-    params['url'] = search_api + args
-    logger.debug("query_url --> %s", params['url'])
-    return params
+def request(query: str, params: dict[str, t.Any]) -> None:
+    args = {
+        "q": query,
+        # paging is broken in searchcode.com's API
+        # "p": params["pageno"] - 1,
+        # "per_page": 10,
+    }
+
+    params["url"] = search_api + urlencode(args)
+    logger.debug("query_url --> %s", params["url"])


-def response(resp):
-    results = []
-
-    search_results = loads(resp.text)
+def response(resp: SXNG_Response) -> EngineResults:
+    res = EngineResults()

    # parse results
-    for result in search_results.get('results', []):
-        href = result['url']
-        title = "" + result['name'] + " - " + result['filename']
-        repo = result['repo']
-
+    for result in resp.json().get("results", []):
        lines = {}
-        for line, code in result['lines'].items():
+        for line, code in result["lines"].items():
            lines[int(line)] = code

-        code_language = code_endings.get(
-            result['filename'].split('.')[-1].lower(), result['filename'].split('.')[-1].lower()
+        res.add(
+            res.types.Code(
+                url=result["url"],
+                title=f'{result["name"]} - {result["filename"]}',
+                repository=result["repo"],
+                filename=result["filename"],
+                codelines=sorted(lines.items()),
+                strip_whitespace=True,
+            )
        )

-        # append result
-        results.append(
-            {
-                'url': href,
-                'title': title,
-                'content': '',
-                'repository': repo,
-                'codelines': sorted(lines.items()),
-                'code_language': code_language,
-                'template': 'code.html',
-                'strip_whitespace': True,
-                'strip_new_lines': True,
-            }
-        )
-
-    # return results
-    return results
+    return res
--- a/searx/result_types/init.py
+++ b/searx/result_types/init.py
@ -13,25 +13,38 @@

 from __future__ import annotations

-__all__ = ["Result", "MainResult", "KeyValue", "EngineResults", "AnswerSet", "Answer", "Translations", "WeatherAnswer"]
+__all__ = [
+    "Result",
+    "MainResult",
+    "KeyValue",
+    "EngineResults",
+    "AnswerSet",
+    "Answer",
+    "Translations",
+    "WeatherAnswer",
+    "Code",
+]

+import typing as t
 import abc

-from searx import enginelib
-
 from ._base import Result, MainResult, LegacyResult
 from .answer import AnswerSet, Answer, Translations, WeatherAnswer
 from .keyvalue import KeyValue
+from .code import Code


-class ResultList(list, abc.ABC):
+class ResultList(list, abc.ABC):  # pyright: ignore[reportMissingTypeArgument]
    """Base class of all result lists (abstract)."""

+    @t.final
    class types:  # pylint: disable=invalid-name
-        """The collection of result types (which have already been implemented)."""
+        """The collection of result types (which have already been
+        implemented)."""

        Answer = Answer
        KeyValue = KeyValue
+        Code = Code
        MainResult = MainResult
        Result = Result
        Translations = Translations
@ -42,11 +55,11 @@ class ResultList(list, abc.ABC):

    def __init__(self):
        # pylint: disable=useless-parent-delegation
-        super().__init__()
+        super().__init__()  # pyright: ignore[reportUnknownMemberType]

    def add(self, result: Result | LegacyResult):
        """Add a :py:`Result` item to the result list."""
-        self.append(result)
+        self.append(result)  # pyright: ignore[reportUnknownMemberType]


 class EngineResults(ResultList):
--- a/searx/result_types/code.py
+++ b/searx/result_types/code.py
@ -0,0 +1,185 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Typification of the *code* results.  Results of this type are rendered in
+the :origin:`code.html <searx/templates/simple/result_templates/code.html>`
+template.  For highlighting the code passages, Pygments_ is used.
+
+.. _Pygments:  https://pygments.org
+
+----
+
+.. autoclass:: Code
+   :members:
+   :show-inheritance:
+
+"""
+# pylint: disable=too-few-public-methods, disable=invalid-name
+
+from __future__ import annotations
+
+__all__ = ["Code"]
+
+import typing as t
+
+from pygments import highlight  # pyright: ignore[reportUnknownVariableType]
+from pygments.lexers._mapping import LEXERS  # pyright: ignore[reportMissingTypeStubs]
+from pygments.lexers import guess_lexer, get_lexer_by_name, guess_lexer_for_filename
+from pygments.util import ClassNotFound
+from pygments.formatters import HtmlFormatter  # pylint: disable=no-name-in-module
+
+from ._base import MainResult
+
+
+_pygments_languages: list[str] = []
+
+
+def is_valid_language(code_language: str) -> bool:
+    """Tell whether ``code_language`` is an alias name of a Pygments lexer
+    (case-insensitive; the alias list is collected on the first call)."""
+    if not _pygments_languages:
+        # the third item of a LEXERS entry is the tuple with the alias names
+        for _module, _name, aliases, *_rest in LEXERS.values():
+            _pygments_languages.extend(alias.lower() for alias in aliases)
+    return code_language.lower() in _pygments_languages
+
+
+@t.final
+class Code(MainResult, kw_only=True):
+    """Result type for code fragments: numbered source lines which are
+    rendered as syntax highlighted HTML by Pygments (:py:obj:`Code.HTML`)."""
+
+    template: str = "code.html"
+
+    repository: str | None = None
+    """A link to the repository the *result* is related to."""
+
+    codelines: list[tuple[int, str]] = []
+    """A list of two-element tuples where the first item is the line number
+    and the second item is the code line."""
+
+    hl_lines: set[int] = set()
+    """A set of line numbers to highlight"""
+
+    code_language: str = "<guess>"
+    """Pygment's short name of the lexer, e.g. ``text`` for the
+    :py:obj:`pygments.lexers.special.TextLexer`.  For a list of available
+    languages consult: `Pygments languages`_.  If the language is not in this
+    list, a :py:obj:`ValueError` is raised.
+
+    The default is ``<guess>`` which has a special meaning;
+
+    - If :py:obj:`Code.filename` is set, Pygment's factory method
+      :py:obj:`pygments.lexers.guess_lexer_for_filename` is used to determine
+      the language of the ``codelines``.
+
+    - else Pygment's :py:obj:`pygments.lexers.guess_lexer` factory is used.
+
+    In case the language can't be detected, the fallback is ``text``.
+
+    .. _Pygments languages:  https://pygments.org/languages/
+    """
+
+    filename: str | None = None
+    """Optional file name, can help to ``<guess>`` the language of the code (in
+    case of ambiguous short code examples).  If :py:obj:`Code.title` is not set,
+    its default is the filename."""
+
+    strip_new_lines: bool = True
+    """Strip leading and trailing newlines for each returned fragment.
+    Single file might return multiple code fragments.
+    """
+
+    strip_whitespace: bool = False
+    """Strip all leading and trailing whitespace for each returned fragment.
+    Single file might return multiple code fragments. Enabling this might break
+    code indentation.
+    """
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        if not self.title and self.filename:
+            self.title = self.filename
+
+        if self.code_language != "<guess>" and not is_valid_language(self.code_language):
+            raise ValueError(f"unknown code_language: {self.code_language}")
+
+    def __hash__(self):
+        """The hash value is built up from URL and code lines. :py:obj:`Code
+        <Result.__eq__>` objects are equal, when the hash values of both objects
+        are equal.
+        """
+        return hash(f"{self.url} {self.codelines}")
+
+    def get_lexer(self):
+        """Return the Pygments lexer for :py:obj:`Code.code_language` (or a
+        guessed one), configured with the ``strip_*`` settings of the result."""
+        opts = {"stripnl": self.strip_new_lines, "stripall": self.strip_whitespace}
+        if self.code_language != "<guess>":
+            return get_lexer_by_name(self.code_language, **opts)
+
+        src_code = "\n".join([l[1] for l in self.codelines])
+        if self.filename:
+            try:
+                return guess_lexer_for_filename(self.filename, src_code, **opts)
+            except ClassNotFound:
+                pass
+        try:
+            return guess_lexer(src_code, **opts)
+        except ClassNotFound:
+            pass
+        return get_lexer_by_name("text", **opts)
+
+    def HTML(self, **options) -> str:  # pyright: ignore[reportUnknownParameterType, reportMissingParameterType]
+        """Rendered HTML, additional options are accepted, for more details have
+        a look at HtmlFormatter_.
+
+        .. _HtmlFormatter: https://pygments.org/docs/formatters/#HtmlFormatter
+        """
+        lexer = self.get_lexer()
+
+        code_block_start: int = 0  # line where the current code block starts
+        code_block_end: int | None = None  # line where the current code ends
+        code_block: list[str] = []  # lines of the current code block
+        html_code_blocks: list[str] = []  # HTML representation of all code blocks
+
+        def _render(**kwargs):  # pyright: ignore[reportUnknownParameterType, reportMissingParameterType]
+            for k, default in [
+                ("linenos", "inline"),
+                ("linenostart", code_block_start),
+                ("cssclass", "code-highlight"),
+                ("hl_lines", [hl - code_block_start + 1 for hl in self.hl_lines]),
+                # wrap the code inside <pre> using <code> (HTML5 recommendation)
+                ("wrapcode", True),
+            ]:
+                kwargs[k] = kwargs.get(k, default)  # pyright: ignore[reportUnknownMemberType]
+
+            html_code_blocks.append(
+                highlight(
+                    "\n".join(code_block),
+                    lexer,
+                    HtmlFormatter(**kwargs),  # pyright: ignore[reportUnknownArgumentType]
+                )
+            )
+
+        for line_no, code_line in self.codelines:
+            if code_block_end is None:
+                # initial start condition
+                code_block_start = line_no
+
+            if code_block_end is not None and code_block_end + 1 != line_no:
+                # new code block is detected, render current code block
+                _render(**options)  # pyright: ignore[reportUnknownArgumentType]
+                # reset conditions for next code block, which first line is the
+                # current code line
+                code_block = [code_line]
+                code_block_start = line_no
+                code_block_end = line_no
+                continue
+
+            # add line to the current code block and update last line number
+            code_block.append(code_line)
+            code_block_end = line_no
+
+        # highlight (last) code block
+        _render(**options)  # pyright: ignore[reportUnknownArgumentType]
+        return "\n".join(html_code_blocks)
--- a/searx/templates/simple/result_templates/code.html
+++ b/searx/templates/simple/result_templates/code.html
@ -10,22 +10,28 @@
 {%- endif -%}
 {%- if result.repository -%}
  <p class="content">{{- '' -}}
-    {{ _('repo') }}: {{- ' ' -}}
+    {{ _('Repository') }}: {{- ' ' -}}
    <a href="{{ result.repository|safe }}"{{- ' ' -}}
-       {% if results_on_new_tab %}
-         target="_blank" {{- ' ' -}}
-         rel="noopener noreferrer"
-       {%- else -%}
-         rel="noreferrer"
-       {%- endif -%}
-       >
-       {{- result.repository -}}
+      {% if results_on_new_tab %}
+      target="_blank" {{- ' ' -}}
+      rel="noopener noreferrer"
+      {%- else -%}
+      rel="noreferrer"
+      {%- endif -%}
+    >
+      {{- result.repository -}}
    </a>{{- '' -}}
  </p>
 {%- endif -%}

+{%- if result.filename %}
+  <p class="content">
+    {{ _('Filename') }}: {{ result.filename }} {# value comes from the remote API: keep autoescaping #}
+  </p>
+{% endif -%}
+
 <div dir="ltr" class="codelines">
-    {{- result.codelines|code_highlighter(result.code_language, result.hl_lines, result.strip_whitespace, result.strip_new_lines)|safe -}}
+    {{- result.HTML()|safe -}}
 </div>

 {{- result_sub_footer(result) -}}
--- a/tests/unit/test_engine_github_code.py
+++ b/tests/unit/test_engine_github_code.py
@ -142,29 +142,26 @@ class GithubCodeTests(SearxTestCase):
        results = self.ghc.response(response)
        expected_results = EngineResults()
        expected_results.add(
-            expected_results.types.LegacyResult(
-                **{
-                    'url': "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md",
-                    'title': "folke/dot · TODO.md",
-                    'content': "☕️   My Dot Files",
-                    'repository': "https://github.com/folke/dot",
-                    'codelines': [
-                        (1, "- [x] windows picker"),
-                        (2, "- [x] toggle cwd / root (LazyVim)"),
-                        (3, "- [x] dynamic workspace symbol"),
-                        (4, "- [x] smart stops working after custom"),
-                        (5, "- [x] edit in empty buffer"),
-                        (6, "- [x] support toggling line nr for preview"),
-                    ],
-                    'hl_lines': {2, 5, 6},
-                    'code_language': "markdown",
-                    'template': 'code.html',
-                    'strip_whitespace': False,
-                    'strip_new_lines': True,
-                    'parsed_url': urlparse(
-                        "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md"
-                    ),
-                }
+            expected_results.types.Code(
+                url="https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md",
+                title="folke/dot · TODO.md",
+                content="☕️   My Dot Files",
+                repository="https://github.com/folke/dot",
+                codelines=[
+                    (1, "- [x] windows picker"),
+                    (2, "- [x] toggle cwd / root (LazyVim)"),
+                    (3, "- [x] dynamic workspace symbol"),
+                    (4, "- [x] smart stops working after custom"),
+                    (5, "- [x] edit in empty buffer"),
+                    (6, "- [x] support toggling line nr for preview"),
+                ],
+                hl_lines={2, 5, 6},
+                code_language="markdown",
+                strip_whitespace=False,
+                strip_new_lines=True,
+                parsed_url=urlparse(
+                    "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md"
+                ),
            )
        )
        self.assertEqual(results, expected_results)