diff --git a/docs/conf.py b/docs/conf.py index a7221e48b..2d730f58c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -150,6 +150,7 @@ intersphinx_mapping = { "linuxdoc" : ("https://return42.github.io/linuxdoc/", None), "sphinx" : ("https://www.sphinx-doc.org/en/master/", None), "valkey": ('https://valkey-py.readthedocs.io/en/stable/', None), + "pygments": ("https://pygments.org/", None), } issues_github_path = "searxng/searxng" diff --git a/docs/dev/result_types/main/code.rst b/docs/dev/result_types/main/code.rst new file mode 100644 index 000000000..399cbd26e --- /dev/null +++ b/docs/dev/result_types/main/code.rst @@ -0,0 +1,7 @@ +.. _result_types.code: + +============ +Code Results +============ + +.. automodule:: searx.result_types.code diff --git a/docs/dev/result_types/main_result.rst b/docs/dev/result_types/main_result.rst index a76ed5e88..f072ea757 100644 --- a/docs/dev/result_types/main_result.rst +++ b/docs/dev/result_types/main_result.rst @@ -15,6 +15,7 @@ following types have been implemented so far .. main/mainresult main/keyvalue + main/code The :ref:`LegacyResult ` is used internally for the results that have not yet been typed. The templates can be used as orientation until the @@ -27,6 +28,5 @@ final typing is complete. - :ref:`template map` - :ref:`template paper` - :ref:`template packages` -- :ref:`template code` - :ref:`template files` - :ref:`template products` diff --git a/docs/dev/templates.rst b/docs/dev/templates.rst index 3633eb2ef..e2fa879c8 100644 --- a/docs/dev/templates.rst +++ b/docs/dev/templates.rst @@ -469,33 +469,6 @@ links : :py:class:`dict` Additional links in the form of ``{'link_name': 'http://example.com'}`` -.. _template code: - -``code.html`` -------------- - -Displays result fields from: - -- :ref:`macro result_header` and -- :ref:`macro result_sub_header` - -Additional fields used in the :origin:`code.html -`: - -content : :py:class:`str` - Description of the code fragment. 
- -codelines : ``[line1, line2, ...]`` - Lines of the code fragment. - -code_language : :py:class:`str` - Name of the code language, the value is passed to - :py:obj:`pygments.lexers.get_lexer_by_name`. - -repository : :py:class:`str` - URL of the repository of the code fragment. - - .. _template files: ``files.html`` diff --git a/searx/engines/github_code.py b/searx/engines/github_code.py index 4bafe9c0d..55060b8de 100644 --- a/searx/engines/github_code.py +++ b/searx/engines/github_code.py @@ -68,10 +68,8 @@ code blocks in a single file might be returned from the API). from __future__ import annotations import typing as t -from urllib.parse import urlencode, urlparse +from urllib.parse import urlencode -from pygments.lexers import guess_lexer_for_filename -from pygments.util import ClassNotFound from searx.result_types import EngineResults from searx.extended_types import SXNG_Response from searx.network import raise_for_httperror @@ -162,26 +160,10 @@ def request(query: str, params: dict[str, t.Any]) -> None: params['raise_for_httperror'] = False -def get_code_language_name(filename: str, code_snippet: str) -> str | None: - """Returns a code language name by pulling information from the filename if - possible otherwise by scanning the passed code snippet. In case there is any - parsing error just default to no syntax highlighting.""" - try: - lexer = guess_lexer_for_filename(filename, _text=code_snippet) - if lexer is None: - return None - code_name_aliases = lexer.aliases - if len(code_name_aliases) == 0: - return None - return code_name_aliases[0] - except ClassNotFound: - return None - - def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]: """ Iterate over multiple possible matches, for each extract a code fragment. 
- GitHub additionally sends context for _word_ highlights; pygments supports + Github additionally sends context for _word_ highlights; pygments supports highlighting lines, as such we calculate which lines to highlight while traversing the text. """ @@ -231,18 +213,18 @@ def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[i def response(resp: SXNG_Response) -> EngineResults: - results = EngineResults() + res = EngineResults() if resp.status_code == 422: # on a invalid search term the status code 422 "Unprocessable Content" # is returned / e.g. search term is "user: foo" instead "user:foo" - return results + return res # raise for other errors raise_for_httperror(resp) for item in resp.json().get('items', []): - repo = item['repository'] - text_matches = item['text_matches'] + repo: dict[str, str] = item['repository'] # pyright: ignore[reportAny] + text_matches: list[dict[str, str]] = item['text_matches'] # pyright: ignore[reportAny] # ensure picking only the code contents in the blob code_matches = [ match for match in text_matches if match["object_type"] == "FileContent" and match["property"] == "content" @@ -251,22 +233,18 @@ def response(resp: SXNG_Response) -> EngineResults: if not ghc_highlight_matching_lines: highlighted_lines_index: set[int] = set() - code_snippet = "\n".join(lines) + res.add( + res.types.Code( + url=item["html_url"], # pyright: ignore[reportAny] + title=f"{repo['full_name']} · {item['name']}", + filename=f"{item['path']}", + content=repo['description'], + repository=repo['html_url'], + codelines=[(i + 1, line) for (i, line) in enumerate(lines)], + hl_lines=highlighted_lines_index, + strip_whitespace=ghc_strip_whitespace, + strip_new_lines=ghc_strip_new_lines, + ) + ) - kwargs: dict[str, t.Any] = { - 'template': 'code.html', - 'url': item['html_url'], - 'title': f"{repo['full_name']} · {item['path']}", - 'content': repo['description'], - 'repository': repo['html_url'], - 'codelines': [(i + 1, line) for (i, line) 
in enumerate(lines)], - 'hl_lines': highlighted_lines_index, - 'code_language': get_code_language_name(filename=item['name'], code_snippet=code_snippet), - # important to set for highlighing - 'strip_whitespace': ghc_strip_whitespace, - 'strip_new_lines': ghc_strip_new_lines, - 'parsed_url': urlparse(item['html_url']), - } - results.add(results.types.LegacyResult(**kwargs)) - - return results + return res diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index 2196b0ad2..c0a6550a0 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -1,79 +1,62 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -"""Searchcode (IT) +"""Searchcode (IT)""" -""" +from __future__ import annotations + +import typing as t -from json import loads from urllib.parse import urlencode +from searx.result_types import EngineResults +from searx.extended_types import SXNG_Response + # about about = { - "website": 'https://searchcode.com/', + "website": "https://searchcode.com/", "wikidata_id": None, - "official_api_documentation": 'https://searchcode.com/api/', + "official_api_documentation": "https://searchcode.com/api/", "use_official_api": True, "require_api_key": False, - "results": 'JSON', + "results": "JSON", } # engine dependent config -categories = ['it'] -search_api = 'https://searchcode.com/api/codesearch_I/?' - -# special code-endings which are not recognised by the file ending -code_endings = {'cs': 'c#', 'h': 'c', 'hpp': 'cpp', 'cxx': 'cpp'} +categories = ["it"] +search_api = "https://searchcode.com/api/codesearch_I/?" # paging is broken in searchcode.com's API .. 
not sure it will ever been fixed # paging = True -def request(query, params): - args = urlencode( - { - 'q': query, - # paging is broken in searchcode.com's API - # 'p': params['pageno'] - 1, - # 'per_page': 10, - } - ) - params['url'] = search_api + args - logger.debug("query_url --> %s", params['url']) - return params +def request(query: str, params: dict[str, t.Any]) -> None: + args = { + "q": query, + # paging is broken in searchcode.com's API + # "p": params["pageno"] - 1, + # "per_page": 10, + } + + params["url"] = search_api + urlencode(args) + logger.debug("query_url --> %s", params["url"]) -def response(resp): - results = [] - - search_results = loads(resp.text) +def response(resp: SXNG_Response) -> EngineResults: + res = EngineResults() # parse results - for result in search_results.get('results', []): - href = result['url'] - title = "" + result['name'] + " - " + result['filename'] - repo = result['repo'] - + for result in resp.json().get("results", []): lines = {} - for line, code in result['lines'].items(): + for line, code in result["lines"].items(): lines[int(line)] = code - code_language = code_endings.get( - result['filename'].split('.')[-1].lower(), result['filename'].split('.')[-1].lower() + res.add( + res.types.Code( + url=result["url"], + title=f'{result["name"]} - {result["filename"]}', + repository=result["repo"], + filename=result["filename"], + codelines=sorted(lines.items()), + strip_whitespace=True, + ) ) - # append result - results.append( - { - 'url': href, - 'title': title, - 'content': '', - 'repository': repo, - 'codelines': sorted(lines.items()), - 'code_language': code_language, - 'template': 'code.html', - 'strip_whitespace': True, - 'strip_new_lines': True, - } - ) - - # return results - return results + return res diff --git a/searx/result_types/__init__.py b/searx/result_types/__init__.py index 6d47d3a4f..f4b37df07 100644 --- a/searx/result_types/__init__.py +++ b/searx/result_types/__init__.py @@ -13,25 +13,38 @@ from 
__future__ import annotations -__all__ = ["Result", "MainResult", "KeyValue", "EngineResults", "AnswerSet", "Answer", "Translations", "WeatherAnswer"] +__all__ = [ + "Result", + "MainResult", + "KeyValue", + "EngineResults", + "AnswerSet", + "Answer", + "Translations", + "WeatherAnswer", + "Code", +] +import typing as t import abc -from searx import enginelib - from ._base import Result, MainResult, LegacyResult from .answer import AnswerSet, Answer, Translations, WeatherAnswer from .keyvalue import KeyValue +from .code import Code -class ResultList(list, abc.ABC): +class ResultList(list, abc.ABC): # pyright: ignore[reportMissingTypeArgument] """Base class of all result lists (abstract).""" + @t.final class types: # pylint: disable=invalid-name - """The collection of result types (which have already been implemented).""" + """The collection of result types (which have already been + implemented).""" Answer = Answer KeyValue = KeyValue + Code = Code MainResult = MainResult Result = Result Translations = Translations @@ -42,11 +55,11 @@ class ResultList(list, abc.ABC): def __init__(self): # pylint: disable=useless-parent-delegation - super().__init__() + super().__init__() # pyright: ignore[reportUnknownMemberType] def add(self, result: Result | LegacyResult): """Add a :py:`Result` item to the result list.""" - self.append(result) + self.append(result) # pyright: ignore[reportUnknownMemberType] class EngineResults(ResultList): diff --git a/searx/result_types/code.py b/searx/result_types/code.py new file mode 100644 index 000000000..5350d74f3 --- /dev/null +++ b/searx/result_types/code.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Typification of the *code* results. Results of this type are rendered in +the :origin:`code.html ` +template. For highlighting the code passages, Pygments is used. + +.. _Pygments: https://pygments.org + +---- + +.. 
autoclass:: Code + :members: + :show-inheritance: + +""" +# pylint: disable=too-few-public-methods, disable=invalid-name + +from __future__ import annotations + +__all__ = ["Code"] + +import typing as t + +from pygments import highlight # pyright: ignore[reportUnknownVariableType] +from pygments.lexers._mapping import LEXERS # pyright: ignore[reportMissingTypeStubs] +from pygments.lexers import guess_lexer, get_lexer_by_name, guess_lexer_for_filename +from pygments.util import ClassNotFound +from pygments.formatters import HtmlFormatter # pylint: disable=no-name-in-module + +from ._base import MainResult + + +_pygments_languages: list[str] = [] + + +def is_valid_language(code_language: str) -> bool: + """Checks if the specified ``code_language`` is known in Pygments.""" + if not _pygments_languages: + for l in LEXERS.values(): + # l[2] is the tuple with the alias names + for alias_name in l[2]: + _pygments_languages.append(alias_name.lower()) + return code_language.lower() in _pygments_languages + + +@t.final +class Code(MainResult, kw_only=True): + """Simple table view which maps *key* names (first col) to *values* + (second col).""" + + template: str = "code.html" + + repository: str | None = None + """A link related to a repository related to the *result*""" + + codelines: list[tuple[int, str]] = [] + """A list of two digit tuples where the first item is the line number and + the second item is the code line.""" + + hl_lines: set[int] = set() + """A list of line numbers to highlight""" + + code_language: str = "" + """Pygment's short name of the lexer, e.g. ``text`` for the + :py:obj:`pygments.lexers.special.TextLexer`. For a list of available + languages consult: `Pygments languages`_. If the language is not in this + list, a :py:obj:`ValueError` is raised. 
+ + The default is ```` which has a special meaning; + + - If :py:obj:`Code.filename` is set, Pygment's factory method + :py:obj:`pygments.lexers.guess_lexer_for_filename` is used to determine + the language of the ``codelines``. + + - else Pygment's :py:obj:`pygments.lexers.guess_lexer` factory is used. + + In case the language can't be detected, the fallback is ``text``. + + .. _Pygments languages: https://pygments.org/languages/ + """ + + filename: str | None = None + """Optional file name, can help to ```` the language of the code (in + case of ambiguous short code examples). If :py:obj:`Code.title` is not set, + its default is the filename.""" + + strip_new_lines: bool = True + """Strip leading and trailing newlines for each returned fragment. + Single file might return multiple code fragments. + """ + + strip_whitespace: bool = False + """Strip all leading and trailing whitespace for each returned fragment. + Single file might return multiple code fragments. Enabling this might break + code indentation. + """ + + def __post_init__(self): + super().__post_init__() + + if not self.title and self.filename: + self.title = self.filename + + if self.code_language != "" and not is_valid_language(self.code_language): + raise ValueError(f"unknown code_language: {self.code_language}") + + def __hash__(self): + """The hash value is build up from URL and code lines. :py:obj:`Code + ` objects are equal, when the hash values of both objects + are equal. 
+ """ + return hash(f"{self.url} {self.codelines}") + + def get_lexer(self): + if self.code_language != "": + return get_lexer_by_name(self.code_language) + + src_code = "\n".join([l[1] for l in self.codelines]) + if self.filename: + try: + return guess_lexer_for_filename(self.filename, src_code) + except ClassNotFound: + pass + try: + return guess_lexer(src_code) + except ClassNotFound: + pass + return get_lexer_by_name("text") + + def HTML(self, **options) -> str: # pyright: ignore[reportUnknownParameterType, reportMissingParameterType] + """Rendered HTML, additional options are accepted, for more details have + a look at HtmlFormatter_. + + .. _HtmlFormatter: https://pygments.org/docs/formatters/#HtmlFormatter + """ + lexer = self.get_lexer() + + line_no: int = 0 # current line number + code_block_start: int = 0 # line where the current code block starts + code_block_end: int | None = None # line where the current code ends + code_block: list[str] = [] # lines of the current code block + html_code_blocks: list[str] = [] # HTML representation of all code blocks + + def _render(**kwargs): # pyright: ignore[reportUnknownParameterType, reportMissingParameterType] + for k, default in [ + ("linenos", "inline"), + ("linenostart", code_block_start), + ("cssclass", "code-highlight"), + ("hl_lines", [hl - code_block_start + 1 for hl in self.hl_lines]), + ]: + kwargs[k] = kwargs.get(k, default) # pyright: ignore[reportUnknownMemberType] + + # Wrap the code inside
<pre> blocks using <code>, as recommended by
+            # the HTML5 specification (default is False).  Do we need this?
+            kwargs["wrapcode"] = kwargs.get("wrapcode", True)
+
+            html_code_blocks.append(
+                highlight(
+                    "\n".join(code_block),
+                    lexer,
+                    HtmlFormatter(**kwargs),  # pyright: ignore[reportUnknownArgumentType]
+                )
+            )
+
+        for line_no, code_line in self.codelines:
+            if code_block_end is None:
+                # initial start condition
+                code_block_start = line_no
+
+            if code_block_end is not None and code_block_end + 1 != line_no:
+                # new code block is detected, render current code block
+                _render(**options)  # pyright: ignore[reportUnknownArgumentType]
+                # reset conditions for the next code block, whose first line is the
+                # current code line
+                code_block = [code_line]
+                code_block_start = line_no
+                code_block_end = line_no
+                continue
+
+            # add line to the current code block and update the last line number
+            code_block.append(code_line)
+            code_block_end = line_no
+
+        # highlight (last) code block
+        _render(**options)  # pyright: ignore[reportUnknownArgumentType]
+        return "\n".join(html_code_blocks)
diff --git a/searx/templates/simple/result_templates/code.html b/searx/templates/simple/result_templates/code.html
index bcde94358..6fba99a3e 100644
--- a/searx/templates/simple/result_templates/code.html
+++ b/searx/templates/simple/result_templates/code.html
@@ -10,22 +10,28 @@
 {%- endif -%}
 {%- if result.repository -%}
   

{{- '' -}} - {{ _('repo') }}: {{- ' ' -}} + {{ _('Repository') }}: {{- ' ' -}} - {{- result.repository -}} + {% if results_on_new_tab %} + target="_blank" {{- ' ' -}} + rel="noopener noreferrer" + {%- else -%} + rel="noreferrer" + {%- endif -%} + > + {{- result.repository -}} {{- '' -}}

{%- endif -%} +{%- if result.filename %} +

+ {{ _('Filename') }}: {{ result.filename|safe }} +

+{% endif -%} +
- {{- result.codelines|code_highlighter(result.code_language, result.hl_lines, result.strip_whitespace, result.strip_new_lines)|safe -}} + {{- result.HTML()|safe -}}
{{- result_sub_footer(result) -}} diff --git a/tests/unit/test_engine_github_code.py b/tests/unit/test_engine_github_code.py index d10081f28..13a560713 100644 --- a/tests/unit/test_engine_github_code.py +++ b/tests/unit/test_engine_github_code.py @@ -142,29 +142,26 @@ class GithubCodeTests(SearxTestCase): results = self.ghc.response(response) expected_results = EngineResults() expected_results.add( - expected_results.types.LegacyResult( - **{ - 'url': "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md", - 'title': "folke/dot · TODO.md", - 'content': "☕️ My Dot Files", - 'repository': "https://github.com/folke/dot", - 'codelines': [ - (1, "- [x] windows picker"), - (2, "- [x] toggle cwd / root (LazyVim)"), - (3, "- [x] dynamic workspace symbol"), - (4, "- [x] smart stops working after custom"), - (5, "- [x] edit in empty buffer"), - (6, "- [x] support toggling line nr for preview"), - ], - 'hl_lines': {2, 5, 6}, - 'code_language': "markdown", - 'template': 'code.html', - 'strip_whitespace': False, - 'strip_new_lines': True, - 'parsed_url': urlparse( - "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md" - ), - } + expected_results.types.Code( + url="https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md", + title="folke/dot · TODO.md", + content="☕️ My Dot Files", + repository="https://github.com/folke/dot", + codelines=[ + (1, "- [x] windows picker"), + (2, "- [x] toggle cwd / root (LazyVim)"), + (3, "- [x] dynamic workspace symbol"), + (4, "- [x] smart stops working after custom"), + (5, "- [x] edit in empty buffer"), + (6, "- [x] support toggling line nr for preview"), + ], + hl_lines={2, 5, 6}, + code_language="markdown", + strip_whitespace=False, + strip_new_lines=True, + parsed_url=urlparse( + "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md" + ), ) ) self.assertEqual(results, expected_results)