Merge pull request #2600 from dalf/searx-extra

Add searx_extra package
Alexandre Flament 2021-03-05 09:43:39 +01:00 committed by GitHub
commit aaae9a209e
21 changed files with 47 additions and 75 deletions

@@ -1,33 +0,0 @@
#!/usr/bin/env python
# This script saves Ahmia's blacklist for onion sites.
# More info at https://ahmia.fi/blacklist/
# set path
from sys import path
from os.path import realpath, dirname, join
path.append(realpath(dirname(realpath(__file__)) + '/../'))
#
import requests
from searx import searx_dir
URL = 'https://ahmia.fi/blacklist/banned/'
def fetch_ahmia_blacklist():
resp = requests.get(URL, timeout=3.0)
if resp.status_code != 200:
raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code)
else:
blacklist = resp.text.split()
return blacklist
def get_ahmia_blacklist_filename():
return join(join(searx_dir, "data"), "ahmia_blacklist.txt")
blacklist = fetch_ahmia_blacklist()
with open(get_ahmia_blacklist_filename(), "w") as f:
f.write('\n'.join(blacklist))
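
The blacklist is written to searx/data/ahmia_blacklist.txt with one entry per line. A minimal sketch of loading it back into a set for membership checks; load_ahmia_blacklist is an illustrative helper, not something defined in this file:

from os.path import join
from searx import searx_dir

def load_ahmia_blacklist():
    # read the file written above into a set for fast membership tests
    with open(join(searx_dir, "data", "ahmia_blacklist.txt"), encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}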

@@ -1,151 +0,0 @@
#!/usr/bin/env python
import re
import unicodedata
import json
# set path
from sys import path
from os.path import realpath, dirname, join
path.append(realpath(dirname(realpath(__file__)) + '/../'))
from searx import searx_dir, settings
from searx.engines.wikidata import send_wikidata_query
# ORDER BY (with all the query fields) is important to keep a deterministic result order
# so multiple invocations of this script don't change currencies.json
SARQL_REQUEST = """
SELECT DISTINCT ?iso4217 ?unit ?unicode ?label ?alias WHERE {
?item wdt:P498 ?iso4217; rdfs:label ?label.
OPTIONAL { ?item skos:altLabel ?alias FILTER (LANG (?alias) = LANG(?label)). }
OPTIONAL { ?item wdt:P5061 ?unit. }
OPTIONAL { ?item wdt:P489 ?symbol.
?symbol wdt:P487 ?unicode. }
MINUS { ?item wdt:P582 ?end_data . } # Ignore money with an end date
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
FILTER(LANG(?label) IN (%LANGUAGES_SPARQL%)).
}
ORDER BY ?iso4217 ?unit ?unicode ?label ?alias
"""
# ORDER BY (with all the query fields) is important to keep a deterministic result order
# so multiple invocations of this script don't change currencies.json
SPARQL_WIKIPEDIA_NAMES_REQUEST = """
SELECT DISTINCT ?iso4217 ?article_name WHERE {
?item wdt:P498 ?iso4217 .
?article schema:about ?item ;
schema:name ?article_name ;
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ]
MINUS { ?item wdt:P582 ?end_data . } # Ignore money with an end date
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
FILTER(LANG(?article_name) IN (%LANGUAGES_SPARQL%)).
}
ORDER BY ?iso4217 ?article_name
"""
LANGUAGES = settings['locales'].keys()
LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
def remove_accents(name):
return unicodedata.normalize('NFKD', name).lower()
def remove_extra(name):
for c in ('(', ':'):
if c in name:
name = name.split(c)[0].strip()
return name
def _normalize_name(name):
name = re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' '))
name = remove_extra(name)
return name
def add_currency_name(db, name, iso4217, normalize_name=True):
db_names = db['names']
if normalize_name:
name = _normalize_name(name)
iso4217_set = db_names.setdefault(name, [])
if iso4217 not in iso4217_set:
iso4217_set.insert(0, iso4217)
def add_currency_label(db, label, iso4217, language):
labels = db['iso4217'].setdefault(iso4217, {})
labels[language] = label
def wikidata_request_result_iterator(request):
result = send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
if result is not None:
for r in result['results']['bindings']:
yield r
def fetch_db():
db = {
'names': {},
'iso4217': {},
}
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
iso4217 = r['iso4217']['value']
article_name = r['article_name']['value']
article_lang = r['article_name']['xml:lang']
add_currency_name(db, article_name, iso4217)
add_currency_label(db, article_name, iso4217, article_lang)
for r in wikidata_request_result_iterator(SARQL_REQUEST):
iso4217 = r['iso4217']['value']
if 'label' in r:
label = r['label']['value']
label_lang = r['label']['xml:lang']
add_currency_name(db, label, iso4217)
add_currency_label(db, label, iso4217, label_lang)
if 'alias' in r:
add_currency_name(db, r['alias']['value'], iso4217)
if 'unicode' in r:
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False)
if 'unit' in r:
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False)
# reduce memory usage:
# replace lists with one item by the item.
# see searx.search.processors.online_currency.name_to_iso4217
for name in db['names']:
if len(db['names'][name]) == 1:
db['names'][name] = db['names'][name][0]
return db
def get_filename():
return join(join(searx_dir, "data"), "currencies.json")
def main():
#
db = fetch_db()
# static
add_currency_name(db, "euro", 'EUR')
add_currency_name(db, "euros", 'EUR')
add_currency_name(db, "dollar", 'USD')
add_currency_name(db, "dollars", 'USD')
add_currency_name(db, "peso", 'MXN')
add_currency_name(db, "pesos", 'MXN')
with open(get_filename(), 'w', encoding='utf8') as f:
json.dump(db, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
main()
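
Because the generated currencies.json stores a single ISO 4217 code as a plain string and several candidates as a list (see the memory-usage comment in fetch_db), any consumer has to handle both shapes. A minimal sketch of such a lookup, assuming the file path used by get_filename above; name_to_iso4217 here is an illustrative re-implementation, not the helper in searx.search.processors.online_currency:

import json
from os.path import join
from searx import searx_dir

def load_currencies_db():
    with open(join(searx_dir, "data", "currencies.json"), encoding="utf8") as f:
        return json.load(f)

def name_to_iso4217(db, normalized_name):
    # the 'names' table maps a normalized currency name either to a single
    # ISO 4217 code (str) or to a list of candidate codes; take the first one
    entry = db['names'].get(normalized_name)
    if entry is None:
        return None
    return entry if isinstance(entry, str) else entry[0]

# e.g. name_to_iso4217(load_currencies_db(), 'euro') should yield 'EUR'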

@@ -1,206 +0,0 @@
#!/usr/bin/env python
import sys
import json
from urllib.parse import quote, urlparse
from os.path import realpath, dirname
import cld3
from lxml.html import fromstring
# set path
sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
from searx.engines.wikidata import send_wikidata_query
from searx.utils import extract_text
import searx
import searx.search
import searx.poolrequests
SPARQL_WIKIPEDIA_ARTICLE = """
SELECT DISTINCT ?item ?name
WHERE {
VALUES ?item { %IDS% }
?article schema:about ?item ;
schema:inLanguage ?lang ;
schema:name ?name ;
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
FILTER(?lang in (%LANGUAGES_SPARQL%)) .
FILTER (!CONTAINS(?name, ':')) .
}
"""
SPARQL_DESCRIPTION = """
SELECT DISTINCT ?item ?itemDescription
WHERE {
VALUES ?item { %IDS% }
?item schema:description ?itemDescription .
FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
}
ORDER BY ?itemLang
"""
LANGUAGES = searx.settings['locales'].keys()
LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
IDS = None
descriptions = {}
wd_to_engine_name = {}
def normalize_description(description):
for c in [chr(c) for c in range(0, 31)]:
description = description.replace(c, ' ')
description = ' '.join(description.strip().split())
return description
def update_description(engine_name, lang, description, source, replace=True):
if replace or lang not in descriptions[engine_name]:
descriptions[engine_name][lang] = [normalize_description(description), source]
def get_wikipedia_summary(language, pageid):
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
url = search_url.format(title=quote(pageid), language=language)
try:
response = searx.poolrequests.get(url)
response.raise_for_status()
api_result = json.loads(response.text)
return api_result.get('extract')
except:
return None
def detect_language(text):
r = cld3.get_language(str(text)) # pylint: disable=E1101
if r is not None and r.probability >= 0.98 and r.is_reliable:
return r.language
return None
def get_website_description(url, lang1, lang2=None):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Sec-GPC': '1',
'Cache-Control': 'max-age=0',
}
if lang1 is not None:
lang_list = [lang1]
if lang2 is not None:
lang_list.append(lang2)
headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
try:
response = searx.poolrequests.get(url, headers=headers, timeout=10)
response.raise_for_status()
except Exception:
return (None, None)
try:
html = fromstring(response.text)
except ValueError:
html = fromstring(response.content)
description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
if not description:
description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
if not description:
description = extract_text(html.xpath('/html/head/title'))
lang = extract_text(html.xpath('/html/@lang'))
if lang is None and len(lang1) > 0:
lang = lang1
lang = detect_language(description) or lang or 'en'
lang = lang.split('_')[0]
lang = lang.split('-')[0]
return (lang, description)
def initialize():
global descriptions, wd_to_engine_name, IDS
searx.search.initialize()
for engine_name, engine in searx.engines.engines.items():
descriptions[engine_name] = {}
wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
if wikidata_id is not None:
wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)
IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys())))
def fetch_wikidata_descriptions():
global IDS
result = send_wikidata_query(SPARQL_DESCRIPTION
.replace('%IDS%', IDS)
.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
if result is not None:
for binding in result['results']['bindings']:
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
lang = binding['itemDescription']['xml:lang']
description = binding['itemDescription']['value']
if ' ' in description: # skip single-word descriptions (like "website")
for engine_name in wd_to_engine_name[wikidata_id]:
update_description(engine_name, lang, description, 'wikidata')
def fetch_wikipedia_descriptions():
global IDS
result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE
.replace('%IDS%', IDS)
.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
if result is not None:
for binding in result['results']['bindings']:
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
lang = binding['name']['xml:lang']
pageid = binding['name']['value']
description = get_wikipedia_summary(lang, pageid)
if description is not None and ' ' in description:
for engine_name in wd_to_engine_name[wikidata_id]:
update_description(engine_name, lang, description, 'wikipedia')
def normalize_url(url):
url = url.replace('{language}', 'en')
url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl()
url = url.replace('https://api.', 'https://')
return url
def fetch_website_description(engine_name, website):
default_lang, default_description = get_website_description(website, None, None)
if default_lang is None or default_description is None:
return
if default_lang not in descriptions[engine_name]:
descriptions[engine_name][default_lang] = [normalize_description(default_description), website]
for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
if request_lang.split('-')[0] not in descriptions[engine_name]:
lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0])
if desc is not None and desc != default_description:
update_description(engine_name, lang, desc, website, replace=False)
else:
break
def fetch_website_descriptions():
for engine_name, engine in searx.engines.engines.items():
website = getattr(engine, "about", {}).get('website')
if website is None:
website = normalize_url(getattr(engine, "search_url"))
if website is None:
website = normalize_url(getattr(engine, "base_url"))
if website is not None:
fetch_website_description(engine_name, website)
def main():
initialize()
fetch_wikidata_descriptions()
fetch_wikipedia_descriptions()
fetch_website_descriptions()
sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False))
if __name__ == "__main__":
main()
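
The script prints a mapping of engine name to language to [description, source] pairs on stdout. A minimal sketch of reading such a dump (redirected to a file) and picking a description for a preferred language with an English fallback; the dump file name is an assumption, not fixed by this script:

import json

def pick_description(dump_path, engine_name, preferred_lang):
    # dump_path is a file produced by redirecting this script's stdout,
    # e.g. engine_descriptions.json; entries are [description, source] pairs
    with open(dump_path, encoding='utf-8') as f:
        descriptions = json.load(f)
    per_lang = descriptions.get(engine_name, {})
    entry = per_lang.get(preferred_lang) or per_lang.get('en')
    return entry[0] if entry else None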

@@ -1,161 +0,0 @@
#!/usr/bin/env python
"""
Update searx/data/external_bangs.json using the DuckDuckGo bangs.
https://duckduckgo.com/newbang loads
* a javascript which provides the bang version (https://duckduckgo.com/bv1.js)
* a JSON file which contains the bangs (for example https://duckduckgo.com/bang.v260.js)
This script loads the javascript, then the bangs.
The javascript URL may change in the future (for example https://duckduckgo.com/bv2.js),
but it will most probably require updating RE_BANG_VERSION.
"""
# pylint: disable=C0116
import sys
import json
import re
from os.path import realpath, dirname, join
import requests
# set path
sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
from searx import searx_dir # pylint: disable=E0401 C0413
# from https://duckduckgo.com/newbang
URL_BV1 = 'https://duckduckgo.com/bv1.js'
RE_BANG_VERSION = re.compile(r'\/bang\.v([0-9]+)\.js')
HTTPS_COLON = 'https:'
HTTP_COLON = 'http:'
def get_bang_url():
response = requests.get(URL_BV1)
response.raise_for_status()
r = RE_BANG_VERSION.findall(response.text)
return f'https://duckduckgo.com/bang.v{r[0]}.js', r[0]
def fetch_ddg_bangs(url):
response = requests.get(url)
response.raise_for_status()
return json.loads(response.content.decode())
def merge_when_no_leaf(node):
"""Minimize the number of nodes
A -> B -> C
B is a child of A
C is a child of B
If no C equals '*', then each C is merged into A
For example:
d -> d -> g -> * (ddg*)
-> i -> g -> * (dig*)
becomes
d -> dg -> *
-> ig -> *
"""
restart = False
if not isinstance(node, dict):
return
# create a copy of the keys so node can be modified
keys = list(node.keys())
for key in keys:
if key == '*':
continue
value = node[key]
value_keys = list(value.keys())
if '*' not in value_keys:
for value_key in value_keys:
node[key + value_key] = value[value_key]
merge_when_no_leaf(node[key + value_key])
del node[key]
restart = True
else:
merge_when_no_leaf(value)
if restart:
merge_when_no_leaf(node)
def optimize_leaf(parent, parent_key, node):
if not isinstance(node, dict):
return
if len(node) == 1 and '*' in node and parent is not None:
parent[parent_key] = node['*']
else:
for key, value in node.items():
optimize_leaf(node, key, value)
def parse_ddg_bangs(ddg_bangs):
bang_trie = {}
bang_urls = {}
for bang_definition in ddg_bangs:
# bang_list
bang_url = bang_definition['u']
if '{{{s}}}' not in bang_url:
# ignore invalid bang
continue
bang_url = bang_url.replace('{{{s}}}', chr(2))
# only for the https protocol: "https://example.com" becomes "//example.com"
if bang_url.startswith(HTTPS_COLON + '//'):
bang_url = bang_url[len(HTTPS_COLON):]
#
if bang_url.startswith(HTTP_COLON + '//') and bang_url[len(HTTP_COLON):] in bang_urls:
# if the bang_url uses the http:// protocol and the same URL also exists with https://,
# then reuse the https:// bang definition (written //example.com)
bang_def_output = bang_urls[bang_url[len(HTTP_COLON):]]
else:
# normal use case: new http:// URL or https:// URL (without "https:", see above)
bang_rank = str(bang_definition['r'])
bang_def_output = bang_url + chr(1) + bang_rank
bang_def_output = bang_urls.setdefault(bang_url, bang_def_output)
bang_urls[bang_url] = bang_def_output
# bang name
bang = bang_definition['t']
# bang_trie
t = bang_trie
for bang_letter in bang:
t = t.setdefault(bang_letter, {})
t = t.setdefault('*', bang_def_output)
# optimize the trie
merge_when_no_leaf(bang_trie)
optimize_leaf(None, None, bang_trie)
return bang_trie
def get_bangs_filename():
return join(join(searx_dir, "data"), "external_bangs.json")
if __name__ == '__main__':
bangs_url, bangs_version = get_bang_url()
print(f'fetch bangs from {bangs_url}')
output = {
'version': bangs_version,
'trie': parse_ddg_bangs(fetch_ddg_bangs(bangs_url))
}
with open(get_bangs_filename(), 'w') as fp:
json.dump(output, fp, ensure_ascii=False, indent=4)
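
Given the leaf encoding above (chr(2) marks the query placeholder, chr(1) separates the URL from its rank, a leading '//' stands for 'https://') and the multi-letter keys produced by merge_when_no_leaf, resolving a bang is a short walk over the generated trie. A minimal sketch under those assumptions; resolve_bang is an illustrative helper, not searx's implementation:

def resolve_bang(trie, bang, query):
    # walk the trie; after merge_when_no_leaf an edge may consume several letters
    node = trie
    while isinstance(node, dict):
        for key, child in node.items():
            if key != '*' and bang.startswith(key):
                node, bang = child, bang[len(key):]
                break
        else:
            node = node.get('*')
            break
    if bang or not isinstance(node, str):
        return None
    # a leaf is "url<chr(1)>rank"; restore the https:// prefix if it was stripped
    url = node.split(chr(1))[0]
    if url.startswith('//'):
        url = 'https:' + url
    # chr(2) marks where the (already URL-encoded) search terms go
    return url.replace(chr(2), query)

# usage sketch: resolve_bang(output['trie'], 'ddg', 'searx') with the structure written above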

@@ -1,73 +0,0 @@
#!/usr/bin/env python
# set path
from sys import path
from os.path import realpath, dirname, join
path.append(realpath(dirname(realpath(__file__)) + '/../'))
#
import json
import requests
import re
from urllib.parse import urlparse, urljoin
from distutils.version import LooseVersion, StrictVersion
from lxml import html
from searx import searx_dir
URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
RELEASE_PATH = '/pub/firefox/releases/'
NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$')
# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$')
# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$')
#
useragents = {
"versions": (),
"os": ('Windows NT 10.0; WOW64',
'X11; Linux x86_64'),
"ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"
}
def fetch_firefox_versions():
resp = requests.get(URL, timeout=2.0)
if resp.status_code != 200:
raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code)
else:
dom = html.fromstring(resp.text)
versions = []
for link in dom.xpath('//a/@href'):
url = urlparse(urljoin(URL, link))
path = url.path
if path.startswith(RELEASE_PATH):
version = path[len(RELEASE_PATH):-1]
if NORMAL_REGEX.match(version):
versions.append(LooseVersion(version))
list.sort(versions, reverse=True)
return versions
def fetch_firefox_last_versions():
versions = fetch_firefox_versions()
result = []
major_last = versions[0].version[0]
major_list = (major_last, major_last - 1)
for version in versions:
major_current = version.version[0]
if major_current in major_list:
result.append(version.vstring)
return result
def get_useragents_filename():
return join(join(searx_dir, "data"), "useragents.json")
useragents["versions"] = fetch_firefox_last_versions()
with open(get_useragents_filename(), "w") as f:
json.dump(useragents, f, indent=4, ensure_ascii=False)
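
The resulting useragents.json holds the version strings of the two most recent Firefox major releases, two OS strings and the "ua" format template, so building a user agent is just filling the template. A minimal sketch, assuming the file written above:

import json
import random
from os.path import join
from searx import searx_dir

def random_user_agent():
    with open(join(searx_dir, "data", "useragents.json"), encoding="utf-8") as f:
        ua_data = json.load(f)
    # pick one of the fetched Firefox versions and one OS string,
    # then fill the template defined above
    return ua_data["ua"].format(
        os=random.choice(ua_data["os"]),
        version=random.choice(ua_data["versions"]),
    )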

@@ -1,207 +0,0 @@
# -*- coding: utf-8 -*-
# This script generates languages.py by intersecting each engine's supported languages.
#
# Output files: searx/data/engines_languages.json and searx/languages.py
import json
from pathlib import Path
from pprint import pformat
from sys import path
from babel import Locale, UnknownLocaleError
from babel.languages import get_global
path.append('../searx') # noqa
from searx import settings, searx_dir
from searx.engines import initialize_engines, engines
# Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py'
# Fetches the supported languages for each engine and writes a JSON file with them.
def fetch_supported_languages():
engines_languages = dict()
names = list(engines)
names.sort()
for engine_name in names:
if hasattr(engines[engine_name], 'fetch_supported_languages'):
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
print("fetched %s languages from engine %s" % (
len(engines_languages[engine_name]), engine_name))
if type(engines_languages[engine_name]) == list:
engines_languages[engine_name] = sorted(engines_languages[engine_name])
# write json file
with open(engines_languages_file, 'w', encoding='utf-8') as f:
json.dump(engines_languages, f, indent=2, sort_keys=True)
return engines_languages
# Get babel Locale object from lang_code if possible.
def get_locale(lang_code):
try:
locale = Locale.parse(lang_code, sep='-')
return locale
except (UnknownLocaleError, ValueError):
return None
# Join all language lists.
def join_language_lists(engines_languages):
language_list = dict()
for engine_name in engines_languages:
for lang_code in engines_languages[engine_name]:
# apply custom fixes if necessary
if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
if lang_code == alias)
locale = get_locale(lang_code)
# ensure that lang_code uses standard language and country codes
if locale and locale.territory:
lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
short_code = lang_code.split('-')[0]
# add language without country if not in list
if short_code not in language_list:
if locale:
# get language's data from babel's Locale object
language_name = locale.get_language_name().title()
english_name = locale.english_name.split(' (')[0]
elif short_code in engines_languages['wikipedia']:
# get language's data from wikipedia if not known by babel
language_name = engines_languages['wikipedia'][short_code]['name']
english_name = engines_languages['wikipedia'][short_code]['english_name']
else:
language_name = None
english_name = None
# add language to list
language_list[short_code] = {'name': language_name,
'english_name': english_name,
'counter': set(),
'countries': dict()}
# add language with country if not in list
if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
country_name = ''
if locale:
# get country name from babel's Locale object
country_name = locale.get_territory_name()
language_list[short_code]['countries'][lang_code] = {'country_name': country_name,
'counter': set()}
# count engine for both language_country combination and language alone
language_list[short_code]['counter'].add(engine_name)
if lang_code != short_code:
language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
return language_list
# Filter language list so it only includes the most supported languages and countries
def filter_language_list(all_languages):
min_engines_per_lang = 15
min_engines_per_country = 10
main_engines = [engine_name for engine_name in engines.keys()
if 'general' in engines[engine_name].categories and
engines[engine_name].supported_languages and
not engines[engine_name].disabled]
# filter list to include only languages supported by most engines or all default general engines
filtered_languages = {code: lang for code, lang
in all_languages.items()
if (len(lang['counter']) >= min_engines_per_lang or
all(main_engine in lang['counter']
for main_engine in main_engines))}
def _copy_lang_data(lang, country_name=None):
new_dict = dict()
new_dict['name'] = all_languages[lang]['name']
new_dict['english_name'] = all_languages[lang]['english_name']
if country_name:
new_dict['country_name'] = country_name
return new_dict
def _country_count(i):
return len(countries[sorted_countries[i]]['counter'])
# for each language get country codes supported by most engines or at least one country code
filtered_languages_with_countries = dict()
for lang, lang_data in filtered_languages.items():
countries = lang_data['countries']
filtered_countries = dict()
# get language's country codes with enough supported engines
for lang_country, country_data in countries.items():
if len(country_data['counter']) >= min_engines_per_country:
filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])
# add language without countries too if there's more than one country to choose from
if len(filtered_countries) > 1:
filtered_countries[lang] = _copy_lang_data(lang)
elif len(filtered_countries) == 1:
# if there's only one country per language, it's not necessary to show country name
lang_country = next(iter(filtered_countries))
filtered_countries[lang_country]['country_name'] = None
# if no country has enough engines try to get most likely country code from babel
if not filtered_countries:
lang_country = None
subtags = get_global('likely_subtags').get(lang)
if subtags:
country_code = subtags.split('_')[-1]
if len(country_code) == 2:
lang_country = "{lang}-{country}".format(lang=lang, country=country_code)
if lang_country:
filtered_countries[lang_country] = _copy_lang_data(lang)
else:
filtered_countries[lang] = _copy_lang_data(lang)
filtered_languages_with_countries.update(filtered_countries)
return filtered_languages_with_countries
# Write languages.py.
def write_languages_file(languages):
file_headers = (
"# -*- coding: utf-8 -*-",
"# list of language codes",
"# this file is generated automatically by utils/fetch_languages.py",
"language_codes ="
)
language_codes = tuple([
(
code,
languages[code]['name'].split(' (')[0],
languages[code].get('country_name') or '',
languages[code].get('english_name') or ''
) for code in sorted(languages)
])
with open(languages_file, 'w') as new_file:
file_content = "{file_headers} \\\n{language_codes}".format(
file_headers='\n'.join(file_headers),
language_codes=pformat(language_codes, indent=4)
)
new_file.write(file_content)
new_file.close()
if __name__ == "__main__":
initialize_engines(settings['engines'])
engines_languages = fetch_supported_languages()
all_languages = join_language_lists(engines_languages)
filtered_languages = filter_language_list(all_languages)
write_languages_file(filtered_languages)
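
write_languages_file emits language_codes as a tuple of (code, name, country_name, english_name) tuples. A minimal sketch of turning the generated searx/languages.py back into a lookup table keyed by code, assuming the module is importable:

from searx.languages import language_codes

# tuple layout per entry: (code, name, country_name, english_name), as written above
LANGUAGES_BY_CODE = {
    code: {'name': name, 'country_name': country or None, 'english_name': english or None}
    for code, name, country, english in language_codes
}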

@@ -1,56 +0,0 @@
#!/usr/bin/env python
import json
import collections
# set path
from sys import path
from os.path import realpath, dirname, join
path.append(realpath(dirname(realpath(__file__)) + '/../'))
from searx import searx_dir
from searx.engines.wikidata import send_wikidata_query
# The response contains duplicate ?item entries with different ?symbol values.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result,
# even if an ?item has several ?symbol values of the same rank,
# so repeated runs of this script produce the same wikidata_units.json.
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# see the result for https://www.wikidata.org/wiki/Q11582
# there are multiple symbols with the same rank
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574 .
?item p:P5061 ?symbolP .
?symbolP ps:P5061 ?symbol ;
wikibase:rank ?rank .
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
def get_data():
results = collections.OrderedDict()
response = send_wikidata_query(SARQL_REQUEST)
for unit in response['results']['bindings']:
name = unit['item']['value'].replace('http://www.wikidata.org/entity/', '')
unit = unit['symbol']['value']
if name not in results:
# ignore duplicate: always use the first one
results[name] = unit
return results
def get_wikidata_units_filename():
return join(join(searx_dir, "data"), "wikidata_units.json")
with open(get_wikidata_units_filename(), 'w') as f:
json.dump(get_data(), f, indent=4, ensure_ascii=False)
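
The generated wikidata_units.json is a flat mapping from a Wikidata item id to its unit symbol, so consumers only need a dictionary lookup. A minimal sketch, assuming the file written above; get_unit_symbol is an illustrative helper:

import json
from os.path import join
from searx import searx_dir

def get_unit_symbol(item_id):
    # item_id is a Wikidata entity id such as 'Q11582' (see the query comment above)
    with open(join(searx_dir, "data", "wikidata_units.json"), encoding="utf-8") as f:
        return json.load(f).get(item_id)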

@@ -1,35 +0,0 @@
from sys import argv, exit
if not len(argv) > 1:
print('search query required')
exit(1)
import requests
from json import dumps
from searx.engines import google
from searx.search import default_request_params
request_params = default_request_params()
# Possible params
# request_params['headers']['User-Agent'] = ''
# request_params['category'] = ''
request_params['pageno'] = 1
request_params['language'] = 'en_us'
request_params['time_range'] = ''
params = google.request(argv[1], request_params)
request_args = dict(
headers=request_params['headers'],
cookies=request_params['cookies'],
)
if request_params['method'] == 'GET':
req = requests.get
else:
req = requests.post
request_args['data'] = request_params['data']
resp = req(request_params['url'], **request_args)
resp.search_params = request_params
print(dumps(google.response(resp)))

@@ -1,217 +0,0 @@
#!/usr/bin/env python
"""Script to run searx from terminal.
Getting categories without initializing the engines will only return `['general']`
>>> import searx.engines
... list(searx.engines.categories.keys())
['general']
>>> import searx.search
... searx.search.initialize()
... list(searx.engines.categories.keys())
['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map']
Example to use this script:
.. code:: bash
$ python3 utils/standalone_searx.py rain
Example to run it from python:
>>> import importlib
... import json
... import sys
... import searx.engines
... import searx.search
... search_query = 'rain'
... # initialize engines
... searx.search.initialize()
... # load engine categories once instead of each time the function is called
... engine_cs = list(searx.engines.categories.keys())
... # load module
... spec = importlib.util.spec_from_file_location(
... 'utils.standalone_searx', 'utils/standalone_searx.py')
... sas = importlib.util.module_from_spec(spec)
... spec.loader.exec_module(sas)
... # use function from module
... prog_args = sas.parse_argument([search_query], category_choices=engine_cs)
... search_q = sas.get_search_query(prog_args, engine_categories=engine_cs)
... res_dict = sas.to_dict(search_q)
... sys.stdout.write(json.dumps(
... res_dict, sort_keys=True, indent=4, ensure_ascii=False,
... default=sas.json_serial))
{
"answers": [],
"infoboxes": [ {...} ],
"paging": true,
"results": [... ],
"results_number": 820000000.0,
"search": {
"lang": "all",
"pageno": 1,
"q": "rain",
"safesearch": 0,
"timerange": null
},
"suggestions": [...]
}
""" # noqa: E501
# pylint: disable=pointless-string-statement
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2016- by Alexandre Flament, <alex@al-f.net>
'''
# pylint: disable=wrong-import-position
import argparse
import sys
from datetime import datetime
from json import dumps
from typing import Any, Dict, List, Optional
import searx
import searx.preferences
import searx.query
import searx.search
import searx.webadapter
EngineCategoriesVar = Optional[List[str]]
def get_search_query(
args: argparse.Namespace, engine_categories: EngineCategoriesVar = None
) -> searx.search.SearchQuery:
"""Get search results for the query"""
if engine_categories is None:
engine_categories = list(searx.engines.categories.keys())
try:
category = args.category.decode('utf-8')
except AttributeError:
category = args.category
form = {
"q": args.query,
"categories": category,
"pageno": str(args.pageno),
"language": args.lang,
"time_range": args.timerange
}
preferences = searx.preferences.Preferences(
['oscar'], engine_categories, searx.engines.engines, [])
preferences.key_value_settings['safesearch'].parse(args.safesearch)
search_query = searx.webadapter.get_search_query_from_webapp(
preferences, form)[0]
return search_query
def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Remove parsed url from dict."""
for result in results:
del result['parsed_url']
return results
def json_serial(obj: Any) -> Any:
"""JSON serializer for objects not serializable by default json code.
:raise TypeError: raised when **obj** is not serializable
"""
if isinstance(obj, datetime):
serial = obj.isoformat()
return serial
if isinstance(obj, bytes):
return obj.decode('utf8')
if isinstance(obj, set):
return list(obj)
raise TypeError("Type ({}) not serializable".format(type(obj)))
def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
"""Get result from parsed arguments."""
result_container = searx.search.Search(search_query).search()
result_container_json = {
"search": {
"q": search_query.query,
"pageno": search_query.pageno,
"lang": search_query.lang,
"safesearch": search_query.safesearch,
"timerange": search_query.time_range,
},
"results": no_parsed_url(result_container.get_ordered_results()),
"infoboxes": result_container.infoboxes,
"suggestions": list(result_container.suggestions),
"answers": list(result_container.answers),
"paging": result_container.paging,
"results_number": result_container.results_number()
}
return result_container_json
def parse_argument(
args: Optional[List[str]]=None,
category_choices: EngineCategoriesVar=None
) -> argparse.Namespace:
"""Parse command line.
:raise SystemExit: Query argument required on `args`
Examples:
>>> import importlib
... # load module
... spec = importlib.util.spec_from_file_location(
... 'utils.standalone_searx', 'utils/standalone_searx.py')
... sas = importlib.util.module_from_spec(spec)
... spec.loader.exec_module(sas)
... sas.parse_argument()
usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]]
query
SystemExit: 2
>>> sas.parse_argument(['rain'])
Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None)
""" # noqa: E501
if not category_choices:
category_choices = list(searx.engines.categories.keys())
parser = argparse.ArgumentParser(description='Standalone searx.')
parser.add_argument('query', type=str,
help='Text query')
parser.add_argument('--category', type=str, nargs='?',
choices=category_choices,
default='general',
help='Search category')
parser.add_argument('--lang', type=str, nargs='?', default='all',
help='Search language')
parser.add_argument('--pageno', type=int, nargs='?', default=1,
help='Page number starting from 1')
parser.add_argument(
'--safesearch', type=str, nargs='?',
choices=['0', '1', '2'], default='0',
help='Safe content filter from none to strict')
parser.add_argument(
'--timerange', type=str,
nargs='?', choices=['day', 'week', 'month', 'year'],
help='Filter by time range')
return parser.parse_args(args)
if __name__ == '__main__':
searx.search.initialize()
engine_cs = list(searx.engines.categories.keys())
prog_args = parse_argument(category_choices=engine_cs)
search_q = get_search_query(prog_args, engine_categories=engine_cs)
res_dict = to_dict(search_q)
sys.stdout.write(dumps(
res_dict, sort_keys=True, indent=4, ensure_ascii=False,
default=json_serial))

@@ -1,15 +0,0 @@
#!/bin/sh
# script to easily update translation language files
# add new language:
# pybabel init -i messages.pot -d searx/translations -l en
SEARX_DIR='searx'
pybabel extract -F babel.cfg -o messages.pot "$SEARX_DIR"
for f in `ls "$SEARX_DIR"'/translations/'`; do
pybabel update -N -i messages.pot -d "$SEARX_DIR"'/translations/' -l "$f"
done
echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/'