mirror of
https://github.com/searxng/searxng.git
synced 2025-07-24 13:49:26 +02:00
commit
aaae9a209e
21 changed files with 47 additions and 75 deletions
|
@ -1,33 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# This script saves Ahmia's blacklist for onion sites.
|
||||
# More info in https://ahmia.fi/blacklist/
|
||||
|
||||
# set path
|
||||
from sys import path
|
||||
from os.path import realpath, dirname, join
|
||||
path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||
|
||||
#
|
||||
import requests
|
||||
from searx import searx_dir
|
||||
|
||||
URL = 'https://ahmia.fi/blacklist/banned/'
|
||||
|
||||
|
||||
def fetch_ahmia_blacklist():
    """Download Ahmia's blacklist of onion sites and return it as a list.

    Returns:
        list[str]: one blacklist entry per whitespace-separated token.

    Raises:
        Exception: if the HTTP response status is not 200.
    """
    resp = requests.get(URL, timeout=3.0)
    if resp.status_code != 200:
        # NOTE: the original concatenated the *int* status code to a str,
        # which raised a TypeError instead of this exception.
        raise Exception(f"Error fetching Ahmia blacklist, HTTP code {resp.status_code}")
    return resp.text.split()
|
||||
|
||||
|
||||
def get_ahmia_blacklist_filename():
    """Return the path of ``searx/data/ahmia_blacklist.txt``."""
    data_dir = join(searx_dir, "data")
    return join(data_dir, "ahmia_blacklist.txt")
|
||||
|
||||
|
||||
# fetch the blacklist and persist it, one entry per line
onion_blacklist = fetch_ahmia_blacklist()
with open(get_ahmia_blacklist_filename(), "w") as f:
    f.write('\n'.join(onion_blacklist))
|
|
@ -1,151 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import json
|
||||
|
||||
# set path
|
||||
from sys import path
|
||||
from os.path import realpath, dirname, join
|
||||
path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||
|
||||
from searx import searx_dir, settings
|
||||
from searx.engines.wikidata import send_wikidata_query
|
||||
|
||||
|
||||
# ORDER BY (with all the query fields) is important to keep a deterministic result order
|
||||
# so multiple invokation of this script doesn't change currencies.json
|
||||
SARQL_REQUEST = """
|
||||
SELECT DISTINCT ?iso4217 ?unit ?unicode ?label ?alias WHERE {
|
||||
?item wdt:P498 ?iso4217; rdfs:label ?label.
|
||||
OPTIONAL { ?item skos:altLabel ?alias FILTER (LANG (?alias) = LANG(?label)). }
|
||||
OPTIONAL { ?item wdt:P5061 ?unit. }
|
||||
OPTIONAL { ?item wdt:P489 ?symbol.
|
||||
?symbol wdt:P487 ?unicode. }
|
||||
MINUS { ?item wdt:P582 ?end_data . } # Ignore monney with an end date
|
||||
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
|
||||
FILTER(LANG(?label) IN (%LANGUAGES_SPARQL%)).
|
||||
}
|
||||
ORDER BY ?iso4217 ?unit ?unicode ?label ?alias
|
||||
"""
|
||||
|
||||
# ORDER BY (with all the query fields) is important to keep a deterministic result order
|
||||
# so multiple invokation of this script doesn't change currencies.json
|
||||
SPARQL_WIKIPEDIA_NAMES_REQUEST = """
|
||||
SELECT DISTINCT ?iso4217 ?article_name WHERE {
|
||||
?item wdt:P498 ?iso4217 .
|
||||
?article schema:about ?item ;
|
||||
schema:name ?article_name ;
|
||||
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ]
|
||||
MINUS { ?item wdt:P582 ?end_data . } # Ignore monney with an end date
|
||||
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
|
||||
FILTER(LANG(?article_name) IN (%LANGUAGES_SPARQL%)).
|
||||
}
|
||||
ORDER BY ?iso4217 ?article_name
|
||||
"""
|
||||
|
||||
|
||||
LANGUAGES = settings['locales'].keys()
|
||||
LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
|
||||
|
||||
|
||||
def remove_accents(name):
    """Return *name* decomposed to NFKD form and lower-cased.

    Accented characters are split into base letter + combining mark so
    differently composed spellings compare equal after normalization.
    """
    decomposed = unicodedata.normalize('NFKD', name)
    return decomposed.lower()
|
||||
|
||||
|
||||
def remove_extra(name):
    """Strip any parenthesised or colon-separated suffix from *name*."""
    for separator in ('(', ':'):
        head, found, _tail = name.partition(separator)
        if found:
            name = head.strip()
    return name
|
||||
|
||||
|
||||
def _normalize_name(name):
    """Canonicalize a currency name for use as a lookup key."""
    # lower-case, drop accents, treat dashes as spaces, squeeze blanks
    flattened = remove_accents(name.lower()).replace('-', ' ')
    collapsed = re.sub(' +', ' ', flattened)
    return remove_extra(collapsed)
|
||||
|
||||
|
||||
def add_currency_name(db, name, iso4217, normalize_name=True):
    """Register *name* as a spelling of the currency *iso4217*.

    The newest code is inserted at the front of the per-name list so it
    takes precedence on lookup; duplicate codes are ignored.
    """
    if normalize_name:
        name = _normalize_name(name)
    codes = db['names'].setdefault(name, [])
    if iso4217 not in codes:
        codes.insert(0, iso4217)
|
||||
|
||||
|
||||
def add_currency_label(db, label, iso4217, language):
    """Store the localized *label* of currency *iso4217* for *language*."""
    db['iso4217'].setdefault(iso4217, {})[language] = label
|
||||
|
||||
|
||||
def wikidata_request_result_iterator(request):
    """Run the SPARQL *request* against Wikidata and yield each binding."""
    query = request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
    result = send_wikidata_query(query)
    if result is None:
        return
    yield from result['results']['bindings']
|
||||
|
||||
|
||||
def fetch_db():
    """Build the currency database from two Wikidata queries.

    Returns a dict with two tables: 'names' maps a (normalized) spelling
    to its ISO 4217 code(s), 'iso4217' maps a code to localized labels.
    """
    db = {'names': {}, 'iso4217': {}}

    # Wikipedia article titles double as currency names and labels.
    for binding in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
        code = binding['iso4217']['value']
        title = binding['article_name']['value']
        title_lang = binding['article_name']['xml:lang']
        add_currency_name(db, title, code)
        add_currency_label(db, title, code, title_lang)

    # labels, aliases, unicode symbols and unit symbols from Wikidata
    for binding in wikidata_request_result_iterator(SARQL_REQUEST):
        code = binding['iso4217']['value']
        if 'label' in binding:
            label = binding['label']['value']
            add_currency_name(db, label, code)
            add_currency_label(db, label, code, binding['label']['xml:lang'])
        if 'alias' in binding:
            add_currency_name(db, binding['alias']['value'], code)
        # symbols are stored verbatim (no normalization)
        if 'unicode' in binding:
            add_currency_name(db, binding['unicode']['value'], code, normalize_name=False)
        if 'unit' in binding:
            add_currency_name(db, binding['unit']['value'], code, normalize_name=False)

    # reduce memory usage: replace single-item lists by the item itself,
    # see searx.search.processors.online_currency.name_to_iso4217
    for name, codes in db['names'].items():
        if len(codes) == 1:
            db['names'][name] = codes[0]

    return db
|
||||
|
||||
|
||||
def get_filename():
    """Return the path of ``searx/data/currencies.json``."""
    data_dir = join(searx_dir, "data")
    return join(data_dir, "currencies.json")
|
||||
|
||||
|
||||
def main():
    """Fetch the currency database and write searx/data/currencies.json."""
    db = fetch_db()

    # static fallback entries the queries do not provide
    static_names = (
        ("euro", 'EUR'), ("euros", 'EUR'),
        ("dollar", 'USD'), ("dollars", 'USD'),
        ("peso", 'MXN'), ("pesos", 'MXN'),
    )
    for name, code in static_names:
        add_currency_name(db, name, code)

    with open(get_filename(), 'w', encoding='utf8') as f:
        json.dump(db, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()
|
|
@ -1,206 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import json
|
||||
from urllib.parse import quote, urlparse
|
||||
from os.path import realpath, dirname
|
||||
import cld3
|
||||
from lxml.html import fromstring
|
||||
|
||||
# set path
|
||||
sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||
|
||||
from searx.engines.wikidata import send_wikidata_query
|
||||
from searx.utils import extract_text
|
||||
import searx
|
||||
import searx.search
|
||||
import searx.poolrequests
|
||||
|
||||
SPARQL_WIKIPEDIA_ARTICLE = """
|
||||
SELECT DISTINCT ?item ?name
|
||||
WHERE {
|
||||
VALUES ?item { %IDS% }
|
||||
?article schema:about ?item ;
|
||||
schema:inLanguage ?lang ;
|
||||
schema:name ?name ;
|
||||
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
|
||||
FILTER(?lang in (%LANGUAGES_SPARQL%)) .
|
||||
FILTER (!CONTAINS(?name, ':')) .
|
||||
}
|
||||
"""
|
||||
|
||||
SPARQL_DESCRIPTION = """
|
||||
SELECT DISTINCT ?item ?itemDescription
|
||||
WHERE {
|
||||
VALUES ?item { %IDS% }
|
||||
?item schema:description ?itemDescription .
|
||||
FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
|
||||
}
|
||||
ORDER BY ?itemLang
|
||||
"""
|
||||
|
||||
LANGUAGES = searx.settings['locales'].keys()
|
||||
LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
|
||||
IDS = None
|
||||
|
||||
descriptions = {}
|
||||
wd_to_engine_name = {}
|
||||
|
||||
|
||||
def normalize_description(description):
    """Strip ASCII control characters from *description* and collapse whitespace."""
    # map every C0 control character (0x00-0x1F) to a space; the original
    # `range(0, 31)` off-by-one left chr(31) (unit separator) untouched
    control_to_space = {c: ' ' for c in range(0, 32)}
    description = description.translate(control_to_space)
    # collapse runs of whitespace and trim the ends
    return ' '.join(description.strip().split())
|
||||
|
||||
|
||||
def update_description(engine_name, lang, description, source, replace=True):
    """Record *description* for *engine_name* in language *lang*.

    With ``replace=False`` an already-stored entry for that language wins.
    """
    known = descriptions[engine_name]
    if replace or lang not in known:
        known[lang] = [normalize_description(description), source]
|
||||
|
||||
|
||||
def get_wikipedia_summary(language, pageid):
    """Fetch the plain-text summary of a Wikipedia page.

    Returns the 'extract' field of the REST summary endpoint, or ``None``
    on any request or parse failure.
    """
    search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
    url = search_url.format(title=quote(pageid), language=language)
    try:
        response = searx.poolrequests.get(url)
        response.raise_for_status()
        api_result = json.loads(response.text)
        return api_result.get('extract')
    except Exception:
        # was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt — narrowed to Exception
        return None
|
||||
|
||||
|
||||
def detect_language(text):
    """Return the detected language code of *text*, or None when unsure."""
    guess = cld3.get_language(str(text))  # pylint: disable=E1101
    if guess is None:
        return None
    if guess.probability >= 0.98 and guess.is_reliable:
        return guess.language
    return None
|
||||
|
||||
|
||||
def get_website_description(url, lang1, lang2=None):
    """Fetch *url* and return ``(language, description)``.

    The description is taken from the meta description, ``og:description``
    or ``<title>``; the language from cld3 detection, the ``<html lang>``
    attribute, *lang1*, or finally ``'en'``.  Returns ``(None, None)``
    when the page cannot be fetched.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'Sec-GPC': '1',
        'Cache-Control': 'max-age=0',
    }
    if lang1 is not None:
        lang_list = [lang1]
        if lang2 is not None:
            lang_list.append(lang2)
        headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'

    try:
        response = searx.poolrequests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception:
        return (None, None)

    try:
        html = fromstring(response.text)
    except ValueError:
        # lxml refuses str input with an encoding declaration: use bytes
        html = fromstring(response.content)

    description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
    if not description:
        description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
    if not description:
        description = extract_text(html.xpath('/html/head/title'))

    lang = extract_text(html.xpath('/html/@lang'))
    # FIX: the original tested `len(lang1) > 0`, which raised a TypeError
    # when called with lang1=None (the default-description call path) and
    # the page carried no <html lang> attribute
    if lang is None and lang1:
        lang = lang1
    lang = detect_language(description) or lang or 'en'
    lang = lang.split('_')[0]
    lang = lang.split('-')[0]
    return (lang, description)
|
||||
|
||||
|
||||
def initialize():
    """Initialize the searx engines and build the wikidata-id lookup table."""
    global descriptions, wd_to_engine_name, IDS
    searx.search.initialize()
    for engine_name, engine in searx.engines.engines.items():
        descriptions[engine_name] = {}
        wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
        if wikidata_id is not None:
            wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)

    # space-separated "wd:Qxxx" list used by the SPARQL templates
    IDS = ' '.join('wd:' + wd_id for wd_id in wd_to_engine_name)
|
||||
|
||||
|
||||
def fetch_wikidata_descriptions():
    """Collect engine descriptions from the Wikidata item descriptions."""
    query = (SPARQL_DESCRIPTION
             .replace('%IDS%', IDS)
             .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
    result = send_wikidata_query(query)
    if result is None:
        return
    for binding in result['results']['bindings']:
        wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
        lang = binding['itemDescription']['xml:lang']
        description = binding['itemDescription']['value']
        if ' ' not in description:
            # skip one-word descriptions (like "website")
            continue
        for engine_name in wd_to_engine_name[wikidata_id]:
            update_description(engine_name, lang, description, 'wikidata')
|
||||
|
||||
|
||||
def fetch_wikipedia_descriptions():
    """Collect engine descriptions from Wikipedia article summaries."""
    query = (SPARQL_WIKIPEDIA_ARTICLE
             .replace('%IDS%', IDS)
             .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
    result = send_wikidata_query(query)
    if result is None:
        return
    for binding in result['results']['bindings']:
        wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
        lang = binding['name']['xml:lang']
        pageid = binding['name']['value']
        description = get_wikipedia_summary(lang, pageid)
        # keep only multi-word summaries
        if description is None or ' ' not in description:
            continue
        for engine_name in wd_to_engine_name[wikidata_id]:
            update_description(engine_name, lang, description, 'wikipedia')
|
||||
|
||||
|
||||
def normalize_url(url):
    """Reduce *url* to its site root: scheme + host + '/'.

    ``{language}`` placeholders become ``en`` and a leading ``api.``
    subdomain (https only) is dropped.
    """
    concrete = url.replace('{language}', 'en')
    parts = urlparse(concrete)
    root = parts._replace(path='/', params='', query='', fragment='').geturl()
    return root.replace('https://api.', 'https://')
|
||||
|
||||
|
||||
def fetch_website_description(engine_name, website):
    """Fill the description table of *engine_name* by scraping *website*."""
    default_lang, default_description = get_website_description(website, None, None)
    if default_lang is None or default_description is None:
        return

    if default_lang not in descriptions[engine_name]:
        descriptions[engine_name][default_lang] = [normalize_description(default_description), website]

    # probe a few other languages through the Accept-Language header
    for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
        short = request_lang.split('-')[0]
        if short in descriptions[engine_name]:
            continue
        lang, desc = get_website_description(website, request_lang, short)
        if desc is None or desc == default_description:
            # the site does not localize: stop probing
            break
        update_description(engine_name, lang, desc, website, replace=False)
|
||||
|
||||
|
||||
def fetch_website_descriptions():
    """Scrape a description for every engine from its website."""
    for engine_name, engine in searx.engines.engines.items():
        # prefer the declared website, fall back to the search/base URL
        website = getattr(engine, "about", {}).get('website')
        for attribute in ("search_url", "base_url"):
            if website is None:
                website = normalize_url(getattr(engine, attribute))
        if website is not None:
            fetch_website_description(engine_name, website)
|
||||
|
||||
|
||||
def main():
    """Gather engine descriptions from all sources and dump JSON to stdout."""
    initialize()
    fetch_wikidata_descriptions()
    fetch_wikipedia_descriptions()
    fetch_website_descriptions()

    output = json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False)
    sys.stdout.write(output)


if __name__ == "__main__":
    main()
|
|
@ -1,161 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Update searx/data/external_bangs.json using the duckduckgo bangs.
|
||||
|
||||
https://duckduckgo.com/newbang loads
|
||||
* a javascript which provides the bang version ( https://duckduckgo.com/bv1.js )
|
||||
* a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example )
|
||||
|
||||
This script loads the javascript, then the bangs.
|
||||
|
||||
The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ),
|
||||
but most probably it will requires to update RE_BANG_VERSION
|
||||
"""
|
||||
# pylint: disable=C0116
|
||||
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
from os.path import realpath, dirname, join
|
||||
|
||||
import requests
|
||||
|
||||
# set path
|
||||
sys.path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||
|
||||
from searx import searx_dir # pylint: disable=E0401 C0413
|
||||
|
||||
|
||||
# from https://duckduckgo.com/newbang
|
||||
URL_BV1 = 'https://duckduckgo.com/bv1.js'
|
||||
RE_BANG_VERSION = re.compile(r'\/bang\.v([0-9]+)\.js')
|
||||
HTTPS_COLON = 'https:'
|
||||
HTTP_COLON = 'http:'
|
||||
|
||||
|
||||
def get_bang_url():
    """Return ``(bang_json_url, version)`` parsed from duckduckgo's bv1.js.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    # a timeout so a stalled connection cannot hang the script forever
    response = requests.get(URL_BV1, timeout=10.0)
    response.raise_for_status()

    version = RE_BANG_VERSION.findall(response.text)[0]
    return f'https://duckduckgo.com/bang.v{version}.js', version
|
||||
|
||||
|
||||
def fetch_ddg_bangs(url):
    """Download and decode the duckduckgo bang definitions at *url*.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    # a timeout so a stalled download cannot hang the script forever
    response = requests.get(url, timeout=30.0)
    response.raise_for_status()
    return json.loads(response.content.decode())
|
||||
|
||||
|
||||
def merge_when_no_leaf(node):
    """Collapse chains of single-purpose children to shrink the trie.

    Given A -> B -> C where no C is the leaf marker '*', every C is
    re-attached directly to A under the concatenated key.

    For example:
    d -> d -> g -> * (ddg*)
      -> i -> g -> * (dig*)
    becomes
    d -> dg -> *
      -> ig -> *
    """
    if not isinstance(node, dict):
        return

    needs_restart = False
    # iterate over a snapshot of the keys: node is mutated below
    for prefix in list(node.keys()):
        if prefix == '*':
            continue

        child = node[prefix]
        child_keys = list(child.keys())
        if '*' in child_keys:
            merge_when_no_leaf(child)
        else:
            # no leaf below: splice the grandchildren into this node
            for suffix in child_keys:
                node[prefix + suffix] = child[suffix]
                merge_when_no_leaf(node[prefix + suffix])
            del node[prefix]
            needs_restart = True

    if needs_restart:
        merge_when_no_leaf(node)
|
||||
|
||||
|
||||
def optimize_leaf(parent, parent_key, node):
    """Replace a ``{'*': value}``-only node by ``value`` in its parent."""
    if not isinstance(node, dict):
        return

    is_pure_leaf = len(node) == 1 and '*' in node
    if is_pure_leaf and parent is not None:
        parent[parent_key] = node['*']
        return

    for child_key, child in node.items():
        optimize_leaf(node, child_key, child)
|
||||
|
||||
|
||||
def parse_ddg_bangs(ddg_bangs):
    """Turn duckduckgo's bang definitions into a compact prefix trie.

    Inside a serialized definition chr(2) stands for the search-terms
    placeholder and chr(1) separates the URL from the bang's rank.
    """
    bang_trie = {}
    seen_urls = {}  # bang URL -> serialized definition

    for bang_definition in ddg_bangs:
        url_template = bang_definition['u']
        if '{{{s}}}' not in url_template:
            # no search-terms placeholder: invalid bang
            continue

        url_template = url_template.replace('{{{s}}}', chr(2))

        # only for the https protocol: "https://example.com" becomes "//example.com"
        if url_template.startswith(HTTPS_COLON + '//'):
            url_template = url_template[len(HTTPS_COLON):]

        if url_template.startswith(HTTP_COLON + '//') and url_template[len(HTTP_COLON):] in seen_urls:
            # the http:// URL has an https:// twin (stored "//example.com"):
            # reuse the https definition
            bang_def_output = seen_urls[url_template[len(HTTP_COLON):]]
        else:
            # normal use case: new http:// URL or https:// URL
            rank = str(bang_definition['r'])
            bang_def_output = seen_urls.setdefault(url_template, url_template + chr(1) + rank)

        seen_urls[url_template] = bang_def_output

        # insert the bang name letter by letter into the trie
        node = bang_trie
        for letter in bang_definition['t']:
            node = node.setdefault(letter, {})
        node.setdefault('*', bang_def_output)

    # compact the trie
    merge_when_no_leaf(bang_trie)
    optimize_leaf(None, None, bang_trie)

    return bang_trie
|
||||
|
||||
|
||||
def get_bangs_filename():
    """Return the path of ``searx/data/external_bangs.json``."""
    data_dir = join(searx_dir, "data")
    return join(data_dir, "external_bangs.json")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # resolve the current bang version, download and compact the trie
    bangs_url, bangs_version = get_bang_url()
    print(f'fetch bangs from {bangs_url}')
    output = {
        'version': bangs_version,
        'trie': parse_ddg_bangs(fetch_ddg_bangs(bangs_url)),
    }
    with open(get_bangs_filename(), 'w') as fp:
        json.dump(output, fp, ensure_ascii=False, indent=4)
|
|
@ -1,73 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# set path
|
||||
from sys import path
|
||||
from os.path import realpath, dirname, join
|
||||
path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||
|
||||
#
|
||||
import json
|
||||
import requests
|
||||
import re
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from distutils.version import LooseVersion, StrictVersion
|
||||
from lxml import html
|
||||
from searx import searx_dir
|
||||
|
||||
URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
|
||||
RELEASE_PATH = '/pub/firefox/releases/'
|
||||
|
||||
NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$')
|
||||
# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$')
|
||||
# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$')
|
||||
|
||||
#
|
||||
useragents = {
|
||||
"versions": (),
|
||||
"os": ('Windows NT 10.0; WOW64',
|
||||
'X11; Linux x86_64'),
|
||||
"ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"
|
||||
}
|
||||
|
||||
|
||||
def fetch_firefox_versions():
    """Scrape ftp.mozilla.org and return stable Firefox releases, newest first.

    Returns:
        list[LooseVersion]: release versions matching NORMAL_REGEX.

    Raises:
        Exception: if the HTTP response status is not 200.
    """
    resp = requests.get(URL, timeout=2.0)
    if resp.status_code != 200:
        # NOTE: the original concatenated the *int* status code to a str,
        # which raised a TypeError instead of this exception.
        raise Exception(f"Error fetching firefox versions, HTTP code {resp.status_code}")

    dom = html.fromstring(resp.text)
    versions = []
    for link in dom.xpath('//a/@href'):
        # `url_path` instead of `path`: don't shadow the module-level
        # `path` imported from sys
        url_path = urlparse(urljoin(URL, link)).path
        if url_path.startswith(RELEASE_PATH):
            # strip the release prefix and the trailing '/'
            version = url_path[len(RELEASE_PATH):-1]
            if NORMAL_REGEX.match(version):
                versions.append(LooseVersion(version))

    # idiomatic instance call instead of `list.sort(versions, ...)`
    versions.sort(reverse=True)
    return versions
|
||||
|
||||
|
||||
def fetch_firefox_last_versions():
    """Return the version strings of the two most recent major releases."""
    versions = fetch_firefox_versions()
    newest_major = versions[0].version[0]
    wanted_majors = (newest_major, newest_major - 1)
    return [v.vstring for v in versions if v.version[0] in wanted_majors]
|
||||
|
||||
|
||||
def get_useragents_filename():
    """Return the path of ``searx/data/useragents.json``."""
    data_dir = join(searx_dir, "data")
    return join(data_dir, "useragents.json")


# refresh the shipped user-agent list with the current Firefox versions
useragents["versions"] = fetch_firefox_last_versions()
with open(get_useragents_filename(), "w") as f:
    json.dump(useragents, f, indent=4, ensure_ascii=False)
|
|
@ -1,207 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This script generates languages.py from intersecting each engine's supported languages.
|
||||
#
|
||||
# Output files: searx/data/engines_languages.json and searx/languages.py
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
from sys import path
|
||||
from babel import Locale, UnknownLocaleError
|
||||
from babel.languages import get_global
|
||||
|
||||
path.append('../searx') # noqa
|
||||
from searx import settings, searx_dir
|
||||
from searx.engines import initialize_engines, engines
|
||||
|
||||
# Output files.
|
||||
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
|
||||
languages_file = Path(searx_dir) / 'languages.py'
|
||||
|
||||
|
||||
# Fetchs supported languages for each engine and writes json file with those.
|
||||
def fetch_supported_languages():
    """Query each engine for its supported languages.

    Writes the collected mapping to *engines_languages_file* and
    returns it.
    """
    engines_languages = {}
    for engine_name in sorted(engines):
        engine = engines[engine_name]
        if not hasattr(engine, 'fetch_supported_languages'):
            continue
        engines_languages[engine_name] = engine.fetch_supported_languages()
        print("fetched %s languages from engine %s" % (
            len(engines_languages[engine_name]), engine_name))
        # `type(x) == list` replaced by the idiomatic isinstance check
        if isinstance(engines_languages[engine_name], list):
            engines_languages[engine_name] = sorted(engines_languages[engine_name])

    # write json file
    with open(engines_languages_file, 'w', encoding='utf-8') as f:
        json.dump(engines_languages, f, indent=2, sort_keys=True)

    return engines_languages
|
||||
|
||||
|
||||
# Get babel Locale object from lang_code if possible.
|
||||
def get_locale(lang_code):
    """Return the babel Locale for *lang_code* ('xx-YY'), or None if unknown."""
    try:
        return Locale.parse(lang_code, sep='-')
    except (UnknownLocaleError, ValueError):
        return None
|
||||
|
||||
|
||||
# Join all language lists.
|
||||
def join_language_lists(engines_languages):
    """Merge the per-engine language lists into one nested mapping.

    Keys are short language codes; each value records the language name,
    the engines supporting it ('counter') and its per-country variants.
    """
    language_list = {}
    for engine_name in engines_languages:
        for lang_code in engines_languages[engine_name]:

            # resolve engine-specific aliases back to the canonical code
            aliases = getattr(engines[engine_name], 'language_aliases', {})
            if lang_code in aliases.values():
                lang_code = next(lc for lc, alias in aliases.items() if lang_code == alias)

            locale = get_locale(lang_code)

            # ensure that lang_code uses standard language and country codes
            if locale and locale.territory:
                lang_code = f"{locale.language}-{locale.territory}"
            short_code = lang_code.split('-')[0]

            # register the bare language on first sight
            if short_code not in language_list:
                if locale:
                    # babel knows this language
                    language_name = locale.get_language_name().title()
                    english_name = locale.english_name.split(' (')[0]
                elif short_code in engines_languages['wikipedia']:
                    # fall back to wikipedia's language table
                    language_name = engines_languages['wikipedia'][short_code]['name']
                    english_name = engines_languages['wikipedia'][short_code]['english_name']
                else:
                    language_name = None
                    english_name = None

                language_list[short_code] = {
                    'name': language_name,
                    'english_name': english_name,
                    'counter': set(),
                    'countries': {},
                }

            # register the language-country variant on first sight
            if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
                country_name = ''
                if locale:
                    country_name = locale.get_territory_name()
                language_list[short_code]['countries'][lang_code] = {
                    'country_name': country_name,
                    'counter': set(),
                }

            # count the engine for the language and for the exact variant
            language_list[short_code]['counter'].add(engine_name)
            if lang_code != short_code:
                language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)

    return language_list
|
||||
|
||||
|
||||
# Filter language list so it only includes the most supported languages and countries
|
||||
def filter_language_list(all_languages):
    """Keep only well-supported languages and country variants.

    A language survives when at least 15 engines support it, or when
    every enabled general-purpose engine does; a country variant needs
    at least 10 supporting engines.
    """
    min_engines_per_lang = 15
    min_engines_per_country = 10
    main_engines = [engine_name for engine_name in engines.keys()
                    if 'general' in engines[engine_name].categories and
                    engines[engine_name].supported_languages and
                    not engines[engine_name].disabled]

    # filter list to include only languages supported by most engines or all default general engines
    filtered_languages = {code: lang for code, lang
                          in all_languages.items()
                          if (len(lang['counter']) >= min_engines_per_lang or
                              all(main_engine in lang['counter']
                                  for main_engine in main_engines))}

    def _copy_lang_data(lang, country_name=None):
        # build the output record for one language / variant
        new_dict = dict()
        new_dict['name'] = all_languages[lang]['name']
        new_dict['english_name'] = all_languages[lang]['english_name']
        if country_name:
            new_dict['country_name'] = country_name
        return new_dict

    # NOTE: a dead `_country_count` helper was removed here — it
    # referenced an undefined `sorted_countries` name and would have
    # raised NameError if ever called.

    # for each language get country codes supported by most engines or at least one country code
    filtered_languages_with_countries = dict()
    for lang, lang_data in filtered_languages.items():
        countries = lang_data['countries']
        filtered_countries = dict()

        # get language's country codes with enough supported engines
        for lang_country, country_data in countries.items():
            if len(country_data['counter']) >= min_engines_per_country:
                filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])

        # add language without countries too if there's more than one country to choose from
        if len(filtered_countries) > 1:
            filtered_countries[lang] = _copy_lang_data(lang)
        elif len(filtered_countries) == 1:
            # if there's only one country per language, it's not necessary to show country name
            lang_country = next(iter(filtered_countries))
            filtered_countries[lang_country]['country_name'] = None

        # if no country has enough engines try to get most likely country code from babel
        if not filtered_countries:
            lang_country = None
            subtags = get_global('likely_subtags').get(lang)
            if subtags:
                country_code = subtags.split('_')[-1]
                if len(country_code) == 2:
                    lang_country = "{lang}-{country}".format(lang=lang, country=country_code)

            if lang_country:
                filtered_countries[lang_country] = _copy_lang_data(lang)
            else:
                filtered_countries[lang] = _copy_lang_data(lang)

        filtered_languages_with_countries.update(filtered_countries)

    return filtered_languages_with_countries
|
||||
|
||||
|
||||
# Write languages.py.
|
||||
def write_languages_file(languages):
    """Generate ``searx/languages.py`` from the filtered language list."""
    file_headers = (
        "# -*- coding: utf-8 -*-",
        "# list of language codes",
        "# this file is generated automatically by utils/fetch_languages.py",
        "language_codes ="
    )

    # one (code, name, country, english_name) tuple per language
    language_codes = tuple(
        (
            code,
            languages[code]['name'].split(' (')[0],
            languages[code].get('country_name') or '',
            languages[code].get('english_name') or ''
        ) for code in sorted(languages)
    )

    with open(languages_file, 'w') as new_file:
        file_content = "{file_headers} \\\n{language_codes}".format(
            file_headers='\n'.join(file_headers),
            language_codes=pformat(language_codes, indent=4)
        )
        new_file.write(file_content)
        # the explicit new_file.close() was redundant inside `with`
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # initialize engines, gather and filter languages, emit languages.py
    initialize_engines(settings['engines'])
    engines_languages = fetch_supported_languages()
    all_languages = join_language_lists(engines_languages)
    filtered_languages = filter_language_list(all_languages)
    write_languages_file(filtered_languages)
|
|
@ -1,56 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import collections
|
||||
|
||||
# set path
|
||||
from sys import path
|
||||
from os.path import realpath, dirname, join
|
||||
path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||
|
||||
from searx import searx_dir
|
||||
from searx.engines.wikidata import send_wikidata_query
|
||||
|
||||
|
||||
# the response contains duplicate ?item with the different ?symbol
|
||||
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
|
||||
# even if a ?item has different ?symbol of the same rank.
|
||||
# A deterministic result
|
||||
# see:
|
||||
# * https://www.wikidata.org/wiki/Help:Ranking
|
||||
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
|
||||
# * https://w.wiki/32BT
|
||||
# see the result for https://www.wikidata.org/wiki/Q11582
|
||||
# there are multiple symbols the same rank
|
||||
SARQL_REQUEST = """
|
||||
SELECT DISTINCT ?item ?symbol
|
||||
WHERE
|
||||
{
|
||||
?item wdt:P31/wdt:P279 wd:Q47574 .
|
||||
?item p:P5061 ?symbolP .
|
||||
?symbolP ps:P5061 ?symbol ;
|
||||
wikibase:rank ?rank .
|
||||
FILTER(LANG(?symbol) = "en").
|
||||
}
|
||||
ORDER BY ?item DESC(?rank) ?symbol
|
||||
"""
|
||||
|
||||
|
||||
def get_data():
    """Query Wikidata and map each unit's entity id to its English symbol.

    The query returns duplicates (one row per symbol and rank); only the
    first — i.e. best ranked — symbol of every item is kept.
    """
    results = collections.OrderedDict()
    response = send_wikidata_query(SARQL_REQUEST)
    for binding in response['results']['bindings']:
        item_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
        if item_id not in results:
            results[item_id] = binding['symbol']['value']
    return results
|
||||
|
||||
|
||||
def get_wikidata_units_filename():
    """Return the path of ``searx/data/wikidata_units.json``."""
    data_dir = join(searx_dir, "data")
    return join(data_dir, "wikidata_units.json")


# fetch the units and write them to the data file
with open(get_wikidata_units_filename(), 'w') as f:
    json.dump(get_data(), f, indent=4, ensure_ascii=False)
|
|
@ -1,35 +0,0 @@
|
|||
from sys import argv, exit
|
||||
|
||||
if not len(argv) > 1:
|
||||
print('search query required')
|
||||
exit(1)
|
||||
|
||||
import requests
|
||||
from json import dumps
|
||||
from searx.engines import google
|
||||
from searx.search import default_request_params
|
||||
|
||||
request_params = default_request_params()
|
||||
# Possible params
|
||||
# request_params['headers']['User-Agent'] = ''
|
||||
# request_params['category'] = ''
|
||||
request_params['pageno'] = 1
|
||||
request_params['language'] = 'en_us'
|
||||
request_params['time_range'] = ''
|
||||
|
||||
params = google.request(argv[1], request_params)
|
||||
|
||||
request_args = dict(
|
||||
headers=request_params['headers'],
|
||||
cookies=request_params['cookies'],
|
||||
)
|
||||
|
||||
if request_params['method'] == 'GET':
|
||||
req = requests.get
|
||||
else:
|
||||
req = requests.post
|
||||
request_args['data'] = request_params['data']
|
||||
|
||||
resp = req(request_params['url'], **request_args)
|
||||
resp.search_params = request_params
|
||||
print(dumps(google.response(resp)))
|
|
@ -1,217 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""Script to run searx from terminal.
|
||||
|
||||
Getting categories without initiate the engine will only return `['general']`
|
||||
|
||||
>>> import searx.engines
|
||||
... list(searx.engines.categories.keys())
|
||||
['general']
|
||||
>>> import searx.search
|
||||
... searx.search.initialize()
|
||||
... list(searx.engines.categories.keys())
|
||||
['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map']
|
||||
|
||||
Example to use this script:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ python3 utils/standalone_searx.py rain
|
||||
|
||||
Example to run it from python:
|
||||
|
||||
>>> import importlib
|
||||
... import json
|
||||
... import sys
|
||||
... import searx.engines
|
||||
... import searx.search
|
||||
... search_query = 'rain'
|
||||
... # initialize engines
|
||||
... searx.search.initialize()
|
||||
... # load engines categories once instead of each time the function called
|
||||
... engine_cs = list(searx.engines.categories.keys())
|
||||
... # load module
|
||||
... spec = importlib.util.spec_from_file_location(
|
||||
... 'utils.standalone_searx', 'utils/standalone_searx.py')
|
||||
... sas = importlib.util.module_from_spec(spec)
|
||||
... spec.loader.exec_module(sas)
|
||||
... # use function from module
|
||||
... prog_args = sas.parse_argument([search_query], category_choices=engine_cs)
|
||||
... search_q = sas.get_search_query(prog_args, engine_categories=engine_cs)
|
||||
... res_dict = sas.to_dict(search_q)
|
||||
... sys.stdout.write(json.dumps(
|
||||
... res_dict, sort_keys=True, indent=4, ensure_ascii=False,
|
||||
... default=sas.json_serial))
|
||||
{
|
||||
"answers": [],
|
||||
"infoboxes": [ {...} ],
|
||||
"paging": true,
|
||||
"results": [... ],
|
||||
"results_number": 820000000.0,
|
||||
"search": {
|
||||
"lang": "all",
|
||||
"pageno": 1,
|
||||
"q": "rain",
|
||||
"safesearch": 0,
|
||||
"timerange": null
|
||||
},
|
||||
"suggestions": [...]
|
||||
}
|
||||
""" # noqa: E501
|
||||
# pylint: disable=pointless-string-statement
|
||||
'''
|
||||
searx is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
searx is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
||||
|
||||
(C) 2016- by Alexandre Flament, <alex@al-f.net>
|
||||
'''
|
||||
# pylint: disable=wrong-import-position
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from json import dumps
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import searx
|
||||
import searx.preferences
|
||||
import searx.query
|
||||
import searx.search
|
||||
import searx.webadapter
|
||||
|
||||
EngineCategoriesVar = Optional[List[str]]
|
||||
|
||||
|
||||
def get_search_query(
|
||||
args: argparse.Namespace, engine_categories: EngineCategoriesVar = None
|
||||
) -> searx.search.SearchQuery:
|
||||
"""Get search results for the query"""
|
||||
if engine_categories is None:
|
||||
engine_categories = list(searx.engines.categories.keys())
|
||||
try:
|
||||
category = args.category.decode('utf-8')
|
||||
except AttributeError:
|
||||
category = args.category
|
||||
form = {
|
||||
"q": args.query,
|
||||
"categories": category,
|
||||
"pageno": str(args.pageno),
|
||||
"language": args.lang,
|
||||
"time_range": args.timerange
|
||||
}
|
||||
preferences = searx.preferences.Preferences(
|
||||
['oscar'], engine_categories, searx.engines.engines, [])
|
||||
preferences.key_value_settings['safesearch'].parse(args.safesearch)
|
||||
|
||||
search_query = searx.webadapter.get_search_query_from_webapp(
|
||||
preferences, form)[0]
|
||||
return search_query
|
||||
|
||||
|
||||
def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Remove parsed url from dict."""
|
||||
for result in results:
|
||||
del result['parsed_url']
|
||||
return results
|
||||
|
||||
|
||||
def json_serial(obj: Any) -> Any:
|
||||
"""JSON serializer for objects not serializable by default json code.
|
||||
|
||||
:raise TypeError: raised when **obj** is not serializable
|
||||
"""
|
||||
if isinstance(obj, datetime):
|
||||
serial = obj.isoformat()
|
||||
return serial
|
||||
if isinstance(obj, bytes):
|
||||
return obj.decode('utf8')
|
||||
if isinstance(obj, set):
|
||||
return list(obj)
|
||||
raise TypeError("Type ({}) not serializable".format(type(obj)))
|
||||
|
||||
|
||||
def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
|
||||
"""Get result from parsed arguments."""
|
||||
result_container = searx.search.Search(search_query).search()
|
||||
result_container_json = {
|
||||
"search": {
|
||||
"q": search_query.query,
|
||||
"pageno": search_query.pageno,
|
||||
"lang": search_query.lang,
|
||||
"safesearch": search_query.safesearch,
|
||||
"timerange": search_query.time_range,
|
||||
},
|
||||
"results": no_parsed_url(result_container.get_ordered_results()),
|
||||
"infoboxes": result_container.infoboxes,
|
||||
"suggestions": list(result_container.suggestions),
|
||||
"answers": list(result_container.answers),
|
||||
"paging": result_container.paging,
|
||||
"results_number": result_container.results_number()
|
||||
}
|
||||
return result_container_json
|
||||
|
||||
|
||||
def parse_argument(
|
||||
args: Optional[List[str]]=None,
|
||||
category_choices: EngineCategoriesVar=None
|
||||
) -> argparse.Namespace:
|
||||
"""Parse command line.
|
||||
|
||||
:raise SystemExit: Query argument required on `args`
|
||||
|
||||
Examples:
|
||||
|
||||
>>> import importlib
|
||||
... # load module
|
||||
... spec = importlib.util.spec_from_file_location(
|
||||
... 'utils.standalone_searx', 'utils/standalone_searx.py')
|
||||
... sas = importlib.util.module_from_spec(spec)
|
||||
... spec.loader.exec_module(sas)
|
||||
... sas.parse_argument()
|
||||
usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]]
|
||||
query
|
||||
SystemExit: 2
|
||||
>>> sas.parse_argument(['rain'])
|
||||
Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None)
|
||||
""" # noqa: E501
|
||||
if not category_choices:
|
||||
category_choices = list(searx.engines.categories.keys())
|
||||
parser = argparse.ArgumentParser(description='Standalone searx.')
|
||||
parser.add_argument('query', type=str,
|
||||
help='Text query')
|
||||
parser.add_argument('--category', type=str, nargs='?',
|
||||
choices=category_choices,
|
||||
default='general',
|
||||
help='Search category')
|
||||
parser.add_argument('--lang', type=str, nargs='?', default='all',
|
||||
help='Search language')
|
||||
parser.add_argument('--pageno', type=int, nargs='?', default=1,
|
||||
help='Page number starting from 1')
|
||||
parser.add_argument(
|
||||
'--safesearch', type=str, nargs='?',
|
||||
choices=['0', '1', '2'], default='0',
|
||||
help='Safe content filter from none to strict')
|
||||
parser.add_argument(
|
||||
'--timerange', type=str,
|
||||
nargs='?', choices=['day', 'week', 'month', 'year'],
|
||||
help='Filter by time range')
|
||||
return parser.parse_args(args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
searx.search.initialize()
|
||||
engine_cs = list(searx.engines.categories.keys())
|
||||
prog_args = parse_argument(category_choices=engine_cs)
|
||||
search_q = get_search_query(prog_args, engine_categories=engine_cs)
|
||||
res_dict = to_dict(search_q)
|
||||
sys.stdout.write(dumps(
|
||||
res_dict, sort_keys=True, indent=4, ensure_ascii=False,
|
||||
default=json_serial))
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/sh
|
||||
|
||||
# script to easily update translation language files
|
||||
|
||||
# add new language:
|
||||
# pybabel init -i messages.pot -d searx/translations -l en
|
||||
|
||||
SEARX_DIR='searx'
|
||||
|
||||
pybabel extract -F babel.cfg -o messages.pot "$SEARX_DIR"
|
||||
for f in `ls "$SEARX_DIR"'/translations/'`; do
|
||||
pybabel update -N -i messages.pot -d "$SEARX_DIR"'/translations/' -l "$f"
|
||||
done
|
||||
|
||||
echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/'
|
Loading…
Add table
Add a link
Reference in a new issue