[enh] Add onions category with Ahmia, Not Evil and Torch

Xpath engine and results template changed to account for the fact that
archive.org doesn't cache .onions, though some onion engines migth have
their own cache.

Disabled by default. Can be enabled by setting the SOCKS proxies to
wherever Tor is listening and setting using_tor_proxy as True.

Requires Tor and updating packages.

To avoid manually adding the timeout on each engine, you can set
extra_proxy_timeout to account for Tor's (or whatever proxy used) extra
time.
This commit is contained in:
a01200356 2016-05-19 00:38:43 -05:00 committed by Marc Abonce Seguin
parent 0a44fa8bb7
commit c3daa08537
11 changed files with 399 additions and 14 deletions

64
searx/engines/not_evil.py Normal file
View file

@ -0,0 +1,64 @@
"""
not Evil (Onions)
@website http://hss3uro2hsxfogfq.onion
@provide-api yes (http://hss3uro2hsxfogfq.onion/api.htm)
@using-api no
@results HTML
@stable no
@parse url, title, content
"""
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['onions']
paging = True
page_size = 20
# search-url
base_url = 'http://hss3uro2hsxfogfq.onion/'
search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'
# specific xpath variables
results_xpath = '//*[@id="content"]/div/p'
url_xpath = './span[1]'
title_xpath = './a[1]'
content_xpath = './text()'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * page_size
params['url'] = base_url + search_url.format(pageno=offset,
query=urlencode({'q': query}),
page_size=page_size)
return params
# get response from search-request
def response(resp):
results = []
# needed because otherwise requests guesses wrong encoding
resp.encoding = 'utf8'
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
url = extract_text(result.xpath(url_xpath)[0])
title = extract_text(result.xpath(title_xpath)[0])
content = extract_text(result.xpath(content_xpath))
# append result
results.append({'url': url,
'title': title,
'content': content,
'is_onion': True})
return results