Merge branch 'unit-tests' of https://github.com/Cqoicebordel/searx into Cqoicebordel-unit-tests
Conflicts: searx/tests/test_engines.py
commit 7f865356f9
45 changed files with 3692 additions and 71 deletions
@@ -14,6 +14,7 @@
 from urllib import urlencode
 from cgi import escape
 from lxml import html
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['general']

@@ -55,8 +56,8 @@ def response(resp):
     for result in dom.xpath('//div[@class="sa_cc"]'):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
+        title = extract_text(link)
+        content = escape(extract_text(result.xpath('.//p')))

         # append result
         results.append({'url': url,

@@ -71,8 +72,8 @@ def response(resp):
     for result in dom.xpath('//li[@class="b_algo"]'):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
+        title = extract_text(link)
+        content = escape(extract_text(result.xpath('.//p')))

         # append result
         results.append({'url': url,
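The recurring change in this commit is replacing manual text joins such as ' '.join(link.xpath('.//text()')) with the shared extract_text() helper from searx.engines.xpath. A minimal standalone sketch of the difference; the sample markup and the simplified stand-in function are illustrative only, not part of the diff:

    from lxml import html

    doc = html.fromstring('<div><h3><a href="/x">Foo <b>bar</b>\n baz</a></h3></div>')
    link = doc.xpath('.//h3/a')[0]

    # old style: join every text node, which keeps stray whitespace
    title_old = ' '.join(link.xpath('.//text()'))

    # new style: one helper that flattens the node and normalises whitespace
    def extract_text_sketch(node):
        # simplified stand-in for searx.engines.xpath.extract_text
        return ' '.join(node.text_content().split())

    title_new = extract_text_sketch(link)
    print(repr(title_old))   # 'Foo  bar \n baz' (roughly)
    print(repr(title_new))   # 'Foo bar baz'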
@@ -33,7 +33,10 @@ def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

     # required for cookie
-    language = 'en-US'
+    if params['language'] == 'all':
+        language = 'en-US'
+    else:
+        language = params['language'].replace('_', '-')

     search_path = search_string.format(
         query=urlencode({'q': query}),
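The request() change above stops hardcoding the cookie locale and derives it from params['language'] instead. A small sketch of that normalisation on its own (the helper name is only for illustration):

    def normalize_language(lang):
        # 'all' has no single locale, so fall back to en-US;
        # otherwise turn 'fr_FR'-style codes into the 'fr-FR' form the cookie expects
        if lang == 'all':
            return 'en-US'
        return lang.replace('_', '-')

    assert normalize_language('all') == 'en-US'
    assert normalize_language('de_DE') == 'de-DE'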
@@ -15,6 +15,7 @@ from lxml import html
 from datetime import datetime, timedelta
 from dateutil import parser
 import re
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['news']

@@ -42,6 +43,7 @@ def request(query, params):
     params['cookies']['_FP'] = "ui=en-US"
+
     params['url'] = base_url + search_path

     return params

@@ -55,44 +57,35 @@ def response(resp):
     for result in dom.xpath('//div[@class="sn_r"]'):
         link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
-                                    '//span[@class="sn_snip"]//text()')
-        if contentXPath is not None:
-            content = escape(' '.join(contentXPath))
+        title = extract_text(link)
+        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
+        content = escape(extract_text(contentXPath))

         # parse publishedDate
         publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                           '//span[contains(@class,"sn_ST")]'
-                                          '//span[contains(@class,"sn_tm")]'
-                                          '//text()')
-        if publishedDateXPath is not None:
-            publishedDate = escape(' '.join(publishedDateXPath))
+                                          '//span[contains(@class,"sn_tm")]')
+
+        publishedDate = escape(extract_text(publishedDateXPath))

         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
             timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(minutes=int(timeNumbers[0]))
+            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
         elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
             timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(hours=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|),"
-                      " [0-9]+ minute(s|) ago$", publishedDate):
+            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
             timeNumbers = re.findall(r'\d+', publishedDate)
             publishedDate = datetime.now()\
                 - timedelta(hours=int(timeNumbers[0]))\
                 - timedelta(minutes=int(timeNumbers[1]))
         elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
             timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(days=int(timeNumbers[0]))
+            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
         else:
             try:
                 # FIXME use params['language'] to parse either mm/dd or dd/mm
                 publishedDate = parser.parse(publishedDate, dayfirst=False)
             except TypeError:
                 # FIXME
                 publishedDate = datetime.now()

         # append result
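The publishedDate branch above converts Bing's relative timestamps ("12 minutes ago", "3 hours, 20 minutes ago", ...) into datetime objects, falling back to dateutil for absolute dates. A self-contained sketch of the same idea, condensed into one helper; the function name and the extra ValueError handling are illustrative additions:

    import re
    from datetime import datetime, timedelta
    from dateutil import parser

    def parse_published(text, now=None):
        now = now or datetime.now()
        numbers = [int(n) for n in re.findall(r'\d+', text)]
        if re.match(r"^[0-9]+ minute(s|) ago$", text):
            return now - timedelta(minutes=numbers[0])
        if re.match(r"^[0-9]+ hour(s|) ago$", text):
            return now - timedelta(hours=numbers[0])
        if re.match(r"^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", text):
            return now - timedelta(hours=numbers[0], minutes=numbers[1])
        if re.match(r"^[0-9]+ day(s|) ago$", text):
            return now - timedelta(days=numbers[0])
        try:
            # absolute dates, e.g. '1/15/2015'
            return parser.parse(text, dayfirst=False)
        except (TypeError, ValueError):
            return now

    print(parse_published("3 hours, 20 minutes ago"))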
@@ -23,11 +23,6 @@ paging = True
 url = 'https://btdigg.org'
 search_url = url + '/search?q={search_term}&p={pageno}'

-# specific xpath variables
-magnet_xpath = './/a[@title="Torrent magnet link"]'
-torrent_xpath = './/a[@title="Download torrent file"]'
-content_xpath = './/span[@class="font11px lightgrey block"]'
-

 # do search-request
 def request(query, params):

@@ -52,8 +47,8 @@ def response(resp):
     # parse results
     for result in search_res:
         link = result.xpath('.//td[@class="torrent_name"]//a')[0]
-        href = urljoin(url, link.attrib['href'])
-        title = escape(extract_text(link.xpath('.//text()')))
+        href = urljoin(url, link.attrib.get('href'))
+        title = escape(extract_text(link))
         content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
         content = "<br />".join(content.split("\n"))

@@ -81,7 +76,7 @@ def response(resp):
                 filesize = int(filesize * 1024 * 1024 * 1024)
             elif filesize_multiplier == 'MB':
                 filesize = int(filesize * 1024 * 1024)
-            elif filesize_multiplier == 'kb':
+            elif filesize_multiplier == 'KB':
                 filesize = int(filesize * 1024)
         except:
             filesize = None
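Both torrent engines touched by this commit fix the same comparison: the page reports sizes with an uppercase 'KB', so matching the lowercase 'kb' meant kilobyte results never got a byte count. A standalone sketch of the conversion (helper name and return value are illustrative):

    def to_bytes(filesize, multiplier):
        try:
            filesize = float(filesize)
            if multiplier == 'GB':
                return int(filesize * 1024 * 1024 * 1024)
            elif multiplier == 'MB':
                return int(filesize * 1024 * 1024)
            elif multiplier == 'KB':   # was 'kb', which never matched
                return int(filesize * 1024)
            return int(filesize)
        except (TypeError, ValueError):
            return None

    assert to_bytes('2', 'KB') == 2048
    assert to_bytes('1.5', 'MB') == 1572864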
@@ -14,6 +14,7 @@ from urllib import urlencode
 from urlparse import urljoin
 from lxml import html
 import re
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['images']

@@ -50,9 +51,9 @@ def response(resp):
     for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
         link = result.xpath('.//a[contains(@class, "thumb")]')[0]
         url = urljoin(base_url, link.attrib.get('href'))
-        title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')  # noqa
-        title = ''.join(title_links[0].xpath('.//text()'))
-        thumbnail_src = link.xpath('.//img')[0].attrib['src']
+        title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')
+        title = extract_text(title_links[0])
+        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
         img_src = regex.sub('/', thumbnail_src)

         # append result
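Several engines here also switch from link.attrib['href'] / attrib['src'] to attrib.get(...), so a result node that lacks the attribute yields None instead of raising KeyError and aborting the whole response. A tiny illustration with made-up markup:

    from lxml import html

    link = html.fromstring('<div><a class="thumb">no href here</a></div>').xpath('.//a')[0]

    # link.attrib['href'] would raise KeyError for this node
    href = link.attrib.get('href')   # None instead of an exception
    print(href)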
@@ -44,7 +44,7 @@ def response(resp):

     search_result = loads(resp.text)

-    if search_result['html'] == '':
+    if 'html' not in search_result or search_result['html'] == '':
         return results

     dom = html.fromstring(search_result['html'])
@@ -21,7 +21,7 @@ logger = logger.getChild('flickr-noapi')
 categories = ['images']

 url = 'https://secure.flickr.com/'
-search_url = url+'search/?{query}&page={page}'
+search_url = url + 'search/?{query}&page={page}'
 photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
 regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
 image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
@@ -18,7 +18,7 @@ paging = True

 # search-url
 url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'  # noqa
+search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'


 # do search-request

@@ -45,14 +45,14 @@ def response(resp):
     for result in search_res['responseData']['results']:
         href = result['originalContextUrl']
         title = result['title']
-        if not result['url']:
+        if 'url' not in result:
             continue
         thumbnail_src = result['tbUrl']

         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': '',
+                        'content': result['content'],
                         'thumbnail_src': thumbnail_src,
                         'img_src': unquote(result['url']),
                         'template': 'images.html'})
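The Google engines (and the searchcode engines further down) apply the same defensive pattern to the JSON payload: iterate over .get('results', []) and skip entries without a 'url' key, so a sparse or empty payload produces fewer results instead of a KeyError. A short sketch with fabricated data:

    search_res = {'responseData': {'results': [
        {'url': 'https://example.org/a', 'title': 'A', 'content': 'first'},
        {'title': 'broken entry without url'},
    ]}}

    results = []
    for result in search_res['responseData'].get('results', []):
        if 'url' not in result:
            continue   # skip malformed entries instead of crashing
        results.append({'url': result['url'],
                        'title': result['title'],
                        'content': result.get('content', '')})

    print(results)   # only the complete entry survives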
@@ -20,7 +20,7 @@ language_support = True

 # engine dependent config
 url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
+search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}'


 # do search-request

@@ -33,7 +33,7 @@ def request(query, params):

     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
-                                      language=language)
+                                      lang=language)

     return params

@@ -52,6 +52,8 @@ def response(resp):
     for result in search_res['responseData']['results']:
         # parse publishedDate
         publishedDate = parser.parse(result['publishedDate'])
+        if 'url' not in result:
+            continue

         # append result
         results.append({'url': result['unescapedUrl'],
@@ -13,6 +13,7 @@ from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -56,9 +57,8 @@ def response(resp):
     for result in search_res[1:]:
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(html.tostring(result.xpath(content_xpath)[0],
-                                       method="text"))
+        title = extract_text(link)
+        content = escape(extract_text(result.xpath(content_xpath)))
         seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
         leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
         filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0]

@@ -88,7 +88,7 @@ def response(resp):
                 filesize = int(filesize * 1024 * 1024 * 1024)
             elif filesize_multiplier == 'MB':
                 filesize = int(filesize * 1024 * 1024)
-            elif filesize_multiplier == 'kb':
+            elif filesize_multiplier == 'KB':
                 filesize = int(filesize * 1024)
         except:
             filesize = None
@@ -13,6 +13,7 @@ from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -29,7 +30,8 @@ search_types = {'files': '0',

 # specific xpath variables
 magnet_xpath = './/a[@title="Download this torrent using magnet"]'
-content_xpath = './/font[@class="detDesc"]//text()'
+torrent_xpath = './/a[@title="Download this torrent"]'
+content_xpath = './/font[@class="detDesc"]'


 # do search-request

@@ -59,8 +61,8 @@ def response(resp):
     for result in search_res[1:]:
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath(content_xpath)))
+        title = extract_text(link)
+        content = escape(extract_text(result.xpath(content_xpath)))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

         # convert seed to int if possible

@@ -76,6 +78,7 @@ def response(resp):
             leech = 0

         magnetlink = result.xpath(magnet_xpath)[0]
+        torrentfile = result.xpath(torrent_xpath)[0]

         # append result
         results.append({'url': href,

@@ -83,7 +86,8 @@ def response(resp):
                         'content': content,
                         'seed': seed,
                         'leech': leech,
-                        'magnetlink': magnetlink.attrib['href'],
+                        'magnetlink': magnetlink.attrib.get('href'),
+                        'torrentfile': torrentfile.attrib.get('href'),
                         'template': 'torrent.html'})

     # return results sorted by seeder
@@ -42,7 +42,7 @@ def response(resp):
     search_results = loads(resp.text)

     # parse results
-    for result in search_results['results']:
+    for result in search_results.get('results', []):
         href = result['url']
         title = "" + result['name'] + " - " + result['filename']
         repo = result['repo']
@@ -35,7 +35,7 @@ def response(resp):
     search_results = loads(resp.text)

     # parse results
-    for result in search_results['results']:
+    for result in search_results.get('results', []):
         href = result['url']
         title = "[" + result['type'] + "] " +\
                 result['namespace'] +\
@@ -12,6 +12,7 @@ from urlparse import urljoin
 from cgi import escape
 from urllib import urlencode
 from lxml import html
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['it']

@@ -24,8 +25,7 @@ search_url = url+'search?{query}&page={pageno}'
 # specific xpath variables
 results_xpath = '//div[contains(@class,"question-summary")]'
 link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
-title_xpath = './/text()'
-content_xpath = './/div[@class="excerpt"]//text()'
+content_xpath = './/div[@class="excerpt"]'


 # do search-request

@@ -46,8 +46,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(' '.join(link.xpath(title_xpath)))
-        content = escape(' '.join(result.xpath(content_xpath)))
+        title = escape(extract_text(link))
+        content = escape(extract_text(result.xpath(content_xpath)))

         # append result
         results.append({'url': href,
@@ -59,8 +59,7 @@ def response(resp):
         url = base_url + videoid
         title = p.unescape(extract_text(result.xpath(title_xpath)))
         thumbnail = extract_text(result.xpath(content_xpath)[0])
-        publishedDate = parser.parse(extract_text(
-            result.xpath(publishedDate_xpath)[0]))
+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
         embedded = embedded_url.format(videoid=videoid)

         # append result
@@ -15,6 +15,7 @@ from urllib import urlencode
 from urlparse import urljoin
 from lxml import html
 import re
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['images']

@@ -22,7 +23,7 @@ paging = True

 # search-url
 base_url = 'https://500px.com'
-search_url = base_url+'/search?search?page={pageno}&type=photos&{query}'
+search_url = base_url + '/search?search?page={pageno}&type=photos&{query}'


 # do search-request

@@ -44,11 +45,11 @@ def response(resp):
     for result in dom.xpath('//div[@class="photo"]'):
         link = result.xpath('.//a')[0]
         url = urljoin(base_url, link.attrib.get('href'))
-        title = result.xpath('.//div[@class="title"]//text()')[0]
-        thumbnail_src = link.xpath('.//img')[0].attrib['src']
+        title = extract_text(result.xpath('.//div[@class="title"]'))
+        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
         # To have a bigger thumbnail, uncomment the next line
-        #thumbnail_src = regex.sub('4.jpg', thumbnail_src)
-        content = result.xpath('.//div[@class="info"]//text()')[0]
+        # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
+        content = extract_text(result.xpath('.//div[@class="info"]'))
         img_src = regex.sub('2048.jpg', thumbnail_src)

         # append result
@@ -28,13 +28,13 @@ def extract_text(xpath_results):
         result = ''
         for e in xpath_results:
             result = result + extract_text(e)
-        return result
+        return result.strip()
     elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
         # it's a string
         return ''.join(xpath_results)
     else:
         # it's a element
-        return html_to_text(xpath_results.text_content())
+        return html_to_text(xpath_results.text_content()).strip()


 def extract_url(xpath_results, search_url):
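extract_text() itself now strips surrounding whitespace in both the list branch and the element branch, which is what lets the engines above drop their ' '.join(...) calls. A quick demonstration of the element case; the sample markup is invented and text_content() stands in for the html_to_text() step:

    from lxml import html

    dom = html.fromstring('<div><p>  Hello <b>world</b>\n</p></div>')
    node = dom.xpath('.//p')[0]

    print(repr(node.text_content()))           # '  Hello world\n'
    print(repr(node.text_content().strip()))   # 'Hello world'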
@@ -57,7 +57,7 @@ def response(resp):
         url = [x['href'] for x in result['link'] if x['type'] == 'text/html']

         if not url:
-            return
+            continue

         # remove tracking
         url = url[0].replace('feature=youtube_gdata', '')

@@ -73,7 +73,7 @@ def response(resp):
         pubdate = result['published']['$t']
         publishedDate = parser.parse(pubdate)

-        if result['media$group']['media$thumbnail']:
+        if 'media$thumbnail' in result['media$group']:
             thumbnail = result['media$group']['media$thumbnail'][0]['url']

         content = result['content']['$t']
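The youtube change swaps return for continue inside the result loop: a single entry without a text/html link used to end response() early and drop every remaining result, while continue only skips the bad entry. A minimal illustration with made-up data:

    entries = [{'link': 'https://example.org/1'},
               {'link': None},
               {'link': 'https://example.org/3'}]

    def collect(entries):
        results = []
        for entry in entries:
            if not entry['link']:
                continue   # with `return` here the third entry would be lost too
            results.append(entry['link'])
        return results

    print(collect(entries))   # both valid links survive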