forked from Icycoide/searxng
[fix] bing_news based on RSS output format
This commit is contained in:
parent cdf931f949
commit 62cc2a5658
3 changed files with 157 additions and 238 deletions
@@ -6,18 +6,17 @@
               max. 5000 query/month
 
  @using-api   no (because of query limit)
- @results     HTML (using search portal)
- @stable      no (HTML can change)
- @parse       url, title, content, publishedDate
+ @results     RSS (using search portal)
+ @stable      yes (except perhaps for the images)
+ @parse       url, title, content, publishedDate, thumbnail
 """
 
 from urllib import urlencode
-from cgi import escape
-from lxml import html
-from datetime import datetime, timedelta
+from urlparse import urlparse, parse_qsl
+from datetime import datetime
 from dateutil import parser
-import re
-from searx.engines.xpath import extract_text
+from lxml import etree
+from searx.utils import list_get
 
 # engine dependent config
 categories = ['news']
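Among the new imports, searx.utils.list_get is a small "index with default" helper that response() uses below to read optional RSS fields without pre-checking each XPath result. Roughly, it behaves like this simplified sketch (a stand-in for illustration, not the exact upstream implementation):

# Simplified stand-in for searx.utils.list_get, for illustration only:
# return the element at `index`, or `default` when the list is too short.
def list_get(a_list, index, default=None):
    if len(a_list) > index:
        return a_list[index]
    return default

print(list_get(['only title'], 0, 'fallback'))   # -> only title
print(list_get([], 0, 'fallback'))               # -> fallback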
@@ -26,7 +25,25 @@ language_support = True
 
 # search-url
 base_url = 'https://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
+search_string = 'news/search?{query}&first={offset}&format=RSS'
+
+
+# remove click
+def url_cleanup(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
+        query = dict(parse_qsl(parsed_url.query))
+        return query.get('url', None)
+    return url_string
+
+
+# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
+def image_url_cleanup(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
+        query = dict(parse_qsl(parsed_url.query))
+        return "https://www.bing.com/th?id=" + query.get('id')
+    return url_string
 
 
 # do search-request
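For illustration, this is roughly how the two helpers added above behave; the example URLs are invented, not taken from the commit:

# Illustration only - invented URLs, using the url_cleanup / image_url_cleanup
# helpers defined in the hunk above (assumed importable from the engine module).
from searx.engines.bing_news import url_cleanup, image_url_cleanup

# Bing's RSS <link> points at a click-tracking redirect; url_cleanup returns
# the real article URL found in its 'url' query parameter.
print(url_cleanup('https://www.bing.com/news/apiclick.aspx?'
                  'url=https%3A%2F%2Fexample.org%2Fstory'))
# -> https://example.org/story

# Thumbnails come from *.bing4.com over plain HTTP; image_url_cleanup rewrites
# them to the https://www.bing.com/th endpoint, keeping only the image id.
print(image_url_cleanup('http://img-s.bing4.com/th?id=ON.1234&pid=News'))
# -> https://www.bing.com/th?id=ON.1234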
@@ -42,8 +59,6 @@ def request(query, params):
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)
 
-    params['cookies']['_FP'] = "ui=en-US"
-
     params['url'] = base_url + search_path
 
     return params
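With format=RSS appended to search_string and the _FP cookie dropped, the URL built by request() looks roughly like the following; the query, market, and offset are invented placeholder values:

# Self-contained illustration (Python 2 urllib, matching the module) of the
# URL request() now produces; all values are example placeholders.
from urllib import urlencode

base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}&format=RSS'

search_path = search_string.format(
    query=urlencode({'q': 'test query', 'setmkt': 'en-US'}),
    offset=1)

print(base_url + search_path)
# e.g. https://www.bing.com/news/search?q=test+query&setmkt=en-US&first=1&format=RSS
# (parameter order may vary, since urlencode follows dict ordering)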
@@ -53,50 +68,44 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = html.fromstring(resp.content)
+    rss = etree.fromstring(resp.content)
+
+    ns = rss.nsmap
 
     # parse results
-    for result in dom.xpath('//div[@class="sn_r"]'):
-        link = result.xpath('.//div[@class="newstitle"]/a')[0]
-        url = link.attrib.get('href')
-        title = extract_text(link)
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
-        content = escape(extract_text(contentXPath))
+    for item in rss.xpath('./channel/item'):
+        # url / title / content
+        url = url_cleanup(item.xpath('./link/text()')[0])
+        title = list_get(item.xpath('./title/text()'), 0, url)
+        content = list_get(item.xpath('./description/text()'), 0, '')
 
-        # parse publishedDate
-        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
-                                          '//div[contains(@class,"sn_ST")]'
-                                          '//span[contains(@class,"sn_tm")]')
+        # publishedDate
+        publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
+        try:
+            publishedDate = parser.parse(publishedDate, dayfirst=False)
+        except TypeError:
+            publishedDate = datetime.now()
+        except ValueError:
+            publishedDate = datetime.now()
 
-        publishedDate = escape(extract_text(publishedDateXPath))
-
-        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(hours=int(timeNumbers[0]))\
-                - timedelta(minutes=int(timeNumbers[1]))
-        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
-        else:
-            try:
-                publishedDate = parser.parse(publishedDate, dayfirst=False)
-            except TypeError:
-                publishedDate = datetime.now()
-            except ValueError:
-                publishedDate = datetime.now()
+        # thumbnail
+        thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
+        if thumbnail is not None:
+            thumbnail = image_url_cleanup(thumbnail)
 
         # append result
-        results.append({'url': url,
-                        'title': title,
-                        'publishedDate': publishedDate,
-                        'content': content})
+        if thumbnail is not None:
+            results.append({'template': 'videos.html',
+                            'url': url,
+                            'title': title,
+                            'publishedDate': publishedDate,
+                            'content': content,
+                            'thumbnail': thumbnail})
+        else:
+            results.append({'url': url,
+                            'title': title,
+                            'publishedDate': publishedDate,
+                            'content': content})
 
     # return results
     return results
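To make the new parsing concrete, here is a minimal hand-written feed in the shape response() expects; the item content and the News namespace URI are invented, not captured from Bing:

# Invented sample feed - only the element names mirror what response() reads.
from lxml import etree

sample = '''<rss version="2.0" xmlns:News="https://www.bing.com/news">
<channel>
  <item>
    <title>Example headline</title>
    <link>https://www.bing.com/news/apiclick.aspx?url=https%3A%2F%2Fexample.org%2Fstory</link>
    <description>Example snippet</description>
    <pubDate>Tue, 02 Feb 2016 14:00:00 GMT</pubDate>
    <News:Image>http://img-s.bing4.com/th?id=ON.1234</News:Image>
  </item>
</channel>
</rss>'''

rss = etree.fromstring(sample)
ns = rss.nsmap                                   # here {'News': 'https://www.bing.com/news'}
item = rss.xpath('./channel/item')[0]
print(item.xpath('./link/text()')[0])            # redirect URL, unwrapped by url_cleanup()
print(item.xpath('./pubDate/text()')[0])         # parsed with dateutil in response()
print(item.xpath('./News:Image/text()', namespaces=ns)[0])   # thumbnail candidate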