forked from Icycoide/searxng
[fix] bing_news based on RSS output format
This commit is contained in:
parent cdf931f949
commit 62cc2a5658
3 changed files with 157 additions and 238 deletions
@@ -6,18 +6,17 @@
               max. 5000 query/month
 
  @using-api   no (because of query limit)
- @results     HTML (using search portal)
- @stable      no (HTML can change)
- @parse       url, title, content, publishedDate
+ @results     RSS (using search portal)
+ @stable      yes (except perhaps for the images)
+ @parse       url, title, content, publishedDate, thumbnail
 """
 
 from urllib import urlencode
-from cgi import escape
-from lxml import html
-from datetime import datetime, timedelta
+from urlparse import urlparse, parse_qsl
+from datetime import datetime
 from dateutil import parser
-import re
-from searx.engines.xpath import extract_text
+from lxml import etree
+from searx.utils import list_get
 
 # engine dependent config
 categories = ['news']
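Among the new imports, searx.utils.list_get is a small "index with default" helper that response() uses below to read optional RSS fields without pre-checking each XPath result. Roughly, it behaves like this simplified sketch (a stand-in for illustration, not the exact upstream implementation):

# Simplified stand-in for searx.utils.list_get, for illustration only:
# return the element at `index`, or `default` when the list is too short.
def list_get(a_list, index, default=None):
    if len(a_list) > index:
        return a_list[index]
    return default

print(list_get(['only title'], 0, 'fallback'))   # -> only title
print(list_get([], 0, 'fallback'))               # -> fallback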
@@ -26,7 +25,25 @@ language_support = True
 
 # search-url
 base_url = 'https://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
+search_string = 'news/search?{query}&first={offset}&format=RSS'
+
+
+# remove click
+def url_cleanup(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
+        query = dict(parse_qsl(parsed_url.query))
+        return query.get('url', None)
+    return url_string
+
+
+# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
+def image_url_cleanup(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
+        query = dict(parse_qsl(parsed_url.query))
+        return "https://www.bing.com/th?id=" + query.get('id')
+    return url_string
 
 
 # do search-request
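For illustration, this is roughly how the two helpers added above behave; the example URLs are invented, not taken from the commit:

# Illustration only - invented URLs, using the url_cleanup / image_url_cleanup
# helpers defined in the hunk above (assumed importable from the engine module).
from searx.engines.bing_news import url_cleanup, image_url_cleanup

# Bing's RSS <link> points at a click-tracking redirect; url_cleanup returns
# the real article URL found in its 'url' query parameter.
print(url_cleanup('https://www.bing.com/news/apiclick.aspx?'
                  'url=https%3A%2F%2Fexample.org%2Fstory'))
# -> https://example.org/story

# Thumbnails come from *.bing4.com over plain HTTP; image_url_cleanup rewrites
# them to the https://www.bing.com/th endpoint, keeping only the image id.
print(image_url_cleanup('http://img-s.bing4.com/th?id=ON.1234&pid=News'))
# -> https://www.bing.com/th?id=ON.1234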
@@ -42,8 +59,6 @@ def request(query, params):
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)
 
-    params['cookies']['_FP'] = "ui=en-US"
-
     params['url'] = base_url + search_path
 
     return params
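With format=RSS appended to search_string and the _FP cookie dropped, the URL built by request() looks roughly like the following; the query, market, and offset are invented placeholder values:

# Self-contained illustration (Python 2 urllib, matching the module) of the
# URL request() now produces; all values are example placeholders.
from urllib import urlencode

base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}&format=RSS'

search_path = search_string.format(
    query=urlencode({'q': 'test query', 'setmkt': 'en-US'}),
    offset=1)

print(base_url + search_path)
# e.g. https://www.bing.com/news/search?q=test+query&setmkt=en-US&first=1&format=RSS
# (parameter order may vary, since urlencode follows dict ordering)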
@@ -53,50 +68,44 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = html.fromstring(resp.content)
+    rss = etree.fromstring(resp.content)
+
+    ns = rss.nsmap
 
     # parse results
-    for result in dom.xpath('//div[@class="sn_r"]'):
-        link = result.xpath('.//div[@class="newstitle"]/a')[0]
-        url = link.attrib.get('href')
-        title = extract_text(link)
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
-        content = escape(extract_text(contentXPath))
+    for item in rss.xpath('./channel/item'):
+        # url / title / content
+        url = url_cleanup(item.xpath('./link/text()')[0])
+        title = list_get(item.xpath('./title/text()'), 0, url)
+        content = list_get(item.xpath('./description/text()'), 0, '')
 
-        # parse publishedDate
-        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
-                                          '//div[contains(@class,"sn_ST")]'
-                                          '//span[contains(@class,"sn_tm")]')
+        # publishedDate
+        publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
+        try:
+            publishedDate = parser.parse(publishedDate, dayfirst=False)
+        except TypeError:
+            publishedDate = datetime.now()
+        except ValueError:
+            publishedDate = datetime.now()
 
-        publishedDate = escape(extract_text(publishedDateXPath))
-
-        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(hours=int(timeNumbers[0]))\
-                - timedelta(minutes=int(timeNumbers[1]))
-        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
-        else:
-            try:
-                publishedDate = parser.parse(publishedDate, dayfirst=False)
-            except TypeError:
-                publishedDate = datetime.now()
-            except ValueError:
-                publishedDate = datetime.now()
+        # thumbnail
+        thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
+        if thumbnail is not None:
+            thumbnail = image_url_cleanup(thumbnail)
 
         # append result
-        results.append({'url': url,
-                        'title': title,
-                        'publishedDate': publishedDate,
-                        'content': content})
+        if thumbnail is not None:
+            results.append({'template': 'videos.html',
+                            'url': url,
+                            'title': title,
+                            'publishedDate': publishedDate,
+                            'content': content,
+                            'thumbnail': thumbnail})
+        else:
+            results.append({'url': url,
+                            'title': title,
+                            'publishedDate': publishedDate,
+                            'content': content})
 
     # return results
     return results
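To make the new parsing concrete, here is a minimal hand-written feed in the shape response() expects; the item content and the News namespace URI are invented, not captured from Bing:

# Invented sample feed - only the element names mirror what response() reads.
from lxml import etree

sample = '''<rss version="2.0" xmlns:News="https://www.bing.com/news">
<channel>
  <item>
    <title>Example headline</title>
    <link>https://www.bing.com/news/apiclick.aspx?url=https%3A%2F%2Fexample.org%2Fstory</link>
    <description>Example snippet</description>
    <pubDate>Tue, 02 Feb 2016 14:00:00 GMT</pubDate>
    <News:Image>http://img-s.bing4.com/th?id=ON.1234</News:Image>
  </item>
</channel>
</rss>'''

rss = etree.fromstring(sample)
ns = rss.nsmap                                   # here {'News': 'https://www.bing.com/news'}
item = rss.xpath('./channel/item')[0]
print(item.xpath('./link/text()')[0])            # redirect URL, unwrapped by url_cleanup()
print(item.xpath('./pubDate/text()')[0])         # parsed with dateutil in response()
print(item.xpath('./News:Image/text()', namespaces=ns)[0])   # thumbnail candidate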