refactor engine's search language handling

Add a match_language function in utils to match any user-given
language code against an engine's list of supported languages.

Also add a language_aliases dict to each engine to translate
standard language codes into the custom codes used by that engine.
This commit is contained in:
Marc Abonce Seguin 2018-02-28 22:30:48 -06:00
parent d1eae9359f
commit 772c048d01
42 changed files with 275 additions and 171 deletions

View file

@ -19,12 +19,17 @@ class TestArchLinuxEngine(SearxTestCase):
query = 'test_query'
dic = defaultdict(dict)
dic['pageno'] = 1
dic['language'] = 'en_US'
dic['language'] = 'en-US'
params = archlinux.request(query, dic)
self.assertTrue('url' in params)
self.assertTrue(query in params['url'])
self.assertTrue('wiki.archlinux.org' in params['url'])
for lang, name in archlinux.main_langs:
dic['language'] = lang
params = archlinux.request(query, dic)
self.assertTrue(name in params['url'])
for lang, domain in domains.items():
dic['language'] = lang
params = archlinux.request(query, dic)

View file

@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
class TestBingEngine(SearxTestCase):
def test_request(self):
bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
query = u'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0

View file

@ -9,7 +9,6 @@ class TestBingImagesEngine(SearxTestCase):
def test_request(self):
bing_images.supported_languages = ['fr-FR', 'en-US']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -8,10 +8,11 @@ import lxml
class TestBingNewsEngine(SearxTestCase):
def test_request(self):
bing_news.supported_languages = ['en', 'fr']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
dicto['time_range'] = ''
params = bing_news.request(query, dicto)
self.assertIn('url', params)

View file

@ -9,7 +9,6 @@ class TestBingVideosEngine(SearxTestCase):
def test_request(self):
bing_videos.supported_languages = ['fr-FR', 'en-US']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -8,10 +8,11 @@ from searx.testing import SearxTestCase
class TestDailymotionEngine(SearxTestCase):
def test_request(self):
dailymotion.supported_languages = ['en', 'fr']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
params = dailymotion.request(query, dicto)
self.assertTrue('url' in params)
self.assertTrue(query in params['url'])

View file

@ -1,18 +1,21 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import mock
from searx.engines import duckduckgo
from searx.engines import load_engine, duckduckgo
from searx.testing import SearxTestCase
class TestDuckduckgoEngine(SearxTestCase):
def test_request(self):
duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['language'] = 'de-CH'
dicto['time_range'] = ''
dicto['language'] = 'de-CH'
params = duckduckgo.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
@ -20,16 +23,19 @@ class TestDuckduckgoEngine(SearxTestCase):
self.assertIn('ch-de', params['url'])
self.assertIn('s=0', params['url'])
# when ddg uses non standard code
# when ddg uses non standard codes
dicto['language'] = 'zh-HK'
params = duckduckgo.request(query, dicto)
self.assertIn('hk-tzh', params['url'])
dicto['language'] = 'en-GB'
params = duckduckgo.request(query, dicto)
self.assertIn('uk-en', params['url'])
# no country given
duckduckgo.supported_languages = ['de-CH', 'en-US']
dicto['language'] = 'de'
dicto['language'] = 'en'
params = duckduckgo.request(query, dicto)
self.assertIn('ch-de', params['url'])
self.assertIn('us-en', params['url'])
def test_no_url_in_request_year_time_range(self):
dicto = defaultdict(dict)

View file

@ -18,6 +18,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
self.assertEqual(result, 'Text in link')
def test_request(self):
duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -9,7 +9,6 @@ class TestDuckduckgoImagesEngine(SearxTestCase):
def test_request(self):
duckduckgo_images.supported_languages = ['de-CH', 'en-US']
query = 'test_query'
dicto = defaultdict(dict)
dicto['is_test'] = True

View file

@ -15,6 +15,8 @@ class TestGoogleEngine(SearxTestCase):
return response
def test_request(self):
google.supported_languages = ['en', 'fr', 'zh-CN']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
@ -31,6 +33,11 @@ class TestGoogleEngine(SearxTestCase):
self.assertIn('google.co', params['url'])
self.assertIn('en', params['headers']['Accept-Language'])
dicto['language'] = 'zh'
params = google.request(query, dicto)
self.assertIn('google.com', params['url'])
self.assertIn('zh-CN', params['headers']['Accept-Language'])
def test_response(self):
self.assertRaises(AttributeError, google.response, None)
self.assertRaises(AttributeError, google.response, [])

View file

@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
class TestGoogleNewsEngine(SearxTestCase):
def test_request(self):
google_news.supported_languages = ['en-US', 'fr-FR']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
class TestQwantEngine(SearxTestCase):
def test_request(self):
qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0
@ -26,7 +27,6 @@ class TestQwantEngine(SearxTestCase):
self.assertIn('en_us', params['url'])
self.assertIn('news', params['url'])
qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
dicto['language'] = 'fr'
params = qwant.request(query, dicto)
self.assertIn('fr_fr', params['url'])

View file

@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
class TestSwisscowsEngine(SearxTestCase):
def test_request(self):
swisscows.supported_languages = ['de-AT', 'de-DE']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
class TestWikidataEngine(SearxTestCase):
def test_request(self):
wikidata.supported_languages = ['en', 'es']
query = 'test_query'
dicto = defaultdict(dict)
dicto['language'] = 'en-US'

View file

@ -25,11 +25,12 @@ class TestYahooEngine(SearxTestCase):
self.assertEqual('https://this.is.the.url/', url)
def test_request(self):
yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['time_range'] = ''
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
params = yahoo.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
@ -39,6 +40,16 @@ class TestYahooEngine(SearxTestCase):
self.assertIn('sB', params['cookies'])
self.assertIn('fr', params['cookies']['sB'])
dicto['language'] = 'zh'
params = yahoo.request(query, dicto)
self.assertIn('zh_chs', params['url'])
self.assertIn('zh_chs', params['cookies']['sB'])
dicto['language'] = 'zh-TW'
params = yahoo.request(query, dicto)
self.assertIn('zh_cht', params['url'])
self.assertIn('zh_cht', params['cookies']['sB'])
def test_no_url_in_request_year_time_range(self):
dicto = defaultdict(dict)
query = 'test_query'
@ -168,5 +179,5 @@ class TestYahooEngine(SearxTestCase):
self.assertEqual(type(languages), list)
self.assertEqual(len(languages), 3)
self.assertIn('ar', languages)
self.assertIn('zh-chs', languages)
self.assertIn('zh-cht', languages)
self.assertIn('zh-CHS', languages)
self.assertIn('zh-CHT', languages)

View file

@ -9,10 +9,11 @@ from searx.testing import SearxTestCase
class TestYahooNewsEngine(SearxTestCase):
def test_request(self):
yahoo_news.supported_languages = ['en', 'fr']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
params = yahoo_news.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])

View file

@ -65,6 +65,31 @@ class TestUtils(SearxTestCase):
for test_url, expected in data:
self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
def test_match_language(self):
# Checks utils.match_language(code, supported_codes[, aliases][, fallback=...])
# against the expected matched code for a range of inputs.
# basic cases: exact match, empty supported list (the fallback is returned),
# and a caller-supplied alias dict mapping 'ja' to the engine code 'jp'
self.assertEqual(utils.match_language('es', ['es']), 'es')
self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
# alias dict shared by the cases below that pass a third argument
aliases = {'en-GB': 'en-UK', 'he': 'iw'}
# guess country: the requested code and the supported codes differ in
# whether (or which) country suffix they carry
self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
# language aliases: 'iw' -> 'he' is expected with no alias dict passed,
# while 'he' -> 'iw' is only exercised with `aliases` supplied
# NOTE(review): presumably 'iw' -> 'he' relies on a mapping inside utils - confirm
self.assertEqual(utils.match_language('iw', ['he']), 'he')
self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
class TestHTMLTextExtractor(SearxTestCase):