Merge pull request #2241 from dalf/move-extract-text-and-url

Move the extract_text and extract_url functions to searx.utils
This commit is contained in:
Alexandre Flament 2020-10-04 09:06:20 +02:00 committed by GitHub
commit b728cb610b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
39 changed files with 302 additions and 149 deletions

View file

@ -1,7 +1,6 @@
from urllib.parse import quote, urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
from searx.utils import extract_text, get_torrent_size
url = 'https://1337x.to/'

View file

@ -11,8 +11,7 @@
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size, int_or_zero
from searx.utils import extract_text, get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'images', 'videos', 'music']

View file

@ -11,7 +11,7 @@
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config

View file

@ -13,7 +13,7 @@
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['it']

View file

@ -17,8 +17,7 @@ import re
from urllib.parse import urlencode
from lxml import html
from searx import logger, utils
from searx.engines.xpath import extract_text
from searx.utils import match_language, gen_useragent, eval_xpath
from searx.utils import extract_text, match_language, gen_useragent, eval_xpath
logger = logger.getChild('bing engine')

View file

@ -13,8 +13,7 @@
from lxml import html
from operator import itemgetter
from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
from searx.utils import extract_text, get_torrent_size
# engine dependent config
categories = ['videos', 'music', 'files']

View file

@ -15,7 +15,7 @@
from lxml import html
import re
from urllib.parse import urlencode
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config

View file

@ -12,8 +12,7 @@
from urllib.parse import urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
from searx.utils import extract_text, get_torrent_size
categories = ['videos', 'music', 'files']

View file

@ -11,8 +11,7 @@
from urllib.parse import urlencode
from lxml.html import fromstring
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.utils import extract_text, eval_xpath
# engine dependent config
categories = ['general'] # TODO , 'images', 'music', 'videos', 'files'

View file

@ -16,9 +16,8 @@
from lxml.html import fromstring
from json import loads
from urllib.parse import urlencode
from searx.engines.xpath import extract_text
from searx.poolrequests import get
from searx.utils import match_language, eval_xpath
from searx.utils import extract_text, match_language, eval_xpath
# engine dependent config
categories = ['general']

View file

@ -13,9 +13,8 @@ import json
from urllib.parse import urlencode
from lxml import html
from re import compile
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
from searx.utils import html_to_text, match_language
from searx.utils import extract_text, html_to_text, match_language
url = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'

View file

@ -15,12 +15,12 @@
from json import loads
from urllib.parse import urlencode
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import (
_fetch_supported_languages, supported_languages_url,
get_region_code, language_aliases
)
from searx.poolrequests import get
from searx.utils import extract_text
# engine dependent config
categories = ['images']

View file

@ -11,8 +11,7 @@
from lxml import html, etree
import re
from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.utils import extract_text, eval_xpath
from searx import logger
categories = ['general']

View file

@ -11,8 +11,7 @@
from lxml import html
from urllib.parse import quote
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.utils import extract_text, eval_xpath
categories = ['general']
paging = False

View file

@ -11,7 +11,7 @@
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['files']

View file

@ -13,7 +13,7 @@
from html import escape
from urllib.parse import urljoin, urlencode
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['it']

View file

@ -13,7 +13,7 @@
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['it']

View file

@ -21,9 +21,8 @@ Definitions`_.
from urllib.parse import urlencode, urlparse
from lxml import html
from flask_babel import gettext
from searx.engines.xpath import extract_text
from searx import logger
from searx.utils import match_language, eval_xpath
from searx.utils import match_language, extract_text, eval_xpath
logger = logger.getChild('google engine')

View file

@ -28,8 +28,7 @@ from urllib.parse import urlencode, urlparse, unquote
from lxml import html
from flask_babel import gettext
from searx import logger
from searx.utils import eval_xpath
from searx.engines.xpath import extract_text
from searx.utils import extract_text, eval_xpath
# pylint: disable=unused-import
from searx.engines.google import (

View file

@ -14,7 +14,7 @@ from datetime import date, timedelta
from json import loads
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
import re
# engine dependent config

View file

@ -16,7 +16,7 @@ from urllib.parse import urlencode
from lxml import html
from dateutil import parser
from html.parser import HTMLParser
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config

View file

@ -13,8 +13,7 @@
from lxml import html
from operator import itemgetter
from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size, convert_str_to_int
from searx.utils import extract_text, get_torrent_size, convert_str_to_int
# engine dependent config
categories = ['videos', 'music', 'files']

View file

@ -11,8 +11,7 @@
from lxml import html
from urllib.parse import urlencode
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size, int_or_zero
from searx.utils import extract_text, get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'images', 'videos', 'music']

View file

@ -13,8 +13,7 @@ from datetime import datetime
from operator import itemgetter
from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
from searx.utils import extract_text, get_torrent_size
# engine dependent config
categories = ["videos", "music", "files"]

View file

@ -12,7 +12,7 @@ from lxml import html
from json import loads
from operator import itemgetter
from urllib.parse import quote, urljoin
from searx.engines.xpath import extract_text
from searx.utils import extract_text
url = 'https://seedpeer.me/'

View file

@ -12,7 +12,7 @@
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['it']

View file

@ -17,9 +17,8 @@ import re
from unicodedata import normalize, combining
from babel import Locale
from babel.localedata import locale_identifiers
from searx.engines.xpath import extract_text
from searx.languages import language_codes
from searx.utils import eval_xpath, match_language
from searx.utils import extract_text, eval_xpath, match_language
# engine dependent config
categories = ['general']

View file

@ -13,9 +13,8 @@
import re
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text
from datetime import datetime
from searx.utils import get_torrent_size, int_or_zero
from searx.utils import extract_text, get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'videos', 'music']

View file

@ -15,8 +15,7 @@ import re
from urllib.parse import urlencode
from lxml import html
from datetime import datetime
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
from searx.utils import extract_text, get_torrent_size
# engine dependent config
categories = ['files', 'videos', 'music']

View file

@ -15,7 +15,7 @@
from urllib.parse import urlencode, urljoin
from lxml import html
from datetime import datetime
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['social media']

View file

@ -13,9 +13,8 @@
from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.utils import match_language, eval_xpath
from searx.utils import extract_text, match_language, eval_xpath
from urllib.parse import urlencode
from json import loads

View file

@ -12,7 +12,7 @@
from lxml import html
from urllib.parse import urlencode, urljoin
from searx.engines.xpath import extract_text
from searx.utils import extract_text
# engine dependent config
categories = ['images']

View file

@ -1,7 +1,6 @@
from urllib.parse import unquote, urlencode, urljoin, urlparse
from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text, eval_xpath
from urllib.parse import urlencode
from searx.utils import extract_text, extract_url, eval_xpath
search_url = None
url_xpath = None
@ -21,76 +20,6 @@ page_size = 1
first_page_num = 1
'''
if xpath_results is list, extract the text from each result and concat the list
if xpath_results is a xml element, extract all the text node from it
( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''
def extract_text(xpath_results):
    """Return the text content of an lxml XPath result.

    * list: extract the text of each item (recursive call) and
      concatenate, stripping the surrounding whitespace
    * XPath string result (``_ElementStringResult`` /
      ``_ElementUnicodeResult``): already text, returned as-is
    * otherwise assumed to be an element: serialize every text node
      (lxml ``method='text'``) and collapse runs of whitespace
    """
    if isinstance(xpath_results, list):
        # it's list of result : concat everything using recursive call
        return ''.join(extract_text(e) for e in xpath_results).strip()
    if isinstance(xpath_results, (_ElementStringResult, _ElementUnicodeResult)):
        # it's a string
        return ''.join(xpath_results)
    # it's a element
    text = html.tostring(
        xpath_results, encoding='unicode', method='text', with_tail=False
    )
    # split() handles every whitespace run (including '\n'), so a single
    # join normalizes the text exactly as strip()+replace() did
    return ' '.join(text.split())
def extract_url(xpath_results, search_url):
    """Extract an URL from *xpath_results* and resolve it against
    *search_url* when it is protocol-relative or path-relative,
    returning the normalized absolute URL.

    Raises Exception when the result set is empty.
    """
    if xpath_results == []:
        raise Exception('Empty url resultset')

    url = extract_text(xpath_results)

    if url.startswith('//'):
        # protocol-relative (//example.com/): inherit the search url scheme
        scheme = urlparse(search_url).scheme or 'http'
        url = '{0}:{1}'.format(scheme, url)
    elif url.startswith('/'):
        # path relative to the search engine root
        url = urljoin(search_url, url)

    # anything still lacking a scheme separator fell through the
    # cracks above: resolve it relative to the search url as well
    if '://' not in url:
        url = urljoin(search_url, url)

    return normalize_url(url)
def normalize_url(url):
    """Normalize *url*: ensure it has a netloc and a path, and unwrap
    Yahoo's ``/r/.../**<encoded-url>`` redirect wrapper.

    Raises Exception when the URL has no network location.
    """
    parsed_url = urlparse(url)

    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    # add a / at this end of the url if there is no path
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
            and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            # BUG FIX: unquote() already returns str in Python 3;
            # the old ``.decode()`` call raised AttributeError here
            return unquote(p[mark + 3:])

    return url
def request(query, params):
query = urlencode({'q': query})[2:]

View file

@ -13,8 +13,7 @@
from urllib.parse import unquote, urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.utils import match_language, eval_xpath
from searx.utils import extract_text, extract_url, match_language, eval_xpath
# engine dependent config
categories = ['general']

View file

@ -13,12 +13,11 @@ import re
from datetime import datetime, timedelta
from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import (
parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
)
from dateutil import parser
from searx.utils import match_language
from searx.utils import extract_text, extract_url, match_language
# engine dependent config
categories = ['news']

View file

@ -12,8 +12,7 @@ from lxml import html
from operator import itemgetter
from datetime import datetime
from urllib.parse import quote
from searx.engines.xpath import extract_text
from searx.utils import get_torrent_size
from searx.utils import extract_text, get_torrent_size
from searx.poolrequests import get as http_get
# engine dependent config

View file

@ -11,8 +11,7 @@
from functools import reduce
from json import loads
from urllib.parse import quote_plus
from searx.engines.xpath import extract_text
from searx.utils import list_get
from searx.utils import extract_text, list_get
# engine dependent config
categories = ['videos', 'music']