[mod] move extract_text, extract_url to searx.utils

This commit is contained in:
Alexandre Flament 2020-10-02 18:13:56 +02:00
parent ecb9f28869
commit 2006eb4680
39 changed files with 156 additions and 137 deletions

View file

@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
import lxml.etree
from lxml import html
from searx.testing import SearxTestCase
from searx import utils
@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
self.assertTrue(utils.searx_useragent().startswith('searx'))
def test_html_to_text(self):
    """Check that utils.html_to_text reduces an HTML fragment to its text.

    The fixture is a link containing a <style> block, nested spans around
    an <img>, a span holding the text and a <script> block. The expected
    result is the bare string "Test text", returned as a ``str``.
    """
    # Named html_str (not html) so the lxml ``html`` module imported at the
    # top of the file is not shadowed inside this test.
    html_str = """
<a href="/testlink" class="link_access_account">
<style>
.toto {
color: red;
}
</style>
<span class="toto">
<span>
<img src="test.jpg" />
</span>
</span>
<span class="titi">
Test text
</span>
<script>value='dummy';</script>
</a>
"""
    text = utils.html_to_text(html_str)
    self.assertIsInstance(text, str)
    self.assertIsNotNone(text)
    self.assertEqual(text, "Test text")
# Exercises utils.extract_text with a parsed lxml element, with XPath result
# lists (elements, an attribute value, an empty result) and compares each
# outcome with the expected plain string.
def test_extract_text(self):
html_str = """
<a href="/testlink" class="link_access_account">
<span class="toto">
<span>
@ -28,9 +54,24 @ class TestUtils(SearxTestCase):
</span>
</a>
"""
# NOTE(review): the "@ -28,9 +54,24 @" line inside the fixture above looks
# like a diff hunk marker, so part of the fixture (presumably the <img> and
# the "Test text" span that the assertions below rely on) is not shown here.
# Confirm against the real file.
# NOTE(review): the next three assertions pass `html`, which in this file is
# the module imported from lxml, not a fixture string; they look like the
# pre-rename side of the diff. Confirm they are not part of the final method.
self.assertIsInstance(utils.html_to_text(html), str)
self.assertIsNotNone(utils.html_to_text(html))
self.assertEqual(utils.html_to_text(html), "Test text")
# Parse the fixture once and reuse the tree for every extract_text check.
dom = html.fromstring(html_str)
self.assertEqual(utils.extract_text(dom), 'Test text')
self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
# An attribute XPath yields the attribute value rather than an element.
self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
# An XPath with no match is expected to give an empty string.
self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
def test_extract_url(self):
    """Check utils.extract_url against absolute, scheme-less and relative inputs.

    Each case parses a small fragment with lxml and resolves it against a
    search URL. The expected values show that an absolute URL is kept, a
    ``//host`` URL takes the scheme of the search URL, and a ``/path`` URL
    is joined onto the search URL. An empty fragment must fail in the lxml
    parser, and an empty result list must make extract_url raise.
    """
    def resolve(fragment, base_url):
        parsed = html.fromstring(fragment)
        return utils.extract_url(parsed, base_url)

    # (fragment, search URL, expected URL), checked in this order.
    expectations = (
        ('<span id="42">https://example.com</span>', 'http://example.com/', 'https://example.com/'),
        ('https://example.com', 'http://example.com/', 'https://example.com/'),
        ('//example.com', 'http://example.com/', 'http://example.com/'),
        ('//example.com', 'https://example.com/', 'https://example.com/'),
        ('/path?a=1', 'https://example.com', 'https://example.com/path?a=1'),
    )
    for fragment, base_url, wanted in expectations:
        self.assertEqual(resolve(fragment, base_url), wanted)

    # Parsing an empty document fails before extract_url is reached.
    with self.assertRaises(lxml.etree.ParserError):
        resolve('', 'https://example.com')

    # An empty XPath result list is rejected by extract_url itself.
    with self.assertRaises(Exception):
        utils.extract_url([], 'https://example.com')
def test_html_to_text_invalid(self):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'