mirror of
https://github.com/searxng/searxng.git
synced 2025-07-12 07:49:22 +02:00
Replace langdetect with fasttext
This commit is contained in:
parent
a6d870d5cf
commit
1f8f8c1e91
6 changed files with 151 additions and 28 deletions
|
@ -15,6 +15,7 @@ from os.path import splitext, join
|
|||
from random import choice
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import fasttext
|
||||
|
||||
from lxml import html
|
||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
||||
|
@ -22,7 +23,7 @@ from babel.core import get_global
|
|||
|
||||
|
||||
from searx import settings
|
||||
from searx.data import USER_AGENTS
|
||||
from searx.data import USER_AGENTS, data_dir
|
||||
from searx.version import VERSION_TAG
|
||||
from searx.languages import language_codes
|
||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
||||
|
@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
|
|||
_XPATH_CACHE: Dict[str, XPath] = {}
|
||||
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
|
||||
"""fasttext model to predict language of a search term"""
|
||||
|
||||
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
|
||||
fasttext.FastText.eprint = lambda x: None
|
||||
|
||||
|
||||
class _NotSetClass: # pylint: disable=too-few-public-methods
|
||||
"""Internal class for this module, do not create instance of this class.
|
||||
|
@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
|
|||
# to record xpath_spec
|
||||
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
|
||||
return default
|
||||
|
||||
|
||||
def _get_fasttext_model() -> fasttext.FastText._FastText:
    """Return the module-wide fasttext model, loading it on the first call.

    The model file ``lid.176.ftz`` is read from ``data_dir`` once and the
    resulting object is kept in ``_FASTTEXT_MODEL`` for subsequent calls.
    """
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is not None:
        # Already loaded by an earlier call.
        return _FASTTEXT_MODEL
    model_path = data_dir / 'lid.176.ftz'
    _FASTTEXT_MODEL = fasttext.load_model(str(model_path))
    return _FASTTEXT_MODEL
|
||||
|
||||
|
||||
def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
    """Predict the language of ``text`` using the fasttext model.

    See https://fasttext.cc/docs/en/language-identification.html

    :param text: text to classify. Newlines are replaced by spaces before the
        text is handed to the model.
    :param threshold: forwarded unchanged to the model's ``predict`` call.
    :param min_probability: the best prediction is only returned when its
        probability is strictly greater than this value.
    :return: the best label with its ``__label__`` prefix removed, or ``None``
        when the model yields no prediction or the probability is too low.
    :raises ValueError: if ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError('text must be a str')

    # Only the single best prediction is requested (k=1).
    prediction = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)

    # Defensive shape check: expect a (labels, probabilities) pair.
    if not isinstance(prediction, tuple) or len(prediction) != 2:
        return None
    labels, probabilities = prediction
    if len(labels) == 0 or len(probabilities) == 0:
        return None

    if probabilities[0] > min_probability:
        # Labels look like '__label__<code>'; keep what follows the prefix.
        return labels[0].split('__label__')[1]
    return None
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue