mirror of
https://github.com/searxng/searxng.git
synced 2025-07-13 00:09:18 +02:00
js_variable_to_python: add tests, handle more JS syntax
The tests from chompjs are copied. The commented-out tests do not pass. The implementation of js_variable_to_python has been updated: * in the main loop, try to make the four different cases clearer * handle decimal numbers like "-.5", "5." or "- 5" (without the double quotes) * the character ` is treated as a string delimiter, as intended in JS * the identifiers follow the JS specification ($, _, letters and numbers)
This commit is contained in:
parent
ec540a967a
commit
72f5e7cfb8
3 changed files with 392 additions and 46 deletions
154
searx/utils.py
154
searx/utils.py
|
@ -38,9 +38,14 @@ _BLOCKED_TAGS = ('script', 'style')
|
|||
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
|
||||
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
|
||||
|
||||
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
|
||||
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
|
||||
_JS_DECIMAL_RE = re.compile(r":\s*\.")
|
||||
# Regular expressions and constants used to convert a JavaScript literal into JSON.

# string delimiters of JS: double quote, simple quote and backtick
_JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
# unquoted object key: "{", "," or a whitespace, then a run of $, _, letters
# and digits, then ":"
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
# "void 0", "void(0)" and "undefined" (all of them are replaced by null).
# The lookarounds make sure the keyword is not a fragment of a longer
# identifier: without them the key "undefined_count" would become "null_count".
_JS_VOID_OR_UNDEFINED_RE = re.compile(r'(?<![\w$])(?:void\s+[0-9]+|void\s*\([0-9]+\)|undefined(?![\w$]))')
# number with a decimal point right after "[", "," or ":"; the minus sign and
# the digits on each side of the point are optional ("-.5", "5.", "- 1_000.5")
_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
# integer right after "[", "," or ":", possibly written with "_" separators
_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
# trailing comma just before the end of a list or an object
_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
# backslash followed by any character.
# re.DOTALL is required so that a backslash followed by a newline (JS line
# continuation inside a string) is matched too: "." alone never matches "\n".
_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)', re.DOTALL)
# characters that keep their leading backslash when a JS string is converted
# to a JSON string
_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
|
||||
|
||||
_STORAGE_UNIT_VALUE: Dict[str, int] = {
|
||||
'TB': 1024 * 1024 * 1024 * 1024,
|
||||
|
@ -652,12 +657,45 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
|
|||
return None
|
||||
|
||||
|
||||
def _j2p_process_escape(match):
    """Convert one JavaScript string escape sequence into its JSON form.

    Replacement callback for ``_JS_STRING_ESCAPE_RE.sub()``.  That pattern has
    a single capturing group: the character which follows the backslash.

    :param match: match object, group 1 is the escaped character
    :return: the text that replaces the backslash and the escaped character
    """
    # only group 1 exists in _JS_STRING_ESCAPE_RE and it always holds exactly
    # one character, so no fallback on another group is needed
    escape = match.group(1)

    if escape in _JSON_PASSTHROUGH_ESCAPES:
        # the escape sequence is valid in JSON too: keep the backslash
        return Rf'\{escape}'
    if escape == 'x':
        # JS "\xHH" becomes JSON "\u00HH": the two hexadecimal digits are not
        # part of the match, they stay in place right after the replacement
        return R'\u00'
    if escape == 'v':
        # JS vertical tab: JSON has no "\v", use the unicode escape instead
        return R'\u000b'
    if escape == '\n':
        # backslash + newline is a line continuation: it produces no character
        return ''
    # any other character stands for itself in JS, for example \' or \/
    return escape
|
||||
|
||||
|
||||
def _j2p_decimal(match):
    """Rewrite a JS number with a decimal point into a valid JSON number.

    Replacement callback for ``_JS_DECIMAL_RE.sub()``.  The match provides
    four groups: the leading delimiter, the optional minus sign, the digits
    before the point and the digits after the point.

    The ``_`` separators are removed and an empty side of the point is
    replaced by ``0``, since JSON requires digits on both sides.

    :param match: match object with the four groups described above
    :return: delimiter + sign + normalized number
    """
    delimiter, sign, integer_part, fraction_part = match.group(1, 2, 3, 4)
    integer_digits = integer_part.replace("_", "") or "0"
    fraction_digits = fraction_part.replace("_", "") or "0"
    return f"{delimiter}{sign}{integer_digits}.{fraction_digits}"
|
||||
|
||||
|
||||
def _j2p_decimal2(match):
    """Rewrite a JS integer into a valid JSON integer.

    Replacement callback for ``_JS_DECIMAL2_RE.sub()``.  The match provides
    three groups: the leading delimiter, the optional minus sign and the
    digits (which may contain ``_`` separators).  The whitespaces between the
    groups are dropped and the ``_`` separators are removed.

    :param match: match object with the three groups described above
    :return: delimiter + sign + digits without separator
    """
    delimiter, sign, digits = match.group(1, 2, 3)
    return "".join((delimiter, sign, digits.replace("_", "")))
|
||||
|
||||
|
||||
def js_variable_to_python(js_variable):
|
||||
"""Convert a javascript variable into JSON and then load the value
|
||||
|
||||
It does not deal with all cases, but it is good enough for now.
|
||||
chompjs has a better implementation.
|
||||
"""
|
||||
if not isinstance(js_variable, str):
|
||||
raise ValueError("js_variable must be of type str")
|
||||
if js_variable == "":
|
||||
raise ValueError("js_variable can't be an empty string")
|
||||
|
||||
# when in_string is not None, it contains the character that has opened the string
|
||||
# either simple quote or double quote
|
||||
in_string = None
|
||||
|
@ -665,49 +703,68 @@ def js_variable_to_python(js_variable):
|
|||
# r"""{ a:"f\"irst", c:'sec"ond'}"""
|
||||
# becomes
|
||||
# ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
|
||||
parts = re.split(r'(["\'])', js_variable)
|
||||
# previous part (to check the escape character antislash)
|
||||
previous_p = ""
|
||||
parts = _JS_STRING_DELIMITERS.split(js_variable)
|
||||
# does the previous part ends with a backslash?
|
||||
blackslash_just_before = False
|
||||
for i, p in enumerate(parts):
|
||||
# parse characters inside a ECMA string
|
||||
if in_string:
|
||||
# we are in a JS string: replace the colon by a temporary character
|
||||
# so quote_keys_regex doesn't have to deal with colon inside the JS strings
|
||||
parts[i] = parts[i].replace(':', chr(1))
|
||||
if in_string == "'":
|
||||
# the JS string is delimited by simple quote.
|
||||
# This is not supported by JSON.
|
||||
# simple quote delimited string are converted to double quote delimited string
|
||||
# here, inside a JS string, we escape the double quote
|
||||
parts[i] = parts[i].replace('"', r'\"')
|
||||
|
||||
# deal with delimieters and escape character
|
||||
if not in_string and p in ('"', "'"):
|
||||
# we are not in string
|
||||
# but p is double or simple quote
|
||||
# that's the start of a new string
|
||||
# replace simple quote by double quote
|
||||
# (JSON doesn't support simple quote)
|
||||
parts[i] = '"'
|
||||
in_string = p
|
||||
continue
|
||||
if p == in_string:
|
||||
# we are in a string and the current part MAY close the string
|
||||
if len(previous_p) > 0 and previous_p[-1] == '\\':
|
||||
# there is an antislash just before: the ECMA string continue
|
||||
continue
|
||||
# the current p close the string
|
||||
# replace simple quote by double quote
|
||||
parts[i] = '"'
|
||||
if p == in_string and not blackslash_just_before:
|
||||
# * the current part matches the character which has opened the string
|
||||
# * there is no antislash just before
|
||||
# --> the current part close the current string
|
||||
in_string = None
|
||||
# replace simple quote and ` by double quote
|
||||
# since JSON supports only double quote for string
|
||||
parts[i] = '"'
|
||||
|
||||
if not in_string:
|
||||
# replace void 0 by null
|
||||
elif in_string:
|
||||
# --> we are in a JS string
|
||||
# replace the colon by a temporary character
|
||||
# so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings
|
||||
p = p.replace(':', chr(1))
|
||||
# replace JS escape sequences by JSON escape sequences
|
||||
p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
|
||||
# the JS string is delimited by simple quote.
|
||||
# This is not supported by JSON.
|
||||
# simple quote delimited string are converted to double quote delimited string
|
||||
# here, inside a JS string, we escape the double quote
|
||||
if in_string == "'":
|
||||
p = p.replace('"', r'\"')
|
||||
parts[i] = p
|
||||
# deal with the sequence blackslash then quote
|
||||
# since js_variable splits on quote, we detect this case:
|
||||
# * the previous part ends with a black slash
|
||||
# * the current part is a single quote
|
||||
# when detected the blackslash is removed on the previous part
|
||||
if blackslash_just_before and p[:1] == "'":
|
||||
parts[i - 1] = parts[i - 1][:-1]
|
||||
|
||||
elif in_string is None and p in ('"', "'", "`"):
|
||||
# we are not in string but p is string delimiter
|
||||
# --> that's the start of a new string
|
||||
in_string = p
|
||||
# replace simple quote by double quote
|
||||
# since JSON supports only double quote for string
|
||||
parts[i] = '"'
|
||||
|
||||
elif in_string is None:
|
||||
# we are not in a string
|
||||
# replace by null these values:
|
||||
# * void 0
|
||||
# * void(0)
|
||||
# * undefined
|
||||
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
|
||||
# we are sure there is no string in p
|
||||
parts[i] = _JS_VOID_RE.sub("null", p)
|
||||
# update previous_p
|
||||
previous_p = p
|
||||
p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
|
||||
# make sure there is a leading zero in front of float
|
||||
p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
|
||||
p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
|
||||
# remove extra coma in a list or an object
|
||||
# for example [1,2,3,] becomes [1,2,3]
|
||||
p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
|
||||
parts[i] = p
|
||||
|
||||
# update for the next iteration
|
||||
blackslash_just_before = len(p) > 0 and p[-1] == '\\'
|
||||
|
||||
# join the string
|
||||
s = ''.join(parts)
|
||||
# add quote arround the key
|
||||
|
@ -715,8 +772,13 @@ def js_variable_to_python(js_variable):
|
|||
# becomes
|
||||
# { "a": 12 }
|
||||
s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
|
||||
s = _JS_DECIMAL_RE.sub(":0.", s)
|
||||
# replace the surogate character by colon
|
||||
s = s.replace(chr(1), ':')
|
||||
# replace the surogate character by colon and strip whitespaces
|
||||
s = s.replace(chr(1), ':').strip()
|
||||
# load the JSON and return the result
|
||||
return json.loads(s)
|
||||
if s == "":
|
||||
raise ValueError("js_variable can't be an empty string")
|
||||
try:
|
||||
return json.loads(s)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug("Internal error: js_variable_to_python creates invalid JSON:\n%s", s)
|
||||
raise ValueError("js_variable_to_python creates invalid JSON") from e
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue