Skip to content

Commit

Permalink
optimize SequenceExpression feature extraction
Browse files Browse the repository at this point in the history
* fix: searching operation def list (#58)

* bump versions

* fix(stxhash): optimize for SequenceExpression

* bump version

---------

Co-authored-by: JamzumSum <[email protected]>
  • Loading branch information
github-actions[bot] and JamzumSum authored Nov 1, 2024
1 parent 4fe6e16 commit 5202a4b
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 170 deletions.
282 changes: 133 additions & 149 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pychaosvm"
version = "0.4.0"
version = "0.4.1"
description = "A Python environment for Tencent ChaosVM."
authors = ["aioqzone <[email protected]>"]
license = "AGPL-3.0-or-later"
Expand All @@ -14,7 +14,7 @@ typing-extensions = ">=4.6.0"
lxml = "^5.3.0"

[tool.poetry.group.test.dependencies]
pytest = "^7.4.2"
pytest = "^8.2.1"

[tool.poetry.group.dev]
optional = true
Expand Down
44 changes: 37 additions & 7 deletions src/chaosvm/parse.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from base64 import b64decode
from collections import defaultdict
from hashlib import md5
from typing import Any, Callable, Dict, Iterable, List, Union
from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
from urllib.parse import unquote

import pyjsparser as jsparser
Expand All @@ -19,6 +19,17 @@ def path_get(d: Union[dict, list], *path: Union[str, int]) -> Any:
return o


def path_get_default(d: Union[dict, list], *path: Union[str, int], default=None) -> Any:
    """Walk nested containers along ``path``, returning ``default`` on a dead end.

    :param d: the root container (a dict or a list).
    :param path: successive dict keys / list indexes to follow from ``d``.
    :param default: value returned as soon as one step cannot be resolved.
    :return: the object found at the end of ``path``, or ``default``.
    """
    o = d
    for i in path:
        # Membership is tested first so that dict subclasses with a
        # ``__missing__`` hook (e.g. defaultdict) are never mutated by a lookup.
        if isinstance(o, dict) and i not in o:
            return default
        try:
            o = o[i]  # type: ignore
        except (KeyError, IndexError, TypeError):
            # IndexError: list index out of range (either direction).
            # TypeError: the current value is not subscriptable (e.g. a ``None``
            # placeholder in the tree) or the key kind does not fit the container.
            return default
    return o


def first(pred: Callable, it: Iterable):
    """Return the first element of ``it`` accepted by ``pred``.

    ``StopIteration`` propagates to the caller when no element matches.
    """
    matches = filter(pred, it)
    return next(matches)

Expand Down Expand Up @@ -64,16 +75,35 @@ def parse_vm(vm_js: str, window: Window):
return stack


def _init_collector(stmt_type, *init_path):
    """Build a parser collecting initializers from statements of ``stmt_type``.

    The returned callable scans a sequence of statement nodes, keeps those whose
    ``"type"`` equals ``stmt_type`` and looks up ``init_path`` inside each of them
    with :func:`path_get_default`; only truthy results are collected.
    """

    def collect(dcl_content):
        found = []
        for stmt in dcl_content:
            if stmt["type"] != stmt_type:
                continue
            init = path_get_default(stmt, *init_path)
            if init:
                found.append(init)
        return found

    return collect


# One parser per known script layout; the position in this list is meaningful
# to code that enumerates it to derive a version number.
declare_parsers = [
    # initializers declared by plain ``var`` statements
    _init_collector("VariableDeclaration", "declarations", 0, "init"),
    # initializers declared in the init clause of a ``for`` statement
    _init_collector("ForStatement", "init", "declarations", 0, "init"),
]


def try_get_declare_contents(vm_declare: dict) -> Tuple[int, List[dict]]:
    """Run each entry of ``declare_parsers`` until one yields a non-empty result.

    :param vm_declare: the statement nodes handed to every parser. The parsers
        iterate over it, so despite the annotation this is a sequence of nodes.
    :return: ``(version, declarations)`` where ``version`` is the index of the
        parser that succeeded and ``declarations`` is what it produced.
    :raises NotImplementedError: if every parser returns an empty result.
    """
    for version, parser in enumerate(declare_parsers):
        declarations = parser(vm_declare)
        if declarations:
            return version, declarations
    raise NotImplementedError("This version is not tested...")


def parse_opcode_mapping(vm_declare: dict) -> Dict[int, int]:
"""Parse operation-code mapping."""
params = vm_declare["params"]
G = {i["name"]: k for i, k in zip(params, ["p", "P", "window", "S"])}
dcl_content = path_get(vm_declare, "body", "body")
declares = [
i
for d in dcl_content
if d["type"] == "VariableDeclaration" and (i := path_get(d, "declarations", 0, "init"))
]

version, declares = try_get_declare_contents(path_get(vm_declare, "body", "body"))
op_def_list = first(lambda i: i["type"] == "ArrayExpression", declares)["elements"]

d: Dict[int, int] = {}
Expand Down
22 changes: 15 additions & 7 deletions src/chaosvm/stxhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,25 @@


def _syntax_hash(node: dict, context: defaultdict, d=";"):
def _variable_declarator() -> str:
id_hash = syntax_hash(node["id"], context)
if node["init"]:
if node["init"]["type"] == "SequenceExpression":
return (
syntax_hash(node["init"]["expressions"][:-1], context)
+ d
+ f"{id_hash}={syntax_hash(node['init']['expressions'][-1], context)}"
)
return f"{id_hash}={syntax_hash(node['init'], context)}"
return ""

cases = dict(
Literal=lambda: defaultdict(lambda: repr(literal_eval(c)), dict(null="null"))[
(c := node["raw"])
],
Identifier=lambda: context[c] if len(c := node["name"]) == 1 else c,
VariableDeclaration=lambda: f"{node['kind']} {syntax_hash(node['declarations'], context, ',')}",
VariableDeclarator=lambda: (
f"{syntax_hash(node['id'], context)}={syntax_hash(node['init'], context)}"
if node["init"]
else syntax_hash(node["id"], context)
),
VariableDeclaration=lambda: f"{syntax_hash(node['declarations'], context)}",
VariableDeclarator=_variable_declarator,
AssignmentExpression=lambda: f"{syntax_hash(node['left'], context)}"
f"{node['operator']}{syntax_hash(node['right'], context)}",
UnaryExpression=lambda: f"{node['operator']}{syntax_hash(node['argument'], context)}",
Expand All @@ -27,7 +35,7 @@ def _syntax_hash(node: dict, context: defaultdict, d=";"):
MemberExpression=lambda: f"{syntax_hash(node['object'], context)}"
f"[{syntax_hash(node['property'], context)}]",
ExpressionStatement=lambda: syntax_hash(node["expression"], context),
SequenceExpression=lambda: syntax_hash(node["expressions"], context, ","),
SequenceExpression=lambda: syntax_hash(node["expressions"], context),
ForStatement=lambda: "for",
ForInStatement=lambda: "for in",
ConditionalExpression=lambda: f"{syntax_hash(node['test'], context)}?"
Expand Down
3 changes: 1 addition & 2 deletions src/chaosvm/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
if TYPE_CHECKING:
from .proxy.dom import Window


# If the syntax feature extractor is updated, only the MD5 hashes below need to be updated.
# fmt: off
OP_FEATS = ('5ceb04a17d2ccd243a3cd8d43d58412f','2c64a078cb8c4b856fdc70a609852c84','22baa62b15474dc170105ea16907be4f','a0d2ef60799df6195af8233faf1d4405','821662fd6eed2bc7baf4ec9cf305ed3d','86bfa469c728aef498dc0b31acca50d5','a171259d3583f1d528c527cca37181c6','2e457be74b78687bda17467657427c44','36daeb76f0369182d47bc0854cd62f3e','7861d746f3115dc52985788bad85f9f4','d5582f0d77825e3dd4b5de1b58c4367c','cad016c2b4b99c28c26ab19975ee0ed9','85aeeab3938f54b19b45f3e95802c185','46be5ad0b74da7c1025e229ee1b86443','ba98404956c3877209b59858a84090e9','f117180b06547c4efbcb2bd2b2164849','19d1047281ae4901d0e08885458ceb5a','e6803eb42dc05fc3e04283902865287c','0f935762ce5225379c0f4b8b20698026','854175af0e5ea31a14afd3b34a8faa80','2732918292df330ac7462015dff8969c','d378d1594b18890e237b5d472818e309','26df6ca6775d9d0d1b524e4fe7ef1d51','35bbb1a74b0380e46a199abe999bf303','a8ed98953190027b3dad5ccb0f3f73be','c2b8e8732ecf925e116f1017a4fcfebf','acaa0c50323b6fd6e8b9b9395f4ad30b','9557e2616caac44899f6612e32fa5cd2','9a3f40351dbad181dc027c596f23df4c','021111bd795ea2b9b7e44275fcda3fe5','728702d0440f2d3a5c425d736fd6b2a6','dacd0c2abe15333ad9d5aaf9e550da71','7211294be669b58b0f3da4940a35dcce','fb632ca1b5f01438ecbb31c2560a78d8','a14cc4c1bd40951d1052c2c4c8353d13','18f2d14a9d67ef3504777a3be8ff7532','ac70343d82c97644522ed31a98649989','e41fa5e46c2d94d4d7b54437e71f5862','9c7676e1872be2fb9bf02aaefa78e066','a9e27183565a9854cf6e593b2572beec','4509710e44dc7c0bae5b39ee74b188c5','57270c2716f715468eaf0429965cf123','61663d46238a47351f4ff7e24326360c','3b20fb198a1f87da243bf27aadb19805','9c28d03d5a01e0360e830168b47ec0da','2d1bb184a9a54c223b38ac23340bdd23','1691f2ef2945d750f686ceefda8ee5be','a7c235198def717b198ceb39d993ede9','80db3dff6284dfb62b88c7629af22afd','d2d4c0d054580286a463d79d0881644a','e66f61b8e3792cb44c2ae0be71173d45','0bbd3879b0867fa76722b7ca001cb338','96d30e9496fccd6a9ddcf45a35316e45','af29f37ff067adb9398e5b9b42b8f7b7','50cd82d43ac8eaa4ff4017509272f65b','c00fc6652cacebbf04dc3958a058150c','2598bc9255deafbb48adf287d5d
3b12a','13274e03e106918b096bc5fd4c5423ba')
OP_FEATS = ('2a6add49d812b23c1518c5b129b945f8', '2c64a078cb8c4b856fdc70a609852c84', '22baa62b15474dc170105ea16907be4f', 'a0d2ef60799df6195af8233faf1d4405', '821662fd6eed2bc7baf4ec9cf305ed3d', '86bfa469c728aef498dc0b31acca50d5', 'a171259d3583f1d528c527cca37181c6', '2e457be74b78687bda17467657427c44', '36daeb76f0369182d47bc0854cd62f3e', '7861d746f3115dc52985788bad85f9f4', 'd5582f0d77825e3dd4b5de1b58c4367c', 'cad016c2b4b99c28c26ab19975ee0ed9', '85aeeab3938f54b19b45f3e95802c185', '46be5ad0b74da7c1025e229ee1b86443', 'e6b094bc7d2c7092193723e0765c5848', 'a3dff4b562fdd065d4deace770c3d90e', '19d1047281ae4901d0e08885458ceb5a', 'e6803eb42dc05fc3e04283902865287c', '830b7a2b9fc1439b0358a00b20f229b2', '854175af0e5ea31a14afd3b34a8faa80', '2732918292df330ac7462015dff8969c', 'd378d1594b18890e237b5d472818e309', 'c0d603b7a66da4e8d47907a2c11d4e7a', '35bbb1a74b0380e46a199abe999bf303', '517f0463eba56f88b271acfbdda3c2f7', 'c2b8e8732ecf925e116f1017a4fcfebf', 'acaa0c50323b6fd6e8b9b9395f4ad30b', '9557e2616caac44899f6612e32fa5cd2', '9a3f40351dbad181dc027c596f23df4c', '021111bd795ea2b9b7e44275fcda3fe5', '2e98536e65b245063c106e192a0fe58e', 'dacd0c2abe15333ad9d5aaf9e550da71', '7211294be669b58b0f3da4940a35dcce', 'fb632ca1b5f01438ecbb31c2560a78d8', 'a14cc4c1bd40951d1052c2c4c8353d13', '1cc5222162165718596063a8d52e9df0', 'ac70343d82c97644522ed31a98649989', 'e41fa5e46c2d94d4d7b54437e71f5862', '9c7676e1872be2fb9bf02aaefa78e066', 'a62dbee713cff9689e8235aac0cb553e', '4509710e44dc7c0bae5b39ee74b188c5', '57270c2716f715468eaf0429965cf123', '61663d46238a47351f4ff7e24326360c', '3b20fb198a1f87da243bf27aadb19805', '9c28d03d5a01e0360e830168b47ec0da', 'f214f2e0367778d237dcfcbfbd0a7c91', '42e24082bd8f6b2d3e1e6c5dfb01f6f3', 'a7c235198def717b198ceb39d993ede9', '80db3dff6284dfb62b88c7629af22afd', 'd2d4c0d054580286a463d79d0881644a', 'e66f61b8e3792cb44c2ae0be71173d45', '0bbd3879b0867fa76722b7ca001cb338', '09a2a080abcdc4d360f6bc5bd05d8639', 'af29f37ff067adb9398e5b9b42b8f7b7', '65dbf5af2b9be42a9dbec72a06357f50', 
'c00fc6652cacebbf04dc3958a058150c', '2598bc9255deafbb48adf287d5d3b12a', '13274e03e106918b096bc5fd4c5423ba')
# fmt: on


Expand Down
40 changes: 40 additions & 0 deletions tests/test_feat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from collections import defaultdict

from pyjsparser import parse

from chaosvm.stxhash import syntax_hash


def test_tuple():
    """Two differently written JS snippets must yield the same syntax feature.

    The first snippet performs its work inside a parenthesised sequence
    expression used as a ``var`` initializer; the second spells the same steps
    out as separate statements with different identifier names.
    """
    globals_a = {"C": "p", "y": "P", "Q": "window", "H": "S"}
    globals_b = {"k": "p", "B": "P", "Q": "window", "Y": "S"}
    source_a = """
var B = y[C++]
, g = B ? H.slice(-B) : []
, B = (H.length -= B,
g.unshift(null),
H.pop());
H.push(A(B[0][B[1]], g))
"""
    source_b = """
var R = B[k++]
, F = R ? Y.slice(-R) : [];
Y.length -= R,
F.unshift(null);
R = Y.pop();
Y.push(A(R[0][R[1]], F))
"""

    def make_context(known):
        # Names missing from ``known`` are numbered t0, t1, ... in order of
        # first lookup; the offset 4 is the number of pre-seeded names.
        ctx = defaultdict(lambda: f"t{len(ctx)-4}", known)
        return ctx

    tree_a = parse(source_a)
    tree_b = parse(source_b)

    assert isinstance(tree_a, dict)
    assert isinstance(tree_b, dict)

    feature_a = syntax_hash(tree_a["body"], make_context(globals_a))
    feature_b = syntax_hash(tree_b["body"], make_context(globals_b))

    assert feature_a
    assert feature_a == feature_b
12 changes: 9 additions & 3 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@
from pytest import fixture


@fixture(scope="module")
def vmjs() -> str:
with get("https://t.captcha.qq.com/tdc.js?app_data=7124050803564679168&t=636313065") as r:
@fixture(
    scope="module",
    params=[
        "https://t.captcha.qq.com/tdc.js?app_data=7124050803564679168&t=636313065",
        "https://turing.captcha.qcloud.com/tdc.js?app_data=7256590633187913728",
    ],
)
def vmjs(request) -> str:
    """Module-scoped fixture yielding the script text behind each parametrized URL.

    The URL in ``request.param`` is opened with ``get``; the response body is
    passed through ``decompress`` and decoded to ``str``.
    """
    with get(request.param) as response:
        payload = response.read()
        return decompress(payload).decode()


Expand Down

0 comments on commit 5202a4b

Please sign in to comment.