Skip to content

Commit c3dcd1c

Browse files
authored
Update preprocess_data.py
1 parent 872d45a commit c3dcd1c

1 file changed

Lines changed: 80 additions & 0 deletions

File tree

preprocess_data.py

Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1 +1,81 @@
1+
import json
2+
import tiktoken
3+
from tqdm import tqdm
4+
5+
# Token budget: truncate_json_by_accumulation stops keeping entries once the
# running sum of per-entry token counts would exceed this value.
MAX_TOKENS = 50000
6+
7+
def count_tokens(text: str) -> int:
    """Return how many tokens *text* encodes to under ``cl100k_base``."""
    encoding_name = "cl100k_base"
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)
10+
11+
def _dig(mapping, *keys):
    """Follow *keys* through nested dicts; return None once a level is missing or not a dict."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def prune_fields(item: dict) -> dict:
    """Reduce one event record to the subset of fields kept in the output.

    The result always contains ``event_type``, ``time`` and a ``process``
    dict (``signing_id``, ``cdhash``, ``team_id``, ``is_platform_binary``,
    ``executable_path``, ``start_time``, ``ppid``, ``euid``). An ``event``
    key is added only when the record's ``event`` dict has a ``create`` or
    ``rename`` entry (``create`` takes precedence if both are present).

    Any field that is absent in the input is emitted as ``None``. Nested
    levels that are missing *or explicitly null / not a dict* are treated
    the same way instead of raising, so one malformed record cannot abort
    a whole run.

    Args:
        item: A single decoded JSON event record.

    Returns:
        A new dict holding only the retained fields; *item* is not modified.
    """
    pruned = {
        "event_type": item.get("event_type"),
        "time": item.get("time"),
    }

    pruned["process"] = {
        "signing_id": _dig(item, "process", "signing_id"),
        "cdhash": _dig(item, "process", "cdhash"),
        "team_id": _dig(item, "process", "team_id"),
        "is_platform_binary": _dig(item, "process", "is_platform_binary"),
        "executable_path": _dig(item, "process", "executable", "path"),
        "start_time": _dig(item, "process", "start_time"),
        "ppid": _dig(item, "process", "ppid"),
        "euid": _dig(item, "process", "audit_token", "euid"),
    }

    event = item.get("event")
    if not isinstance(event, dict):
        # A null or non-dict "event" carries nothing to extract.
        event = {}

    # Only destination.existing_file.path is read; any other destination
    # shape yields a destination_path of None.
    if "create" in event:
        path = _dig(event, "create", "destination", "existing_file", "path")
        pruned["event"] = {"create": {"destination_path": path}}
    elif "rename" in event:
        src = _dig(event, "rename", "source", "path")
        dst = _dig(event, "rename", "destination", "existing_file", "path")
        pruned["event"] = {"rename": {"source_path": src, "destination_path": dst}}

    return pruned
39+
40+
def truncate_json_by_accumulation(input_path: str, output_path: str, max_tokens=None) -> str:
    """Prune a JSON list of records and keep only the tail that fits a token budget.

    The input file must contain a top-level JSON list. Entries are visited
    from the last one backwards; each is reduced with ``prune_fields`` and
    its compact JSON serialization is measured with ``count_tokens``.
    Accumulation stops at the first entry that would push the running total
    over the budget, so the output is a contiguous run of the most recent
    entries, written in their original order.

    The reported total is the sum of per-entry counts; the brackets and
    commas of the enclosing output array are not counted.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path the truncated JSON list is written to.
        max_tokens: Token budget. ``None`` (the default) uses the
            module-level ``MAX_TOKENS``.

    Returns:
        ``output_path``, unchanged.

    Raises:
        TypeError: If the top-level JSON value is not a list.
    """
    budget = MAX_TOKENS if max_tokens is None else max_tokens

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise TypeError(f"The top-level JSON in {input_path} is not a list.")

    result = []
    token_total = 0

    with tqdm(total=len(data), desc="Pruning and truncating JSON") as pbar:
        # Iterate newest-to-oldest without materializing a reversed copy.
        for item in reversed(data):
            pruned = prune_fields(item)
            item_str = json.dumps(pruned, separators=(',', ':'))
            item_tokens = count_tokens(item_str)

            if token_total + item_tokens > budget:
                # Stop at the first entry that does not fit so the kept
                # entries stay contiguous.
                break

            result.append(pruned)
            token_total += item_tokens
            pbar.update(1)

    # Entries were collected newest-first; restore the original order.
    result.reverse()

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, separators=(',', ':'))

    print(f"\nTruncated file written to: {output_path}")
    print(f"ℹFinal size: {len(result)} entries, {token_total} tokens.")
    return output_path
73+
74+
if __name__ == "__main__":
    import sys

    # Optional command-line overrides: first argument is the input path,
    # second is the output path. With no arguments the original hard-coded
    # paths are used.
    cli_args = sys.argv[1:]
    input_file = (
        cli_args[0]
        if len(cli_args) > 0
        else "/Users/martinativadar/Desktop/masterLLM/malware/keySteal.json"
    )
    output_file = (
        cli_args[1]
        if len(cli_args) > 1
        else "keySteal_truncated_last_filtered_short.json"
    )
    try:
        truncate_json_by_accumulation(input_file, output_file)
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # exit status so calling scripts can detect it.
        print(f"Error: {e}")
        sys.exit(1)
181

0 commit comments

Comments
 (0)