-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge_results.py
More file actions
executable file
·166 lines (135 loc) · 4.55 KB
/
Copy pathmerge_results.py
File metadata and controls
executable file
·166 lines (135 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python3
"""Merge worker logs into consolidated results with optional de-dupe."""
from __future__ import annotations
import argparse
import json
import os
import time
from typing import Dict, Iterable, Optional, Set
def _load_state(path: str) -> Dict[str, int]:
    """Load the per-source read offsets persisted by a previous merge pass.

    Args:
        path: Location of the JSON state file.

    Returns:
        A mapping of source file path to the stream position already merged.
        An empty dict is returned when the file is missing, is not valid
        JSON, or does not contain a JSON object.  Entries whose value is not
        a non-negative integer are dropped so that the affected source is
        simply re-read from the start instead of crashing a later ``seek``.
    """
    # EAFP: open directly instead of checking existence first, which avoids
    # a race with a concurrent delete between the check and the open.
    try:
        with open(path, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
    except FileNotFoundError:
        return {}
    except json.JSONDecodeError:
        # A corrupt state file is treated the same as no state at all.
        return {}

    if not isinstance(loaded, dict):
        # Valid JSON but the wrong shape (e.g. a list or null): callers rely
        # on dict methods, so fall back to an empty state.
        return {}

    return {
        name: position
        for name, position in loaded.items()
        # bool is a subclass of int; exclude it explicitly.
        if isinstance(position, int)
        and not isinstance(position, bool)
        and position >= 0
    }
def _save_state(path: str, state: Dict[str, int]) -> None:
    """Persist the per-source read offsets as indented JSON.

    The data is written to a sibling temporary file and then moved over
    *path* with ``os.replace``, so the previous state file stays intact
    until the new one is completely written.  Writing in place would leave
    a truncated file on failure, which the loader treats as "no state" and
    which would cause every source to be merged again from the beginning.

    Args:
        path: Destination of the JSON state file.
        state: Mapping of source file path to the position already merged.

    Raises:
        TypeError: If *state* contains a value ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as handle:
            json.dump(state, handle, indent=2)
            # Push the bytes to disk before the rename makes them visible.
            handle.flush()
            os.fsync(handle.fileno())
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup of the half-written temp file; the original
        # error is what the caller needs to see, so re-raise it.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
def _grid_key(grid: object) -> Optional[str]:
    """Return a canonical string for *grid*, or ``None`` if there is none.

    The key is the JSON encoding of *grid* with object keys sorted, so two
    grids that are equal as JSON values produce the same key.  ``None`` is
    returned when *grid* is ``None`` or when encoding raises ``TypeError``.
    """
    if grid is not None:
        try:
            return json.dumps(grid, sort_keys=True)
        except TypeError:
            # Not JSON-encodable: there is no usable key for this value.
            pass
    return None
def _load_seen(path: str) -> Set[str]:
    """Collect the grid keys of every record already present in *path*.

    Each non-blank line is parsed as JSON.  For JSON objects, the value
    under ``"grid"`` (or, if that is missing or falsy, ``"best_square"``)
    is turned into a key with ``_grid_key`` and added to the result.

    Lines that are blank, are not valid JSON, or are valid JSON but not an
    object are skipped; previously a non-object line raised
    ``AttributeError`` and aborted the whole load.

    Args:
        path: JSON-lines file to scan.

    Returns:
        The set of keys found; empty when the file does not exist.
    """
    seen: Set[str] = set()
    # EAFP: a missing file simply means nothing has been seen yet.
    try:
        handle = open(path, "r", encoding="utf-8")
    except FileNotFoundError:
        return seen

    with handle:
        for line in handle:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(data, dict):
                # Scalars, lists and null have no "grid" field to key on.
                continue
            key = _grid_key(data.get("grid") or data.get("best_square"))
            if key:
                seen.add(key)
    return seen
def _merge_file(
    *,
    source_path: str,
    dest_path: str,
    state: Dict[str, int],
    dedupe: bool = False,
    seen: Optional[Set[str]] = None,
) -> int:
    """Append the not-yet-merged lines of *source_path* to *dest_path*.

    Reading resumes at the position stored in ``state[source_path]`` (0 if
    absent) and the position reached is stored back into *state* before
    returning.  Blank lines are skipped.

    When *dedupe* is true and *seen* is provided, each line is parsed as
    JSON: unparseable lines are dropped, and objects whose ``"grid"`` /
    ``"best_square"`` key is already in *seen* are dropped; new keys are
    added to *seen*.  JSON values that are not objects carry no key and are
    copied through unchanged (previously they raised ``AttributeError``).

    A final line without a trailing newline is handled specially, because
    the source may still be in the middle of being written:

    * if it does not parse as JSON it is assumed to be incomplete and is
      left for the next call (the stored position is not advanced past it);
    * if it does parse, it is merged and a newline is added so that later
      appends to *dest_path* cannot fuse with it.

    Args:
        source_path: JSON-lines file to read from.
        dest_path: File to append to (created if needed).
        state: Mutable mapping of source path to stream position.
        dedupe: Whether to drop records whose grid key was already seen.
        seen: Set of known grid keys; updated in place when de-duplicating.

    Returns:
        The number of lines written to *dest_path*; 0 if the source file
        does not exist.
    """
    if not os.path.exists(source_path):
        return 0

    offset = state.get(source_path, 0)
    merged = 0
    check_dupes = dedupe and seen is not None

    with open(source_path, "r", encoding="utf-8") as source, open(
        dest_path, "a", encoding="utf-8"
    ) as dest:
        source.seek(offset)
        # readline() is used instead of iterating the file because tell()
        # is unavailable on a text stream while it is being iterated.
        while True:
            line = source.readline()
            if not line:
                break
            if not line.strip():
                offset = source.tell()
                continue

            terminated = line.endswith("\n")
            record = None
            if check_dupes or not terminated:
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    if not terminated:
                        # Probably a record still being written: stop here
                        # without advancing so the next pass re-reads it.
                        break
                    # Complete but malformed line in dedupe mode: drop it.
                    offset = source.tell()
                    continue

            if check_dupes and isinstance(record, dict):
                key = _grid_key(record.get("grid") or record.get("best_square"))
                if key is not None:
                    if key in seen:
                        offset = source.tell()
                        continue
                    seen.add(key)

            dest.write(line if terminated else line + "\n")
            merged += 1
            offset = source.tell()

    state[source_path] = offset
    return merged
def _worker_files(results_dir: str, prefix: str) -> Iterable[str]:
    """Yield the paths of ``<prefix>*.jsonl`` entries inside *results_dir*.

    Entries are yielded in sorted name order.  ``os.listdir`` returns names
    in arbitrary order, so without sorting the order in which sources are
    merged (and therefore which of two duplicate records is kept when
    de-duplicating) could differ from run to run.

    Args:
        results_dir: Directory to list.
        prefix: Required start of the entry name.

    Yields:
        ``results_dir`` joined with each matching entry name.
    """
    for name in sorted(os.listdir(results_dir)):
        if name.startswith(prefix) and name.endswith(".jsonl"):
            yield os.path.join(results_dir, name)
def merge_once(results_dir: str, dedupe_near: bool) -> None:
    """Run one merge pass over every per-worker file in *results_dir*.

    Three families of source files are appended to their consolidated
    counterparts inside the same directory:

    * ``worker_*.jsonl``       -> ``experiments.jsonl``
    * ``near_misses_*.jsonl``  -> ``near_misses.jsonl``
    * ``exact_hits_*.jsonl``   -> ``exact_hits.jsonl``

    Read positions are loaded from and saved to ``merge_state.json``.  The
    state is saved in a ``finally`` clause so that, if merging one file
    raises, the positions of files already merged in this pass are still
    persisted and those files are not appended a second time on the next
    run.

    Args:
        results_dir: Directory holding the source and consolidated files;
            created if it does not exist.
        dedupe_near: When true, near-miss records whose grid key is already
            present in ``near_misses.jsonl`` are not appended again.
    """
    os.makedirs(results_dir, exist_ok=True)
    state_path = os.path.join(results_dir, "merge_state.json")
    state = _load_state(state_path)

    near_path = os.path.join(results_dir, "near_misses.jsonl")
    seen_near = _load_seen(near_path) if dedupe_near else None

    # (source prefix, destination, dedupe flag, seen-set) per file family.
    plan = (
        ("worker_", os.path.join(results_dir, "experiments.jsonl"), False, None),
        ("near_misses_", near_path, dedupe_near, seen_near),
        ("exact_hits_", os.path.join(results_dir, "exact_hits.jsonl"), False, None),
    )

    try:
        for prefix, dest_path, dedupe, seen in plan:
            for path in _worker_files(results_dir, prefix):
                _merge_file(
                    source_path=path,
                    dest_path=dest_path,
                    state=state,
                    dedupe=dedupe,
                    seen=seen,
                )
    finally:
        _save_state(state_path, state)
def main() -> None:
    """Parse command-line options and run the merger once or in a loop.

    Options:
        --results-dir     Directory to merge (default ``results``).
        --interval        Seconds to sleep between passes (default 300.0).
        --loop            Keep merging forever instead of running once.
        --no-dedupe-near  Disable de-duplication of near-miss records.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-dir", default="results")
    cli.add_argument("--interval", type=float, default=300.0)
    cli.add_argument("--loop", action="store_true")
    cli.add_argument("--no-dedupe-near", action="store_true")
    options = cli.parse_args()

    dedupe_near = not options.no_dedupe_near

    # The first pass always runs; with --loop, each further pass is
    # preceded by a sleep, giving the sequence merge, sleep, merge, ...
    merge_once(options.results_dir, dedupe_near)
    while options.loop:
        time.sleep(options.interval)
        merge_once(options.results_dir, dedupe_near)


# Run the merger only when executed as a script, not when imported.
if __name__ == "__main__":
    main()