-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_results.py
More file actions
93 lines (74 loc) · 3.02 KB
/
Copy pathparse_results.py
File metadata and controls
93 lines (74 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import argparse
import json
import logging
from pathlib import Path
# Configure the root logger once at import time: INFO and above, with each
# record rendered as "<timestamp> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by every function in this file.
logger = logging.getLogger(__name__)
def extract_text_from_response(response_obj):
    """Return the concatenated text found in a response object.

    Args:
        response_obj: Either a plain string, which is returned unchanged, or
            a dict holding a ``"candidates"`` list whose items carry
            ``candidate["content"]["parts"]``; each part may have a
            ``"text"`` entry.

    Returns:
        The ``"text"`` values of all parts of all candidates, joined in
        order. An empty string is returned when the input is neither a str
        nor a dict, or when no text is present.
    """
    if isinstance(response_obj, str):
        return response_obj
    if not isinstance(response_obj, dict):
        # None (or any other non-mapping payload) carries no text.
        return ""

    pieces = []
    # `or` fallbacks cover keys that are present but explicitly null, which
    # a `.get(key, default)` lookup alone would not handle.
    for candidate in response_obj.get("candidates") or []:
        content = candidate.get("content") or {}
        for part in content.get("parts") or []:
            text = part.get("text")
            if isinstance(text, str):
                pieces.append(text)
    return "".join(pieces)
def _load_results(results_file):
    """Read a JSONL results file into a dict keyed by each entry's "key"."""
    results = {}
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(results_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # skip blank lines between records
            entry = json.loads(line)
            # A repeated key overwrites the earlier entry.
            results[entry["key"]] = entry
    return results


def _load_instances(instances_dir):
    """Index every ``*.json`` file directly under *instances_dir* by its "id"."""
    inst_map = {}
    for json_file in Path(instances_dir).glob("*.json"):
        with open(json_file, encoding="utf-8") as f:
            inst = json.load(f)
        inst_map[inst["id"]] = inst
    return inst_map


def _response_text(response):
    """Return the text of a result's "response" value, or "" if it has none."""
    if isinstance(response, dict):
        return extract_text_from_response(response)
    if isinstance(response, str):
        return response
    # Anything else (None, list, number, ...) cannot be stripped as text;
    # report it as empty so the caller logs and skips the entry.
    return ""


def parse_edit_instruction_results(results_file, instances_dir, output_dir):
    """Turn batch results into one edit-instruction JSON file per instance.

    Each non-blank line of *results_file* is a JSON object with a ``"key"``
    of the form ``category__subcategory__inst_id`` and a ``"response"``
    (a string, or a dict passed to ``extract_text_from_response``). For every
    entry whose instance is found in *instances_dir*, a file
    ``<output_dir>/<category>/<subcategory>/<inst_id>.json`` is written with
    the instance's prompt, type, difficulty and the cleaned instruction.

    Entries are skipped with a warning when the key does not have exactly
    three ``__``-separated parts, the instance id is unknown, the response is
    empty, or the instruction has fewer than three words.

    Args:
        results_file: Path to the results JSONL file.
        instances_dir: Directory containing the source instance ``*.json``
            files; each must have an ``"id"`` and ``["data"]["prompt"]``.
        output_dir: Root directory for the generated files (created as
            needed).

    Raises:
        OSError: If an input file cannot be read or an output cannot be
            written.
        json.JSONDecodeError: If a results line or instance file is not
            valid JSON.
        KeyError: If a results entry lacks ``"key"``, or an instance lacks
            ``"id"`` or ``["data"]["prompt"]``.
    """
    results = _load_results(results_file)
    inst_map = _load_instances(instances_dir)
    output_dir = Path(output_dir)

    count = 0
    for key, entry in results.items():
        parts = key.split("__")
        if len(parts) != 3:
            logger.warning("Unexpected key: %s", key)
            continue
        category, subcategory, inst_id = parts

        if inst_id not in inst_map:
            logger.warning("Instance not found: %s", inst_id)
            continue

        text = _response_text(entry.get("response", ""))
        if not text:
            logger.warning("Empty response for %s", key)
            continue

        # Drop surrounding whitespace and trailing periods; the final
        # rstrip() removes whitespace exposed by stripping a " ." ending.
        instruction = text.strip().rstrip(".").rstrip()
        if len(instruction.split()) < 3:
            logger.warning("Instruction too short for %s: %s", key, instruction)
            continue

        inst = inst_map[inst_id]
        output_json = {
            "id": inst_id,
            "data": {"prompt": inst["data"]["prompt"]},
            "type": inst.get("type", ""),
            "difficulty": inst.get("difficulty", ""),
            "category": category,
            "subcategory": subcategory,
            "edit_instruction": instruction,
        }

        out_path = output_dir / category / subcategory
        out_path.mkdir(parents=True, exist_ok=True)
        with open(out_path / f"{inst_id}.json", "w", encoding="utf-8") as f:
            json.dump(output_json, f, indent=4)
        count += 1

    logger.info("Parsed %d edit instructions", count)
def main():
    """Command-line entry point.

    Parses the three required options (``--results_file``,
    ``--instances_dir``, ``--output_dir``) and hands them to
    ``parse_edit_instruction_results``.
    """
    cli = argparse.ArgumentParser(description="Parse batch results into edit instruction JSONs")

    # (flag, help text) for each required option, in registration order.
    required_options = (
        ("--results_file", "Path to results JSONL"),
        ("--instances_dir", "Directory with source instance JSONs"),
        ("--output_dir", "Output directory for edit instruction JSONs"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    opts = cli.parse_args()
    results_file, instances_dir, output_dir = (
        opts.results_file,
        opts.instances_dir,
        opts.output_dir,
    )
    parse_edit_instruction_results(results_file, instances_dir, output_dir)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()