evaluate.py
import json
import os

from config import VALID_LABELS, DATA_PATH, TEST_FILE
from classifier import classify_episode, load_labeled_examples


def run_evaluation() -> dict:
    """
    Run the classifier against the held-out test set and return full results.

    This function is already complete. It:
      1. Loads the labeled training examples (from your my_labels.json)
      2. Loads the test episodes (with ground-truth labels)
      3. Runs classify_episode() on each test description
      4. Returns a results dict with predictions, ground truth, and per-episode detail

    You'll use the results dict in compute_accuracy() and compute_per_class_accuracy().
    """
    labeled_examples = load_labeled_examples()

    test_path = os.path.join(DATA_PATH, TEST_FILE)
    with open(test_path, encoding="utf-8") as f:
        test_episodes = json.load(f)

    results = []
    for episode in test_episodes:
        print(f" Classifying: {episode['title'][:60]}...")
        prediction = classify_episode(episode["description"], labeled_examples)
        results.append({
            "id": episode["id"],
            "title": episode["title"],
            "description": episode["description"],
            "ground_truth": episode["label"],
            "predicted": prediction["label"],
            "reasoning": prediction["reasoning"],
            "correct": prediction["label"] == episode["label"],
        })

    predictions = [r["predicted"] for r in results]
    ground_truth = [r["ground_truth"] for r in results]

    return {
        "results": results,
        "predictions": predictions,
        "ground_truth": ground_truth,
        "total": len(results),
    }


def compute_accuracy(predictions: list[str], ground_truth: list[str]) -> float:
    """
    Compute overall classification accuracy.

    TODO — Milestone 3:
    Accuracy = number of correct predictions / total predictions.
    A prediction is correct when it exactly matches the ground truth label.

    Before writing code, complete specs/evaluation-spec.md.
    """
    return 0.0
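

# A hedged sketch, not the assignment's answer: one way compute_accuracy()
# could be filled in once specs/evaluation-spec.md is done. The helper name
# _sketch_compute_accuracy is hypothetical and exists only so the TODO stub
# above stays untouched.
def _sketch_compute_accuracy(predictions: list[str], ground_truth: list[str]) -> float:
    # Guard against an empty test set to avoid dividing by zero.
    if not predictions:
        return 0.0
    # A prediction counts as correct only on an exact match with the ground-truth label.
    correct = sum(p == g for p, g in zip(predictions, ground_truth))
    return correct / len(predictions)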
def compute_per_class_accuracy(
    predictions: list[str], ground_truth: list[str]
) -> dict[str, dict]:
    """
    Compute accuracy broken down by each label class.

    TODO — Milestone 3 (complete after compute_accuracy):
    For each label in VALID_LABELS, compute:
      - "correct"  : number of episodes with this ground-truth label predicted correctly
      - "total"    : number of episodes with this ground-truth label
      - "accuracy" : correct / total (0.0 if total is 0)

    Return a dict keyed by label. Example:
        {
            "interview": {"correct": 4, "total": 5, "accuracy": 0.8},
            "solo": {"correct": 5, "total": 5, "accuracy": 1.0},
            ...
        }

    Before writing code, complete specs/evaluation-spec.md.
    """
    return {label: {"correct": 0, "total": 0, "accuracy": 0.0} for label in VALID_LABELS}
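

# A hedged sketch, not the assignment's answer: one way the per-class
# breakdown could be computed from the labels already imported above. The
# helper name _sketch_per_class_accuracy is hypothetical; the real solution
# belongs in the stub above.
def _sketch_per_class_accuracy(
    predictions: list[str], ground_truth: list[str]
) -> dict[str, dict]:
    # Start every label at zero so classes absent from the test set still appear.
    stats = {label: {"correct": 0, "total": 0, "accuracy": 0.0} for label in VALID_LABELS}
    for pred, truth in zip(predictions, ground_truth):
        if truth not in stats:
            continue  # skip any ground-truth label outside VALID_LABELS
        stats[truth]["total"] += 1
        if pred == truth:
            stats[truth]["correct"] += 1
    for entry in stats.values():
        entry["accuracy"] = entry["correct"] / entry["total"] if entry["total"] else 0.0
    return stats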
def format_evaluation_report(eval_results: dict) -> str:
    """
    Format evaluation results into a readable report string.

    This function is already complete. Pass it the dict returned by run_evaluation().
    """
    predictions = eval_results["predictions"]
    ground_truth = eval_results["ground_truth"]
    results = eval_results["results"]

    accuracy = compute_accuracy(predictions, ground_truth)
    per_class = compute_per_class_accuracy(predictions, ground_truth)

    lines = [
        f"## Evaluation Results\n",
        f"**Overall accuracy:** {accuracy:.1%} ({sum(r['correct'] for r in results)}/{eval_results['total']})\n",
        "\n**Per-class accuracy:**",
    ]

    for label, stats in per_class.items():
        bar = "█" * int(stats["accuracy"] * 10) + "░" * (10 - int(stats["accuracy"] * 10))
        lines.append(f" {label:<12} {bar} {stats['accuracy']:.0%} ({stats['correct']}/{stats['total']})")

    misclassified = [r for r in results if not r["correct"]]
    if misclassified:
        lines.append(f"\n**Misclassified ({len(misclassified)}):**")
        for r in misclassified:
            lines.append(f" [{r['ground_truth']} → {r['predicted']}] {r['title']}")
    else:
        lines.append("\n**No misclassifications — perfect score!**")

    return "\n".join(lines)
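

# Usage sketch (an assumption, since the original file defines no entry point):
# running this module directly would evaluate the classifier and print the report
# built by format_evaluation_report().
if __name__ == "__main__":
    evaluation = run_evaluation()
    print(format_evaluation_report(evaluation))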