-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoutput.py
195 lines (172 loc) · 6.02 KB
/
output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from dataclasses import dataclass
from statistics import median
from typing import Callable, List

from tabulate import tabulate

from scorer import GameResults, RoundResult
@dataclass
class ModelStats:
    """Summary statistics for one model over a set of rounds.

    Attributes:
        model_name: Display name of the model.
        total_score: Sum of the per-round scores.
        score_percentage: ``total_score`` as a percentage of the maximum
            achievable score for the rounds played.
        avg_score_per_game: ``total_score`` divided by the number of games.
        median_distance_km: Median of the per-round distances.
        min_distance_km: Smallest per-round distance (the best guess).
        max_distance_km: Largest per-round distance (the worst guess).
    """

    model_name: str
    total_score: int
    score_percentage: float
    avg_score_per_game: float
    median_distance_km: float
    min_distance_km: float
    max_distance_km: float

    @classmethod
    def from_rounds(
        cls,
        model_name: str,
        rounds: List["RoundResult"],
        score_fn: Callable[["RoundResult"], int],
        distance_fn: Callable[["RoundResult"], float],
        total_games: int,
        *,
        max_round_score: int = 5000,
    ) -> "ModelStats":
        """Create stats from a list of rounds.

        Args:
            model_name: Display name stored on the result.
            rounds: Rounds to summarise; must contain at least one round.
            score_fn: Extracts this model's score from a round.
            distance_fn: Extracts this model's distance from a round.
            total_games: Number of games the rounds belong to; must be
                positive because it is the divisor of the per-game average.
            max_round_score: Highest score attainable in a single round,
                used as the denominator of ``score_percentage``.

        Returns:
            A populated ``ModelStats`` instance.

        Raises:
            ValueError: If ``rounds`` is empty, or ``total_games`` or
                ``max_round_score`` is not positive.
        """
        if not rounds:
            raise ValueError("rounds must contain at least one round")
        if total_games <= 0:
            raise ValueError("total_games must be a positive integer")
        if max_round_score <= 0:
            raise ValueError("max_round_score must be a positive integer")

        scores = [score_fn(r) for r in rounds]
        distances = [distance_fn(r) for r in rounds]
        total_score = sum(scores)
        total_possible = len(rounds) * max_round_score

        return cls(
            model_name=model_name,
            total_score=total_score,
            score_percentage=(total_score / total_possible) * 100,
            avg_score_per_game=total_score / total_games,
            # statistics.median averages the two middle values when the
            # number of rounds is even, instead of picking the upper one.
            median_distance_km=median(distances),
            min_distance_km=min(distances),
            max_distance_km=max(distances),
        )

    def to_table_row(self) -> List[str]:
        """Convert stats to a table row for printing.

        Returns:
            Six formatted cells: model name, score percentage, total score,
            then the median, minimum and maximum distance.
            ``avg_score_per_game`` is not part of this row.
        """
        return [
            self.model_name,
            f"{self.score_percentage:.1f}%",
            f"{self.total_score:,d}",
            f"{self.median_distance_km:,.1f}",
            f"{self.min_distance_km:,.1f}",
            f"{self.max_distance_km:,.1f}",
        ]
@dataclass
class AggregateModelStats(ModelStats):
    """Statistics for a model's performance across multiple games.

    Adds the best and worst whole-game scores to the per-round summary
    fields inherited from ``ModelStats``.

    Attributes:
        max_game_score: Highest total score achieved in a single game.
        min_game_score: Lowest total score achieved in a single game.
    """

    max_game_score: int
    min_game_score: int

    @classmethod
    def from_rounds(
        cls,
        model_name: str,
        rounds: List["RoundResult"],
        score_fn: Callable[["RoundResult"], int],
        distance_fn: Callable[["RoundResult"], float],
        total_games: int,
        *,
        max_round_score: int = 5000,
        rounds_per_game: int = 5,
    ) -> "AggregateModelStats":
        """Create aggregate stats from a list of rounds.

        Args:
            model_name: Display name stored on the result.
            rounds: Rounds from all games, concatenated game by game; must
                contain at least one round.
            score_fn: Extracts this model's score from a round.
            distance_fn: Extracts this model's distance from a round.
            total_games: Number of games the rounds belong to; must be
                positive because it is the divisor of the per-game average.
            max_round_score: Highest score attainable in a single round,
                used as the denominator of ``score_percentage``.
            rounds_per_game: Number of consecutive rounds that make up one
                game when computing per-game totals.

        Returns:
            A populated ``AggregateModelStats`` instance.

        Raises:
            ValueError: If ``rounds`` is empty, or ``total_games``,
                ``max_round_score`` or ``rounds_per_game`` is not positive.
        """
        if not rounds:
            raise ValueError("rounds must contain at least one round")
        if total_games <= 0:
            raise ValueError("total_games must be a positive integer")
        if max_round_score <= 0:
            raise ValueError("max_round_score must be a positive integer")
        if rounds_per_game <= 0:
            raise ValueError("rounds_per_game must be a positive integer")

        scores = [score_fn(r) for r in rounds]
        distances = [distance_fn(r) for r in rounds]
        total_score = sum(scores)
        total_possible = len(rounds) * max_round_score

        # Per-game totals are taken over consecutive slices of the flat round
        # list; a trailing partial slice is counted as a (short) game.
        game_scores = [
            sum(scores[start : start + rounds_per_game])
            for start in range(0, len(scores), rounds_per_game)
        ]

        return cls(
            model_name=model_name,
            total_score=total_score,
            score_percentage=(total_score / total_possible) * 100,
            avg_score_per_game=total_score / total_games,
            # statistics.median averages the two middle values when the
            # number of rounds is even, instead of picking the upper one.
            median_distance_km=median(distances),
            min_distance_km=min(distances),
            max_distance_km=max(distances),
            max_game_score=max(game_scores),
            min_game_score=min(game_scores),
        )

    def to_table_row(self) -> List[str]:
        """Convert aggregate stats to a table row for printing.

        Returns:
            Eight formatted cells: model name, score percentage, average
            score per game, best game score, worst game score, then the
            median, minimum and maximum distance.
        """
        return [
            self.model_name,
            f"{self.score_percentage:.1f}%",
            f"{self.avg_score_per_game:,.1f}",
            f"{self.max_game_score:,d}",
            f"{self.min_game_score:,d}",
            f"{self.median_distance_km:,.1f}",
            f"{self.min_distance_km:,.1f}",
            f"{self.max_distance_km:,.1f}",
        ]
def print_round_results(round_result: RoundResult) -> None:
    """Print a report for one round.

    The report shows the round number, the actual location to four decimal
    places, and each model's score, distance and explanation.
    """
    actual = round_result.actual_location
    report = [
        f"\n=== Round {round_result.round_number} Results ===\n",
        f"Actual Location: ({actual.lat:.4f}, {actual.lng:.4f})",
    ]

    # Both models are reported with the same four-line layout.
    guesses = (
        ("GPT-4o", round_result.gpt4o_guess),
        ("o1", round_result.o1_guess),
    )
    for label, guess in guesses:
        report.append(f"\n{label}:")
        report.append(f" Score: {guess.score:,d}")
        report.append(f" Distance: {guess.distance_km:.1f} km")
        report.append(f" Explanation: {guess.explanation}")

    report.append("\n==================\n")
    print("\n".join(report))
def print_game_results(game_results: GameResults) -> None:
    """Print a summary table for one finished game.

    Builds one ``ModelStats`` row per model from the game's rounds and
    renders both rows under a shared header as a GitHub-style table.
    """

    def summarise(label, pick_guess):
        # pick_guess selects the model's guess object from a round.
        return ModelStats.from_rounds(
            label,
            game_results.rounds,
            lambda r: pick_guess(r).score,
            lambda r: pick_guess(r).distance_km,
            total_games=1,
        )

    gpt4o_stats = summarise("GPT-4o", lambda r: r.gpt4o_guess)
    o1_stats = summarise("o1", lambda r: r.o1_guess)

    header = [
        "Model",
        "Score %",
        "Avg Score/Game (/25,000)",
        "Median Distance (km)",
        "Best Guess (km)",
        "Worst Guess (km)",
    ]
    table = [header, gpt4o_stats.to_table_row(), o1_stats.to_table_row()]

    print("\n=== Final Results ===\n")
    print(tabulate(table, headers="firstrow", tablefmt="github"))
    print("\n==================\n")
def print_aggregate_results(all_games: List[GameResults]) -> None:
    """Print aggregate statistics across all games.

    Flattens the rounds of every game in order, builds one
    ``AggregateModelStats`` row per model and renders both rows as a
    GitHub-style table. When there are no rounds at all, a short notice is
    printed instead of a table.

    Args:
        all_games: Results of every game to include in the aggregate.
    """
    all_rounds = [r for game in all_games for r in game.rounds]

    # Without any rounds the statistics are undefined (the percentage and
    # average would divide by zero), so report that explicitly and stop.
    if not all_rounds:
        print("\n=== Aggregate Results ===\n")
        print("No rounds played; nothing to aggregate.")
        print("\n==================\n")
        return

    total_games = len(all_games)

    # Get stats for each model
    gpt4o = AggregateModelStats.from_rounds(
        "GPT-4o",
        all_rounds,
        lambda r: r.gpt4o_guess.score,
        lambda r: r.gpt4o_guess.distance_km,
        total_games=total_games,
    )
    o1 = AggregateModelStats.from_rounds(
        "o1",
        all_rounds,
        lambda r: r.o1_guess.score,
        lambda r: r.o1_guess.distance_km,
        total_games=total_games,
    )

    data = [
        [
            "Model",
            "Score %",
            "Avg Score/Game (/25,000)",
            "Best Game (/25,000)",
            "Worst Game (/25,000)",
            "Median Distance (km)",
            "Best Guess (km)",
            "Worst Guess (km)",
        ],
        gpt4o.to_table_row(),
        o1.to_table_row(),
    ]

    print("\n=== Aggregate Results ===\n")
    print(tabulate(data, headers="firstrow", tablefmt="github"))
    print("\n==================\n")