Skip to content

Commit 9f55a56

Browse files
committed
refactor(eval): use streaming dataset loading for AIME
1 parent 87c9c7c commit 9f55a56

File tree

1 file changed

+10
-16
lines changed

1 file changed

+10
-16
lines changed

src/strands_env/eval/aime.py

Lines changed: 10 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -38,32 +38,26 @@ class AIMEEvaluator(Evaluator):
3838

3939
@override
4040
def load_dataset(self) -> Iterable[Action]:
41-
"""Load AIME dataset from HuggingFace.
41+
"""Load AIME dataset from HuggingFace (streaming).
4242
43-
Returns:
44-
Iterable of Action objects with problem text and ground truth.
43+
Yields:
44+
Action objects with problem text and ground truth.
4545
"""
46-
dataset = load_dataset(self.dataset_path, split="train")
46+
dataset = load_dataset(self.dataset_path, split="train", streaming=True)
4747

48-
actions = []
4948
for i, row in enumerate(dataset):
5049
problem, answer = row.get("problem"), row.get("answer")
5150
if problem is None or answer is None:
5251
logger.warning(f"Row {i}: missing problem/answer, skipped")
5352
continue
54-
actions.append(
55-
Action(
56-
message=str(problem),
57-
task_context=TaskContext(
58-
id=f"{self.benchmark_name}_{row.get('id', i)}",
59-
ground_truth=str(answer),
60-
),
61-
)
53+
yield Action(
54+
message=str(problem),
55+
task_context=TaskContext(
56+
id=f"{self.benchmark_name}_{row.get('id', i)}",
57+
ground_truth=str(answer),
58+
),
6259
)
6360

64-
logger.info(f"[{self.benchmark_name}] Loaded {len(actions)}/{len(dataset)} prompts")
65-
return actions
66-
6761

6862
@register("aime-2024")
6963
class AIME2024Evaluator(AIMEEvaluator):

0 commit comments

Comments
 (0)