Skip to content

Commit 69b2dbe

Browse files
authored
feat: add OLMES variant of HumanEval task (#185)
## PR Checklist - [X] Use descriptive commit messages. - [X] Provide tests for your changes. - [X] Update any related documentation and include any relevant screenshots. - [X] Check if changes need to be made to docs (README or any guides in `/docs/`). ## What type of PR is this? (check all applicable) - [ ] Refactor - [X] Feature - [ ] Bug Fix - [ ] Optimization - [X] Documentation Update ## Description Adds the OLMES variant of the HumanEval task. ## Added/updated tests? - [X] Yes - [ ] No, and this is why: _please replace this line with details on why tests have not been included_ - [ ] I need help with writing tests
1 parent 93b4f8a commit 69b2dbe

File tree

7 files changed

+92
-2
lines changed

7 files changed

+92
-2
lines changed

docs/tasks/HumanEval_OLMES.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# HumanEval_OLMES
2+
3+
````
4+
NAME = HumanEval_OLMES
5+
DATASET_PATH = openai/openai_humaneval
6+
SAMPLE_SPLIT = test
7+
FEWSHOT_SPLIT = test
8+
RESPONSE_TYPE = COMPLETION
9+
METRICS = [CodeCompletionAssertion]
10+
SUBJECTS = ['no_subject']
11+
LANGUAGE = <Language.ENG: 'English'>
12+
````
13+
14+
- Module: `eval_framework.tasks.benchmarks.humaneval`
15+
16+
- File: [src/eval_framework/tasks/benchmarks/humaneval.py](../../src/eval_framework/tasks/benchmarks/humaneval.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/humaneval.py)
17+
18+
- Link to dataset: [https://huggingface.co/datasets/openai/openai_humaneval](https://huggingface.co/datasets/openai/openai_humaneval)
19+
20+
More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HumanEval_OLMES"`.

docs/tasks/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
This directory contains the generated documentation for all benchmark tasks available in the package.
44

5-
**Total number of tasks: 157**
5+
**Total number of tasks: 158**
66

77
The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.
88

@@ -67,6 +67,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not
6767
- [HumanEval](HumanEval.md)
6868
- [HumanEvalBPB](HumanEvalBPB.md)
6969
- [HumanEvalInstruct](HumanEvalInstruct.md)
70+
- [HumanEval_OLMES](HumanEval_OLMES.md)
7071
- [IFEval](IFEval.md)
7172
- [IFEvalDe](IFEvalDe.md)
7273
- [IFEvalFiSv](IFEvalFiSv.md)

src/eval_framework/tasks/benchmarks/humaneval.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,28 @@ def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
9999
return [gt] if gt else None
100100

101101

102+
class HumanEval_OLMES(HumanEval):
103+
"""HumanEval OLMES variant replicating codex_humaneval:3shot::olmo3:n32:v2 from oe_eval.
104+
105+
Recommended EvalConfig settings for full replication:
106+
repeats: 32
107+
llm_args: {sampling_params: {temperature: 0.6, top_p: 0.6}}
108+
"""
109+
110+
NAME = "Human Eval OLMES"
111+
112+
def __init__(self, num_fewshot: int = 3) -> None:
113+
super().__init__(num_fewshot)
114+
self.stop_sequences = ["\nclass", "\nif", "\nprint", "\n#", "\n```", "\n```\n\n", "<|eot_id|>"]
115+
self.max_tokens = 1024
116+
117+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
118+
return "```python\n" + item["prompt"]
119+
120+
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
121+
return item["canonical_solution"] + "```"
122+
123+
102124
class HumanEvalInstruct(HumanEval):
103125
# See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
104126
NAME = "Human Eval Instruct"

src/eval_framework/tasks/task_names.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def register_all_tasks() -> None:
6161
register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_FR")
6262
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval")
6363
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalBPB")
64+
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval_OLMES")
6465
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct")
6566
register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval")
6667
register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe")

tests/tests_eval_framework/tasks/task-prompts-hashes.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@
114114
"HumanEvalBPB.Llama3Formatter": "57f5a23f0cf320ea2c675c178e9f5bad",
115115
"HumanEvalInstruct.ConcatFormatter": "100e994d25219d93daa3ace8a8beb730",
116116
"HumanEvalInstruct.Llama3Formatter": "0c1c4e07c9ecd0445257118bc5cecc09",
117+
"HumanEval_OLMES.ConcatFormatter": "43d5d2b350304df54708165cf30e4009",
118+
"HumanEval_OLMES.Llama3Formatter": "0595b2b93a403ceb3cfef74baa4dd7bf",
117119
"IFEval.ConcatFormatter": "b517d9d281cd8d5db2ea72b47bd44314",
118120
"IFEval.Llama3Formatter": "7739c2862af662f2b146f4eae61ac208",
119121
"IFEvalDe.ConcatFormatter": "798e567efa346a45b42deff904a40b22",

tests/tests_eval_framework/tasks/test_all_formatters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
"HELLASWAG_DE": {"num_fewshot": 1},
5555
"HELLASWAG_EU20_DE": {"num_fewshot": 1},
5656
"HELLASWAG_EU20_FR": {"num_fewshot": 1},
57+
"HumanEval_OLMES": {"num_fewshot": 3},
5758
"InfiniteBench_CodeDebug": {"num_fewshot": 0},
5859
"InfiniteBench_CodeRun": {"num_fewshot": 0},
5960
"InfiniteBench_EnDia": {"num_fewshot": 0},

tests/tests_eval_framework/tasks/test_humaneval.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pytest
22

3-
from eval_framework.tasks.benchmarks.humaneval import HumanEval, HumanEvalInstruct
3+
from eval_framework.tasks.benchmarks.humaneval import HumanEval, HumanEval_OLMES, HumanEvalInstruct
44
from eval_framework.tasks.utils import run_python_code
55
from tests.tests_eval_framework.utils import DatasetPatcher
66

@@ -24,6 +24,49 @@ def test_code_is_executed(self, human_eval_task: HumanEval) -> None:
2424
assert i == 9
2525

2626

27+
class TestHumanEvalOLMES:
28+
@pytest.fixture
29+
def human_eval_olmes_task(self) -> HumanEval_OLMES:
30+
with DatasetPatcher(HumanEval_OLMES, num_fewshot=3) as patched_task:
31+
return patched_task
32+
33+
def test_code_is_executed(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
34+
assert len(human_eval_olmes_task.SUBJECTS) > 0
35+
subject = human_eval_olmes_task.SUBJECTS[0]
36+
human_eval_olmes_task._load_dataset(subject)
37+
i = 0
38+
for i, item in enumerate(human_eval_olmes_task.dataset[human_eval_olmes_task.SAMPLE_SPLIT][:10]):
39+
item["subject"] = subject
40+
sample = human_eval_olmes_task._create_samples(item, i, subject)[0]
41+
formatted_code = human_eval_olmes_task.post_process_generated_completion(item["canonical_solution"], sample)
42+
assert run_python_code(formatted_code).endswith("True")
43+
formatted_code = human_eval_olmes_task.post_process_generated_completion("", sample)
44+
assert not run_python_code(formatted_code).endswith("True")
45+
assert i == 9
46+
47+
def test_olmes_settings(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
48+
assert human_eval_olmes_task.num_fewshot == 3
49+
assert human_eval_olmes_task.max_tokens == 1024
50+
assert "\nclass" in human_eval_olmes_task.stop_sequences
51+
assert "\nif" in human_eval_olmes_task.stop_sequences
52+
assert "\nprint" in human_eval_olmes_task.stop_sequences
53+
assert "\n#" in human_eval_olmes_task.stop_sequences
54+
assert "\n```" in human_eval_olmes_task.stop_sequences
55+
assert human_eval_olmes_task.SAMPLE_SPLIT == "test"
56+
assert human_eval_olmes_task.FEWSHOT_SPLIT == "test"
57+
58+
def test_olmes_prompt_format(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
59+
human_eval_olmes_task._load_dataset(human_eval_olmes_task.SUBJECTS[0])
60+
item = human_eval_olmes_task.dataset[human_eval_olmes_task.SAMPLE_SPLIT][0]
61+
instruction = human_eval_olmes_task._get_instruction_text(item)
62+
assert instruction.startswith("```python\n")
63+
assert instruction == "```python\n" + item["prompt"]
64+
65+
fewshot_target = human_eval_olmes_task._get_fewshot_target_text(item)
66+
assert fewshot_target.endswith("```")
67+
assert fewshot_target == item["canonical_solution"] + "```"
68+
69+
2770
class TestHumanEvalInstructCode:
2871
@pytest.fixture
2972
def human_eval_task_inst(self) -> HumanEvalInstruct:

0 commit comments

Comments
 (0)