Skip to content

Commit 338f98a

Browse files
committed
feat: add HumanEval benchmark (164 function completion problems, pass@1)
1 parent 33b00b1 commit 338f98a

File tree

8 files changed

+449
-5
lines changed

8 files changed

+449
-5
lines changed

omlx/admin/accuracy_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
_current_model: Optional[str] = None
3333
_engine_pool_ref: Any = None
3434

35-
VALID_BENCHMARKS = ["mmlu", "hellaswag", "truthfulqa", "gsm8k", "livecodebench"]
35+
VALID_BENCHMARKS = ["mmlu", "hellaswag", "truthfulqa", "gsm8k", "humaneval", "livecodebench"]
3636

3737

3838
class AccuracyBenchmarkRequest(BaseModel):

omlx/admin/static/js/dashboard.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,13 +297,14 @@
297297

298298
// Accuracy benchmark state
299299
accModelId: '',
300-
accBenchmarks: { mmlu: true, hellaswag: true, truthfulqa: true, gsm8k: false, livecodebench: false },
301-
accSampleSizes: { mmlu: 300, hellaswag: 200, truthfulqa: 200, gsm8k: 100, livecodebench: 100 },
300+
accBenchmarks: { mmlu: true, hellaswag: false, truthfulqa: true, gsm8k: false, humaneval: true, livecodebench: false },
301+
accSampleSizes: { mmlu: 1000, hellaswag: 200, truthfulqa: 0, gsm8k: 100, humaneval: 0, livecodebench: 100 },
302302
accBenchmarkList: [
303303
{ key: 'mmlu', label: 'MMLU', desc: 'Knowledge · 57 subjects', fullSize: 14042, sizes: [30, 50, 100, 200, 300, 500, 1000, 2000] },
304304
{ key: 'hellaswag', label: 'HellaSwag', desc: 'Commonsense reasoning', fullSize: 10042, sizes: [30, 50, 100, 200, 300, 500, 1000, 2000] },
305305
{ key: 'truthfulqa', label: 'TruthfulQA', desc: 'Truthfulness', fullSize: 817, sizes: [30, 50, 100, 200, 300] },
306306
{ key: 'gsm8k', label: 'GSM8K', desc: 'Math reasoning', fullSize: 1319, sizes: [30, 50, 100, 200, 300] },
307+
{ key: 'humaneval', label: 'HumanEval', desc: 'Function completion', fullSize: 164, sizes: [30, 50, 100] },
307308
{ key: 'livecodebench', label: 'LiveCodeBench', desc: 'Code generation', fullSize: 1055, sizes: [30, 50, 100, 200, 300] },
308309
],
309310
accBatchSize: 1,

omlx/admin/templates/dashboard/_bench_accuracy.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('acc_bench.h
6666
<option value="0" x-text="'Full (' + b.fullSize.toLocaleString() + ')'"></option>
6767
</select>
6868
<!-- Code exec warning badge -->
69-
<template x-if="b.key === 'livecodebench'">
69+
<template x-if="b.key === 'livecodebench' || b.key === 'humaneval'">
7070
<div class="mt-2">
7171
<span class="inline-flex items-center gap-1 px-1.5 py-0.5 text-[10px] font-medium rounded"
7272
:class="accBenchmarks[b.key] ? 'bg-amber-400/20 text-amber-200' : 'bg-amber-50 text-amber-500'"

omlx/eval/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .base import BaseBenchmark, BenchmarkResult, QuestionResult
99
from .gsm8k import GSM8KBenchmark
1010
from .hellaswag import HellaSwagBenchmark
11+
from .humaneval import HumanEvalBenchmark
1112
from .livecodebench import LiveCodeBenchBenchmark
1213
from .mmlu import MMLUBenchmark
1314
from .truthfulqa import TruthfulQABenchmark
@@ -17,6 +18,7 @@
1718
"hellaswag": HellaSwagBenchmark,
1819
"truthfulqa": TruthfulQABenchmark,
1920
"gsm8k": GSM8KBenchmark,
21+
"humaneval": HumanEvalBenchmark,
2022
"livecodebench": LiveCodeBenchBenchmark,
2123
}
2224

@@ -29,5 +31,6 @@
2931
"HellaSwagBenchmark",
3032
"TruthfulQABenchmark",
3133
"GSM8KBenchmark",
34+
"HumanEvalBenchmark",
3235
"LiveCodeBenchBenchmark",
3336
]

omlx/eval/data/humaneval.jsonl

Lines changed: 164 additions & 0 deletions
Large diffs are not rendered by default.

omlx/eval/humaneval.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
"""HumanEval benchmark.
3+
4+
Tests code generation ability using function completion problems.
5+
Model receives a function signature + docstring and must complete the body.
6+
Verification: generated code + unit tests run in sandboxed subprocess.
7+
Dataset bundled from openai/openai_humaneval on HuggingFace (164 problems).
8+
9+
SECURITY NOTE: This benchmark executes model-generated code on the local
10+
machine. Mitigations: subprocess with timeout, memory limits, temp file cleanup.
11+
"""
12+
13+
import asyncio
14+
import json
15+
import logging
16+
import os
17+
import re
18+
import resource
19+
import subprocess
20+
import tempfile
21+
import time
22+
from pathlib import Path
23+
from typing import Any, Callable, Optional
24+
25+
from .base import BaseBenchmark, BenchmarkResult, QuestionResult
26+
from .datasets import deterministic_sample, load_jsonl
27+
28+
logger = logging.getLogger(__name__)
29+
30+
DATA_DIR = Path(__file__).parent / "data"
31+
32+
EXEC_TIMEOUT_SECONDS = 15
33+
EXEC_MEMORY_LIMIT_BYTES = 256 * 1024 * 1024 # 256 MB
34+
35+
36+
def _extract_code(response: str, prompt: str) -> str:
37+
"""Extract the function body from model response.
38+
39+
The model may return the full function (including signature) or just the body.
40+
We need to combine it with the original prompt to form a complete function.
41+
"""
42+
response = response.strip()
43+
44+
# If response contains a code block, extract it
45+
match = re.search(r"```python\s*\n(.*?)```", response, re.DOTALL)
46+
if match:
47+
code = match.group(1).strip()
48+
# If the code block contains the function def, use it standalone
49+
if "def " in code:
50+
return code
51+
# Otherwise it's just the body, combine with prompt
52+
return prompt + code
53+
54+
match = re.search(r"```\s*\n(.*?)```", response, re.DOTALL)
55+
if match:
56+
code = match.group(1).strip()
57+
if "def " in code:
58+
return code
59+
return prompt + code
60+
61+
# No code block — response is the continuation of the prompt
62+
# Check if response starts with the function def (model repeated the signature)
63+
if response.startswith("def ") or response.startswith("from ") or response.startswith("import "):
64+
return response
65+
66+
# Response is just the function body — combine with prompt
67+
return prompt + response
68+
69+
70+
def _set_resource_limits():
71+
"""Set resource limits for subprocess."""
72+
try:
73+
resource.setrlimit(resource.RLIMIT_AS, (EXEC_MEMORY_LIMIT_BYTES, EXEC_MEMORY_LIMIT_BYTES))
74+
except (ValueError, resource.error):
75+
pass
76+
try:
77+
resource.setrlimit(resource.RLIMIT_CPU, (EXEC_TIMEOUT_SECONDS + 5, EXEC_TIMEOUT_SECONDS + 5))
78+
except (ValueError, resource.error):
79+
pass
80+
81+
82+
def _execute_with_tests(code: str, test_code: str, entry_point: str) -> tuple[bool, str]:
83+
"""Execute generated code with test cases.
84+
85+
Combines the generated function with test assertions and runs in subprocess.
86+
87+
Returns:
88+
(passed, error_message)
89+
"""
90+
# Build the complete test script
91+
script = f"""{code}
92+
93+
{test_code}
94+
95+
check({entry_point})
96+
"""
97+
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
98+
f.write(script)
99+
tmp_path = f.name
100+
101+
try:
102+
result = subprocess.run(
103+
["python3", tmp_path],
104+
capture_output=True,
105+
text=True,
106+
timeout=EXEC_TIMEOUT_SECONDS,
107+
preexec_fn=_set_resource_limits,
108+
env={
109+
"PATH": os.environ.get("PATH", "/usr/bin:/usr/local/bin"),
110+
"HOME": os.environ.get("HOME", "/tmp"),
111+
"LANG": "en_US.UTF-8",
112+
},
113+
)
114+
if result.returncode == 0:
115+
return True, ""
116+
else:
117+
return False, result.stderr[:500]
118+
except subprocess.TimeoutExpired:
119+
return False, "Execution timed out"
120+
except Exception as e:
121+
return False, str(e)[:500]
122+
finally:
123+
try:
124+
os.unlink(tmp_path)
125+
except OSError:
126+
pass
127+
128+
129+
class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval: function completion graded by executing the official unit tests.

    Scoring is pass@1: a problem is correct only when the generated function
    passes every assertion in its bundled ``check`` harness.
    """

    name = "humaneval"
    quick_size = 100  # default sample size for "quick" runs

    async def load_dataset(self, sample_size: int = 0) -> list[dict]:
        """Load HumanEval problems from the bundled JSONL file.

        Args:
            sample_size: number of problems to keep (deterministic sample);
                0 returns the full set.
        """
        items = load_jsonl(DATA_DIR / "humaneval.jsonl")

        normalized = [
            {
                "id": item["task_id"],
                "prompt": item["prompt"],
                "test": item["test"],
                "entry_point": item["entry_point"],
                "question": item["prompt"],  # for get_question_text
            }
            for item in items
        ]

        logger.info("HumanEval: loaded %d problems", len(normalized))

        if sample_size == 0:
            return normalized

        return deterministic_sample(normalized, sample_size)

    def get_max_tokens(self) -> int:
        # Function bodies are short; 512 tokens is ample for HumanEval.
        return 512

    def format_prompt(self, item: dict) -> list[dict[str, str]]:
        """Build a single-turn chat prompt asking for the completed function."""
        prompt = item["prompt"]
        content = (
            "Complete the following Python function. "
            "Provide only the complete function implementation, no explanations.\n\n"
            f"{prompt}"
        )
        return [{"role": "user", "content": content}]

    def extract_answer(self, response: str, item: dict) -> str:
        """Extract a standalone function definition from the model response."""
        return _extract_code(response, item["prompt"])

    def check_answer(self, predicted: str, item: dict) -> bool:
        """Run the generated code against the problem's unit tests."""
        if not predicted.strip():
            return False

        passed, error = _execute_with_tests(
            predicted, item["test"], item["entry_point"]
        )
        return passed

    async def run(
        self,
        engine: Any,
        items: list[dict],
        on_progress: Optional[Callable[[int, int], Any]] = None,
        batch_size: int = 1,
        sampling_kwargs: Optional[dict] = None,
    ) -> BenchmarkResult:
        """Override run: generation is batched, code execution is sequential.

        Generations within a batch are dispatched concurrently; the
        sandboxed test execution then runs one problem at a time
        (``preexec_fn`` subprocess spawning is not safe to parallelize).

        Args:
            engine: inference engine passed through to ``_eval_single``.
            items: normalized problems from :meth:`load_dataset`.
            on_progress: optional async callback ``(completed, total)``.
            batch_size: concurrent generations per batch; clamped to >= 1.
            sampling_kwargs: extra sampling parameters for generation.
        """
        results: list[QuestionResult] = []
        correct = 0
        start_time = time.time()
        completed = 0
        # A zero/negative batch size would make range() raise ValueError.
        batch_size = max(1, batch_size)

        for batch_start in range(0, len(items), batch_size):
            batch_end = min(batch_start + batch_size, len(items))
            batch = items[batch_start:batch_end]
            batch_time = time.time()

            gen_tasks = [
                self._eval_single(engine, item, batch_start + j, sampling_kwargs)
                for j, item in enumerate(batch)
            ]
            gen_results = await asyncio.gather(*gen_tasks)
            gen_elapsed = time.time() - batch_time

            # gather() preserves order; sort defensively by index anyway.
            for idx, item, response_text, prompt_text in sorted(gen_results, key=lambda x: x[0]):
                code = self.extract_answer(response_text, item)
                is_correct = self.check_answer(code, item)

                if is_correct:
                    correct += 1

                results.append(
                    QuestionResult(
                        question_id=str(item.get("id", idx)),
                        correct=is_correct,
                        expected="(unit tests)",
                        # Truncate long generations for compact result payloads.
                        predicted=(code[:200] + "...") if len(code) > 200 else code,
                        # Per-question time is the batch's generation time split
                        # evenly; code-execution time is not included here.
                        time_seconds=gen_elapsed / len(batch),
                        question_text=prompt_text,
                        raw_response=response_text,
                    )
                )

            completed += len(batch)
            if on_progress:
                await on_progress(completed, len(items))

        total_time = time.time() - start_time
        total = len(items)

        return BenchmarkResult(
            benchmark_name=self.name,
            accuracy=correct / total if total > 0 else 0.0,
            total_questions=total,
            correct_count=correct,
            time_seconds=total_time,
            question_results=results,
        )

tests/test_accuracy_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def test_all_valid_benchmarks(self):
5858
model_id="test-model",
5959
benchmarks={b: 100 for b in VALID_BENCHMARKS},
6060
)
61-
assert len(req.benchmarks) == 5
61+
assert len(req.benchmarks) == 6
6262

6363

6464
class TestQueueAndResults:

tests/test_eval.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,40 @@ def test_extract_code_empty(self):
202202
assert code == ""
203203

204204

205+
# --- HumanEval Tests ---
206+
207+
208+
class TestHumanEval:
    """Unit tests for the HumanEval extraction and sandboxed execution helpers."""

    def test_extract_code_with_block(self):
        from omlx.eval.humaneval import _extract_code

        completed = _extract_code(
            "```python\ndef add(a, b):\n    return a + b\n```",
            "def add(a, b):\n    ",
        )
        assert "return a + b" in completed

    def test_extract_code_body_only(self):
        from omlx.eval.humaneval import _extract_code

        completed = _extract_code("return a + b", "def add(a, b):\n    ")
        # Body-only responses must be recombined with the prompt's signature.
        for fragment in ("def add(a, b):", "return a + b"):
            assert fragment in completed

    def test_execute_with_tests(self):
        from omlx.eval.humaneval import _execute_with_tests

        ok, _ = _execute_with_tests(
            "def add(a, b):\n    return a + b",
            "def check(candidate):\n    assert candidate(1, 2) == 3\n    assert candidate(0, 0) == 0",
            "add",
        )
        assert ok is True

    def test_execute_with_tests_fail(self):
        from omlx.eval.humaneval import _execute_with_tests

        ok, _ = _execute_with_tests(
            "def add(a, b):\n    return a - b",  # deliberately wrong
            "def check(candidate):\n    assert candidate(1, 2) == 3",
            "add",
        )
        assert ok is False
237+
238+
205239
# --- Think Tag Stripping Tests ---
206240

207241

0 commit comments

Comments
 (0)