Skip to content

Commit 2c0d7bd

Browse files
wasiahmadKipok
andauthored
Evaluation on LiveCodeBench (#517)
Signed-off-by: Igor Gitman <igitman@nvidia.com> Co-authored-by: Igor Gitman <igitman@nvidia.com>
1 parent 1bcaceb commit 2c0d7bd

13 files changed

Lines changed: 407 additions & 10 deletions

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,6 @@ __pycache__
3535
cluster_configs/*
3636
!cluster_configs/example-*.yaml
3737

38-
nemo_skills/dataset/ruler/*/
38+
nemo_skills/dataset/ruler/*/
39+
.idea/
40+
.idea/*
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Default evaluation settings for this benchmark; every value can be overridden from the command line.
PROMPT_CONFIG = "eval/livecodebench/python_codegen"
DATASET_GROUP = "code"
METRICS_TYPE = "livecodebench"
EVAL_SPLIT = "test_v5_2408_2502"
EVAL_ARGS = "++eval_type=livecodebench ++eval_config.dataset=livecodebench"
GENERATION_ARGS = ""
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import argparse
16+
import json
17+
import os
18+
from datetime import datetime
19+
from pathlib import Path
20+
21+
from datasets import load_dataset
22+
from dateutil.relativedelta import relativedelta
23+
24+
25+
class PromptConstants:
    """Instruction snippets appended to a problem statement when building the prompt.

    Reference:
    https://github.com/QwenLM/Qwen2.5-Coder/blob/main/qwencoder-eval/reasoning/livecode_bench_cot/lcb_runner_cq/prompts/code_generation.py#L31
    """

    # Used when the problem ships starter code that the solution must build on.
    FORMATTING_MESSAGE_WITH_STARTER_CODE = (
        "You will use the following starter code to write the solution to the problem "
        "and enclose your code within delimiters."
    )

    # Used when the problem has no starter code (stdin/stdout style solution).
    FORMATTING_WITHOUT_STARTER_CODE = (
        "Read the inputs from stdin solve the problem and write the answer to stdout "
        "(do not directly test on the sample inputs). "
        "Enclose your code within delimiters as follows. "
        "Ensure that when the python program runs, it reads the inputs, "
        "runs the algorithm and writes output to STDOUT."
    )
29+
30+
31+
def parse_data(release_version='release_latest'):
    """Load the ``test`` split of ``livecodebench/code_generation_lite``.

    Args:
        release_version: value forwarded to ``load_dataset`` as ``version_tag``.

    Returns:
        Whatever ``load_dataset`` returns. According to the original notes, each
        record carries the following fields:

        - question_title: str
        - question_content: str
        - platform: Platform
        - question_id: str
        - contest_id: str
        - contest_date: datetime
        - starter_code: str
        - difficulty: Difficulty
        - public_test_cases: list[Test]
        - private_test_cases: list[Test]
        - metadata: dict
    """
    return load_dataset(
        "livecodebench/code_generation_lite",
        split="test",
        version_tag=release_version,
        trust_remote_code=True,
    )
48+
49+
50+
def get_first_last_day(year_month_str):
    """Return the first and last calendar day of the month named by ``year_month_str``.

    Args:
        year_month_str: month written as ``YYYY-MM``, e.g. ``"2024-02"``.

    Returns:
        Tuple ``(first_day, last_day)`` of ``datetime.date`` objects.

    Raises:
        ValueError: if ``year_month_str`` cannot be parsed as ``YYYY-MM``. The
            underlying parse error is attached as ``__cause__``.
    """
    # Function-scope import: keeps this helper dependent on the standard library only.
    import calendar

    try:
        # Only the parse can fail on malformed input, so nothing else lives in the try body.
        date_obj = datetime.strptime(year_month_str, "%Y-%m")
    except ValueError as err:
        raise ValueError(
            f"Invalid date {year_month_str!r}. Please use the 'YYYY-MM' format (e.g. '2024-08')."
        ) from err

    # strptime defaults the missing day to 1, so the parsed date already is the first day.
    first_day = date_obj.date()
    # monthrange returns (weekday of the 1st, number of days in the month).
    days_in_month = calendar.monthrange(first_day.year, first_day.month)[1]
    return first_day, first_day.replace(day=days_in_month)
58+
59+
60+
def parse_month_range(start_date, end_date):
    """Turn two ``YYYY-MM`` strings into an inclusive date range.

    Args:
        start_date: first month of the range.
        end_date: last month of the range.

    Returns:
        Tuple ``(first day of the start month, last day of the end month)``.

    Raises:
        ValueError: propagated as-is from ``get_first_last_day`` when either
            argument is malformed. Re-wrapping it in another ``ValueError``
            added no information, so the original exception is surfaced.
    """
    range_start, _ = get_first_last_day(start_date)
    _, range_end = get_first_last_day(end_date)
    return range_start, range_end
67+
68+
69+
def clean_data(dataset):
    """Attach a ready-to-use prompt to every record and drop the raw columns.

    Args:
        dataset: object exposing ``map(fn, remove_columns=...)``
            (presumably the dataset returned by ``parse_data`` - confirm with caller).

    Returns:
        The result of ``dataset.map``. Each record gains ``task_id`` (copied from
        ``question_id``) and ``question`` (problem text plus formatting instructions);
        the columns listed in ``columns_to_drop`` are removed.
    """
    columns_to_drop = [
        'question_title',
        'contest_id',
        'public_test_cases',
        'private_test_cases',
        'metadata',
        'question_content',
        'platform',
        'question_id',
        'starter_code',
    ]

    def build_prompt(data):
        # Assemble the prompt piece by piece, then join once at the end.
        pieces = [data["question_content"], "\n\n"]
        starter_code = data["starter_code"]
        if starter_code:
            pieces.append(f"{PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n")
            pieces.append(f"```python\n{starter_code}\n```\n\n")
        else:
            pieces.append(f"{PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n\n")
            pieces.append("```python\n# YOUR CODE HERE\n```\n\n")

        data["task_id"] = data["question_id"]
        # NOTE(review): as rendered here every single space becomes a tab; the page scrape may
        # have collapsed a multi-space literal - confirm against the repository before merging.
        data['question'] = "".join(pieces).replace(' ', '\t')
        return data

    return dataset.map(build_prompt, remove_columns=columns_to_drop)
96+
97+
98+
def prepare(start_date, end_date, release_version, output_dir):
    """Write one date-filtered LiveCodeBench split to ``output_dir`` as JSONL.

    The output file is named ``test_{release_version}_{start_yymm}_{end_yymm}.jsonl``
    and holds one JSON object per problem whose ``contest_date`` lies inside the
    inclusive month range.

    Args:
        start_date: first month of the range, ``YYYY-MM``.
        end_date: last month of the range, ``YYYY-MM``.
        release_version: one of ``v1`` ... ``v6``.
        output_dir: directory for the output file; created if missing.

    Raises:
        ValueError: if ``release_version`` is not supported or a date is malformed.
    """
    supported_versions = ("v1", "v2", "v3", "v4", "v5", "v6")
    # Validate before any work is done, and with a real exception: asserts vanish under ``python -O``.
    if release_version not in supported_versions:
        raise ValueError(
            f"Unsupported release_version {release_version!r}; expected one of {', '.join(supported_versions)}."
        )

    start_date, end_date = parse_month_range(start_date, end_date)
    start_yymm = start_date.strftime("%y%m")
    end_yymm = end_date.strftime("%y%m")
    output_file_path = os.path.join(output_dir, f"test_{release_version}_{start_yymm}_{end_yymm}.jsonl")

    data = parse_data(release_version=f"release_{release_version}")
    data = clean_data(data)
    # This is the size of the whole release, before the date filter below is applied.
    print("Len of data: ", len(data))

    print("Writing to file...")
    os.makedirs(output_dir, exist_ok=True)

    with open(output_file_path, 'w', encoding="utf-8") as f:
        for problem in data:
            input_date = datetime.strptime(problem['contest_date'], '%Y-%m-%dT%H:%M:%S').date()
            if not (start_date <= input_date <= end_date):
                continue
            record = {
                "task_id": problem["task_id"],
                "question": problem["question"],
                "difficulty": problem["difficulty"],
                # Difficulty doubles as the subset key used when reporting metrics.
                "subset_for_metrics": problem["difficulty"],
            }
            f.write(json.dumps(record) + '\n')
128+
129+
130+
# (release_version, start month, end month) triples prepared when no custom split is requested.
DEFAULT_SPLITS = [
    (version, first_month, last_month)
    for version in ('v5', 'v6')
    for first_month, last_month in (
        ('2024-08', '2025-02'),
        ('2024-10', '2025-02'),
        ('2024-10', '2025-04'),
    )
]
138+
139+
140+
if __name__ == '__main__':
    # Command-line entry point. With no selectors given, every split in DEFAULT_SPLITS is
    # prepared; otherwise exactly one custom split is prepared and all three selectors
    # (--release_version, --start_date, --end_date) must be provided.
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, default=str(Path(__file__).parent))
    parser.add_argument('--release_version', type=str, default='all')
    parser.add_argument('--start_date', type=str, default='all', help="Start date in YYYY-MM format")
    parser.add_argument('--end_date', type=str, default='all', help="End date in YYYY-MM format")

    args = parser.parse_args()

    selectors = (args.release_version, args.start_date, args.end_date)
    if all(value == 'all' for value in selectors):
        # Prepare all splits
        for release_version, start_date, end_date in DEFAULT_SPLITS:
            print(f"Processing data for {release_version} from {start_date} to {end_date}")
            prepare(start_date, end_date, release_version, args.output_dir)
    elif any(value == 'all' for value in selectors):
        # A partially specified custom split is ambiguous, so refuse it.
        raise ValueError(
            "If preparing a custom split, you must specify all "
            "--release_version, --start_date, and --end_date arguments."
        )
    else:
        prepare(args.start_date, args.end_date, args.release_version, args.output_dir)
162+
163+
# test_v5_2408_2502.jsonl: 279 samples
164+
# test_v5_2410_2502.jsonl: 166 samples
165+
# test_v5_2410_2504.jsonl: 166 samples
166+
# test_v6_2408_2502.jsonl: 374 samples
167+
# test_v6_2410_2502.jsonl: 261 samples
168+
# test_v6_2410_2504.jsonl: 341 samples
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import json
17+
import logging
18+
import shutil
19+
import subprocess
20+
import sys
21+
22+
from nemo_skills.evaluation.code_utils import preprocess_code
23+
from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files
24+
25+
LOG = logging.getLogger(get_logger_name(__file__))
26+
27+
28+
def install_from_git(git_url):
    """Best-effort ``pip install`` of ``git_url`` into the running interpreter.

    Failures are logged rather than raised; the caller decides what to do by
    retrying the import.

    Args:
        git_url: requirement specifier handed to ``pip install``
            (e.g. ``git+https://...``).

    Returns:
        ``True`` if pip exited successfully, ``False`` otherwise.
    """
    # Function-scope import so the block brings its own dependency into scope.
    import importlib

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", git_url])
    except subprocess.CalledProcessError as e:
        LOG.error("Error during installation: %s", e)
        return False

    # The package was installed while this process is running: drop the import system's
    # cached directory listings so a following ``import`` can see the new package.
    importlib.invalidate_caches()
    LOG.info("Package installed successfully!")
    return True
34+
35+
36+
# TODO: use sandbox
37+
@nested_dataclass(kw_only=True)
class LiveCodeBenchEvaluatorConfig:
    """Options that ``eval_livecodebench`` builds from ``cfg.eval_config``."""

    # NOTE(review): not referenced by eval_livecodebench as shown in this file - confirm where it is used.
    dataset: str = "livecodebench"
    # eval_livecodebench accepts only "python" or "cpp" here.
    language: str = "python"  # "cpp" is another option now
    # Passed to the evaluator as f"release_{release_version}".
    release_version: str = "v5"
    # Must be set when language is "cpp"; for "python" the evaluator is always given None.
    test_file: str = None
43+
44+
45+
def eval_livecodebench(cfg):
    """Grade generated solutions with the ``livecodebench`` package, updating files in place.

    For every file yielded by ``unroll_files(cfg.input_files)``:

    1. each JSON line is passed through ``preprocess_code`` and extended with the
       ``question_id`` / ``code_list`` keys, then the file is rewritten;
    2. ``livecodebench.evaluate.evaluate`` is run on the file;
    3. ``graded_list`` from the produced ``*_eval_results.json`` is copied into each
       sample and the file is rewritten again;
    4. the results file is renamed to ``*_eval_results-saved.json``.

    Args:
        cfg: object with ``eval_config`` (mapping of ``LiveCodeBenchEvaluatorConfig``
            fields) and ``input_files`` attributes.

    Raises:
        ImportError: if ``livecodebench`` is missing and could not be installed.
        ValueError: if the language is unsupported, or ``test_file`` is missing for cpp.
    """
    try:
        from livecodebench.evaluate import evaluate
    except ImportError:
        LOG.info("Package 'livecodebench' not found. Attempting to install...")
        install_from_git("git+https://github.com/wasiahmad/livecodebench.git")
        try:
            from livecodebench.evaluate import evaluate
        except ImportError:
            LOG.error("Failed to install 'livecodebench'. Please install it manually.")
            raise

    eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config)
    # Real exceptions instead of asserts: asserts are stripped under ``python -O``.
    if eval_config.language not in ("python", "cpp"):
        raise ValueError(f"Unsupported language {eval_config.language!r}; expected 'python' or 'cpp'.")
    is_python = eval_config.language == "python"
    if not is_python and eval_config.test_file is None:
        raise ValueError("eval_config.test_file must be set when language is 'cpp'.")

    for jsonl_file in unroll_files(cfg.input_files):
        # Read with the same encoding that is used for every write below.
        with open(jsonl_file, "rt", encoding="utf-8") as f:
            samples = [preprocess_code(json.loads(line), eval_config.language) for line in f]
        for sample in samples:
            sample["question_id"] = sample["task_id"]
            sample["code_list"] = [sample["completion"]]
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")

        # https://github.com/wasiahmad/livecodebench/blob/main/livecodebench/evaluate.py#L10
        evaluate(
            custom_output_file=jsonl_file,
            release_version=f"release_{eval_config.release_version}",
            k_list=[1],
            language=eval_config.language,
            test_file=None if is_python else eval_config.test_file,
            num_process_evaluate=12,
            timeout=6 if is_python else 30,
        )

        # [:-6] drops the 6-character ".jsonl" suffix; the grades are read from the
        # sibling "<stem>_eval_results.json" file.
        file_stem = jsonl_file[:-6]
        results_path = file_stem + '_eval_results.json'
        with open(results_path, 'rt', encoding="utf-8") as fin:
            eval_grades = json.load(fin)
        # adding graded_list key to allow compute_metrics to work
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                sample['graded_list'] = eval_grades['eval'][sample['task_id']]['graded_list']
                f.write(json.dumps(sample) + "\n")

        # moving eval file to ensure metrics are recomputed
        shutil.move(results_path, file_stem + '_eval_results-saved.json')

nemo_skills/evaluation/code_utils.py

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,58 @@
1-
def preprocess_code(generation_dict: dict):
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import re
16+
17+
18+
def preprocess_code(generation_dict: dict, language="python"):
219
completion = generation_dict['generation']
320
completion = completion.strip()
421
completion = completion.replace("\r", "")
5-
if '```' in completion:
6-
if '```python' in completion:
7-
def_line = completion.index('```python') + len('```python')
22+
23+
##### To handle code generation by reasoning models
24+
# check for <think> and </think> tags
25+
if "<think>" in completion:
26+
if "</think>" in completion:
27+
# thinking trace completed, solution is after the trace
28+
match = re.search(r"</think>\s*(.*)", completion, re.DOTALL)
29+
completion = match.group(1).strip() if match else None
830
else:
9-
def_line = completion.index('```') + len('```')
31+
completion = None
32+
33+
if completion is None:
34+
generation_dict["completion"] = "" # no valid solution generated
35+
return generation_dict
36+
#####
37+
38+
start_with_lang_tag = f'```{language}'
39+
generic_start_end_tag = f'```'
40+
41+
if start_with_lang_tag in completion:
42+
def_line = completion.index(start_with_lang_tag) + len(start_with_lang_tag)
43+
completion = completion[def_line:].strip()
44+
try:
45+
next_line = completion.index(generic_start_end_tag)
46+
completion = completion[:next_line].strip()
47+
except:
48+
print(completion)
49+
print("================\n")
50+
51+
elif generic_start_end_tag in completion:
52+
def_line = completion.index(generic_start_end_tag) + len(generic_start_end_tag)
1053
completion = completion[def_line:].strip()
11-
completion = completion.replace('```python', '')
1254
try:
13-
next_line = completion.index('```')
55+
next_line = completion.index(generic_start_end_tag)
1456
completion = completion[:next_line].strip()
1557
except:
1658
print(completion)

nemo_skills/evaluation/evaluator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from tqdm import tqdm
2727

2828
from nemo_skills.code_execution.sandbox import get_sandbox
29+
from nemo_skills.evaluation.code_evaluators.livecodebench import eval_livecodebench
2930
from nemo_skills.evaluation.constants import JUDGE_MODEL
3031
from nemo_skills.evaluation.math_grader import batch_evaluate_results, extract_answer
3132
from nemo_skills.inference.server.model import get_model
@@ -34,6 +35,7 @@
3435

3536
LOG = logging.getLogger(get_logger_name(__file__))
3637

38+
3739
# TODO: split into multiple files
3840

3941

@@ -484,6 +486,7 @@ def string_match_part_single(preds, refs):
484486
'lean4-statement': eval_lean4_statement,
485487
'multichoice': eval_mcq,
486488
'ruler': eval_ruler,
489+
'livecodebench': eval_livecodebench,
487490
}
488491

489492

nemo_skills/evaluation/metrics/code_metrics.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,14 @@ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
2727
def update(self, predictions):
2828
super().update(predictions)
2929
self._compute_pass_at_k(predictions=predictions)
30+
31+
32+
class LiveCodeBenchMetrics(BaseMetrics):
    """Metrics for predictions that carry a ``graded_list`` entry."""

    def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
        """Report the first entry of ``prediction['graded_list']`` as ``accuracy``."""
        first_grade = prediction['graded_list'][0]
        return {"accuracy": first_grade}

    def update(self, predictions):
        """Run the base-class update, then ``_compute_pass_at_k`` on the same predictions."""
        super().update(predictions)
        self._compute_pass_at_k(predictions=predictions)

0 commit comments

Comments
 (0)