NVIDIA-NeMo
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎nemo_skills/dataset/livecodebench/__init__.py‎
Lines changed: 21 additions & 0 deletions b/‎nemo_skills/dataset/livecodebench/__init__.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎nemo_skills/dataset/livecodebench/prepare.py‎
Lines changed: 168 additions & 0 deletions b/‎nemo_skills/dataset/livecodebench/prepare.py‎
Lines changed: 168 additions & 0 deletions
diff --git a/‎nemo_skills/evaluation/code_evaluators/livecodebench.py‎
Lines changed: 92 additions & 0 deletions b/‎nemo_skills/evaluation/code_evaluators/livecodebench.py‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎nemo_skills/evaluation/code_utils.py‎
Lines changed: 49 additions & 7 deletions b/‎nemo_skills/evaluation/code_utils.py‎
Lines changed: 49 additions & 7 deletions
diff --git a/‎nemo_skills/evaluation/evaluator.py‎
Lines changed: 3 additions & 0 deletions b/‎nemo_skills/evaluation/evaluator.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎nemo_skills/evaluation/metrics/code_metrics.py‎
Lines changed: 11 additions & 0 deletions b/‎nemo_skills/evaluation/metrics/code_metrics.py‎
Lines changed: 11 additions & 0 deletions
@@ -35,4 +35,6 @@ __pycache__
 cluster_configs/*
 !cluster_configs/example-*.yaml
 
-nemo_skills/dataset/ruler/*/
+nemo_skills/dataset/ruler/*/
+.idea/
+.idea/*
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# settings that define how evaluation should be done by default (all can be changed from cmdline)
+# NOTE(review): presumably a prompt-config path resolved by the generation pipeline - confirm.
+PROMPT_CONFIG = 'eval/livecodebench/python_codegen'
+DATASET_GROUP = 'code'
+# NOTE(review): presumably selects LiveCodeBenchMetrics in code_metrics.py - confirm against the metrics registry.
+METRICS_TYPE = 'livecodebench'
+# Matches the stem of the file prepare.py writes for the ('v5', '2024-08', '2025-02') default split.
+EVAL_SPLIT = 'test_v5_2408_2502'
+# eval_type=livecodebench is the key registered for eval_livecodebench in evaluator.py.
+# The evaluator config defaults to release_version "v5"; when pointing EVAL_SPLIT at a v6 file,
+# eval_config.release_version presumably has to be overridden as well - TODO confirm.
+EVAL_ARGS = "++eval_type=livecodebench ++eval_config.dataset=livecodebench"
+GENERATION_ARGS = ""
@@ -0,0 +1,168 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+from datasets import load_dataset
+from dateutil.relativedelta import relativedelta
+
+
+class PromptConstants:
+    # Instruction sentences that clean_data() appends after the problem statement when building a prompt.
+    # reference: https://github.com/QwenLM/Qwen2.5-Coder/blob/main/qwencoder-eval/reasoning/livecode_bench_cot/lcb_runner_cq/prompts/code_generation.py#L31
+    # Used when the problem row has non-empty starter code (followed by that code in a python fence).
+    FORMATTING_MESSAGE_WITH_STARTER_CODE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
+    # Used when the problem row has no starter code (followed by a "# YOUR CODE HERE" python fence).
+    FORMATTING_WITHOUT_STARTER_CODE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the python program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
+
+
+def parse_data(release_version='release_latest'):
+    """Load the ``test`` split of ``livecodebench/code_generation_lite``.
+
+    Args:
+        release_version: value forwarded to ``load_dataset`` as ``version_tag``.
+
+    Returns:
+        The object returned by ``load_dataset``. According to the field list kept
+        with this function, each row has the following fields:
+            question_title: str
+            question_content: str
+            platform: Platform
+            question_id: str
+            contest_id: str
+            contest_date: datetime
+            starter_code: str
+            difficulty: Difficulty
+            public_test_cases: list[Test]
+            private_test_cases: list[Test]
+            metadata: dict
+    """
+    load_options = {
+        "split": "test",
+        "version_tag": release_version,
+        "trust_remote_code": True,
+    }
+    return load_dataset("livecodebench/code_generation_lite", **load_options)
+
+
+def get_first_last_day(year_month_str):
+    """Return the first and last calendar day of the month given as ``YYYY-MM``.
+
+    Args:
+        year_month_str: month in ``%Y-%m`` format, e.g. ``"2024-08"``.
+
+    Returns:
+        A ``(first_day, last_day)`` tuple of ``datetime.date`` objects.
+
+    Raises:
+        ValueError: if ``year_month_str`` does not match ``%Y-%m``. The parsing
+            error is attached as ``__cause__``.
+    """
+    # Only the parsing step is guarded, so that errors raised by the date arithmetic
+    # below are not misreported as a format problem.
+    try:
+        date_obj = datetime.strptime(year_month_str, "%Y-%m")
+    except ValueError as err:
+        raise ValueError("Invalid date format. Please use '%Y-%m'.") from err
+
+    first_day = date_obj.date().replace(day=1)
+    # First day of the next month minus one day is the last day of this month.
+    last_day = (date_obj + relativedelta(months=1, days=-1)).date()
+    return first_day, last_day
+
+
+def parse_month_range(start_date, end_date):
+    """Turn two ``YYYY-MM`` strings into an inclusive ``(first_day, last_day)`` date range.
+
+    Args:
+        start_date: month whose first day opens the range.
+        end_date: month whose last day closes the range.
+
+    Returns:
+        A tuple ``(first day of start_date, last day of end_date)``.
+
+    Raises:
+        ValueError: propagated unchanged from ``get_first_last_day`` when either
+            argument is not a valid ``%Y-%m`` string.
+    """
+    # No try/except here: re-wrapping the ValueError in a new ValueError with the same
+    # message added nothing and hid the original traceback.
+    range_start, _ = get_first_last_day(start_date)
+    _, range_end = get_first_last_day(end_date)
+    return range_start, range_end
+
+
+def clean_data(dataset):
+    """Attach ``task_id`` and a ready-to-use ``question`` prompt to every row.
+
+    The prompt is the problem statement, then a formatting instruction, then a python
+    code fence holding either the row's starter code or a ``# YOUR CODE HERE``
+    placeholder. Every run of four spaces in the finished prompt is replaced by a tab.
+
+    Args:
+        dataset: object exposing ``map(fn, remove_columns=...)``; rows are expected to
+            provide ``question_content``, ``starter_code`` and ``question_id``.
+
+    Returns:
+        Whatever ``dataset.map`` returns after the raw columns listed below are removed.
+    """
+    columns_to_drop = [
+        'question_title',
+        'contest_id',
+        'public_test_cases',
+        'private_test_cases',
+        'metadata',
+        'question_content',
+        'platform',
+        'question_id',
+        'starter_code',
+    ]
+
+    def build_prompt(row):
+        statement = row["question_content"]
+        starter_code = row["starter_code"]
+        if starter_code:
+            instruction = f"{PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
+            code_fence = f"```python\n{starter_code}\n```\n\n"
+        else:
+            instruction = f"{PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n\n"
+            code_fence = "```python\n# YOUR CODE HERE\n```\n\n"
+        prompt = "".join([statement, "\n\n", instruction, code_fence])
+
+        row["task_id"] = row["question_id"]
+        row['question'] = prompt.replace('    ', '\t')
+        return row
+
+    return dataset.map(build_prompt, remove_columns=columns_to_drop)
+
+
+def prepare(start_date, end_date, release_version, output_dir):
+    """Write one LiveCodeBench split as a JSONL file.
+
+    Loads the given release, builds the prompts and keeps only the problems whose
+    ``contest_date`` falls inside the inclusive month range. The output file is
+    ``<output_dir>/test_<release_version>_<start yymm>_<end yymm>.jsonl``.
+
+    Args:
+        start_date: first month of the range, ``YYYY-MM``.
+        end_date: last month of the range, ``YYYY-MM``.
+        release_version: one of ``v1`` ... ``v6`` (without the ``release_`` prefix).
+        output_dir: directory for the output file; created if missing.
+
+    Raises:
+        ValueError: if ``release_version`` is not supported or a date is malformed.
+    """
+    supported_versions = ("v1", "v2", "v3", "v4", "v5", "v6")
+    # Validate before any work is done; an assert would be stripped under ``python -O``.
+    if release_version not in supported_versions:
+        raise ValueError(
+            f"Unsupported release_version {release_version!r}; expected one of {', '.join(supported_versions)}."
+        )
+
+    start_date, end_date = parse_month_range(start_date, end_date)
+    start_yymm = start_date.strftime("%y%m")
+    end_yymm = end_date.strftime("%y%m")
+    output_file_path = os.path.join(output_dir, f"test_{release_version}_{start_yymm}_{end_yymm}.jsonl")
+
+    data = parse_data(release_version=f"release_{release_version}")
+    data = clean_data(data)
+    # This is the size of the whole release, before the date filter is applied.
+    print("Len of data: ", len(data))
+
+    print("Writing to file...")
+    os.makedirs(output_dir, exist_ok=True)
+
+    num_written = 0
+    with open(output_file_path, 'w', encoding="utf-8") as f:
+        for problem in data:
+            input_date = datetime.strptime(problem['contest_date'], '%Y-%m-%dT%H:%M:%S').date()
+            if not (start_date <= input_date <= end_date):
+                continue
+            json.dump(
+                {
+                    "task_id": problem["task_id"],
+                    "question": problem["question"],
+                    "difficulty": problem["difficulty"],
+                    # difficulty doubles as the subset key written for metrics
+                    "subset_for_metrics": problem["difficulty"],
+                },
+                f,
+            )
+            f.write('\n')
+            num_written += 1
+
+    print(f"Wrote {num_written} samples to {output_file_path}")
+
+
+# (release_version, start month, end month) triples prepared when the script runs without arguments.
+DEFAULT_SPLITS = [
+    ('v5', '2024-08', '2025-02'),
+    ('v5', '2024-10', '2025-02'),
+    ('v5', '2024-10', '2025-04'),
+    ('v6', '2024-08', '2025-02'),
+    ('v6', '2024-10', '2025-02'),
+    ('v6', '2024-10', '2025-04'),
+]
+
+
+if __name__ == '__main__':
+    # CLI entry point: with no arguments every split in DEFAULT_SPLITS is prepared;
+    # a custom split requires --release_version, --start_date and --end_date together.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output_dir', type=str, default=str(Path(__file__).parent))
+    parser.add_argument('--release_version', type=str, default='all')
+    parser.add_argument('--start_date', type=str, default='all', help="Start date in YYYY-MM format")
+    parser.add_argument('--end_date', type=str, default='all', help="End date in YYYY-MM format")
+
+    args = parser.parse_args()
+
+    if args.release_version == 'all' and args.start_date == 'all' and args.end_date == 'all':
+        # Prepare all splits
+        for release_version, start_date, end_date in DEFAULT_SPLITS:
+            print(f"Processing data for {release_version} from {start_date} to {end_date}")
+            prepare(start_date, end_date, release_version, args.output_dir)
+    else:
+        # A partially specified custom split is rejected instead of silently mixing in defaults.
+        if args.release_version == 'all' or args.start_date == 'all' or args.end_date == 'all':
+            raise ValueError(
+                "If preparing a custom split, you must specify all "
+                "--release_version, --start_date, and --end_date arguments."
+            )
+        prepare(args.start_date, args.end_date, args.release_version, args.output_dir)
+
+    # Sample counts noted for the default splits:
+    # test_v5_2408_2502.jsonl: 279 samples
+    # test_v5_2410_2502.jsonl: 166 samples
+    # test_v5_2410_2504.jsonl: 166 samples
+    # test_v6_2408_2502.jsonl: 374 samples
+    # test_v6_2410_2502.jsonl: 261 samples
+    # test_v6_2410_2504.jsonl: 341 samples
@@ -0,0 +1,92 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import logging
+import shutil
+import subprocess
+import sys
+
+from nemo_skills.evaluation.code_utils import preprocess_code
+from nemo_skills.utils import get_logger_name, nested_dataclass, unroll_files
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+def install_from_git(git_url):
+    """Best-effort ``pip install`` of ``git_url`` into the running interpreter.
+
+    A failed installation is logged and not raised; the caller is expected to
+    retry its import and handle the ``ImportError`` itself.
+
+    Args:
+        git_url: requirement passed to ``pip install``, e.g. ``git+https://...``.
+    """
+    try:
+        # sys.executable keeps the install in the interpreter that is currently running.
+        subprocess.check_call([sys.executable, "-m", "pip", "install", git_url])
+    except subprocess.CalledProcessError as e:
+        # Reported through the module logger, like every other message in this file.
+        LOG.error("Error during installation: %s", e)
+    else:
+        LOG.info("Package installed successfully!")
+
+
+# TODO: use sandbox
+@nested_dataclass(kw_only=True)
+class LiveCodeBenchEvaluatorConfig:
+    # Options consumed by eval_livecodebench(); built from cfg.eval_config.
+    # NOTE(review): `dataset` is not read by eval_livecodebench in this file - confirm where it is used.
+    dataset: str = "livecodebench"
+    language: str = "python"  # "cpp" is another option now
+    # Passed to the evaluator as f"release_{release_version}".
+    release_version: str = "v5"
+    # Must be set when language == "cpp"; for "python" the evaluator is always given None.
+    test_file: str = None
+
+
+def eval_livecodebench(cfg):
+    """Grade LiveCodeBench generations in place using the ``livecodebench`` package.
+
+    For every input JSONL file: extract the code from each generation, rewrite the
+    file with the ``question_id`` / ``code_list`` fields the evaluator reads, run the
+    evaluator, then rewrite the file again with a ``graded_list`` entry per sample.
+    The evaluator's results file is renamed afterwards.
+
+    Args:
+        cfg: object with ``eval_config`` (mapping of ``LiveCodeBenchEvaluatorConfig``
+            fields) and ``input_files`` (expanded through ``unroll_files``).
+
+    Raises:
+        ImportError: if ``livecodebench`` is missing and cannot be installed.
+        ValueError: if the language is unsupported, or ``cpp`` is requested without
+            a ``test_file``.
+    """
+    try:
+        from livecodebench.evaluate import evaluate
+    except ImportError:
+        LOG.info("Package 'livecodebench' not found. Attempting to install...")
+        install_from_git("git+https://github.com/wasiahmad/livecodebench.git")
+        try:
+            from livecodebench.evaluate import evaluate
+        except ImportError:
+            LOG.error("Failed to install 'livecodebench'. Please install it manually.")
+            raise
+
+    eval_config = LiveCodeBenchEvaluatorConfig(_init_nested=True, **cfg.eval_config)
+    # Explicit checks instead of asserts, which are stripped under ``python -O``.
+    if eval_config.language not in ("python", "cpp"):
+        raise ValueError(f"Unsupported language {eval_config.language!r}; expected 'python' or 'cpp'.")
+    if eval_config.language == "cpp" and eval_config.test_file is None:
+        raise ValueError("eval_config.test_file must be set when language is 'cpp'.")
+
+    is_python = eval_config.language == "python"
+
+    for jsonl_file in unroll_files(cfg.input_files):
+        # Read with the same encoding that is used for the rewrites below.
+        with open(jsonl_file, "rt", encoding="utf-8") as f:
+            samples = [preprocess_code(json.loads(line), eval_config.language) for line in f]
+        for sample in samples:
+            sample["question_id"] = sample["task_id"]
+            sample["code_list"] = [sample["completion"]]
+        with open(jsonl_file, "wt", encoding="utf-8") as f:
+            for sample in samples:
+                f.write(json.dumps(sample) + "\n")
+
+        # https://github.com/wasiahmad/livecodebench/blob/main/livecodebench/evaluate.py#L10
+        evaluate(
+            custom_output_file=jsonl_file,
+            release_version=f"release_{eval_config.release_version}",
+            k_list=[1],
+            language=eval_config.language,
+            test_file=None if is_python else eval_config.test_file,
+            num_process_evaluate=12,
+            timeout=6 if is_python else 30,
+        )
+
+        # The results path drops the last 6 characters of the input path, i.e. it assumes
+        # the input file name ends in ".jsonl".
+        results_prefix = jsonl_file[:-6]
+        results_file = results_prefix + '_eval_results.json'
+        with open(results_file, 'rt', encoding="utf-8") as fin:
+            eval_grades = json.load(fin)
+        # adding graded_list key to allow compute_metrics to work
+        with open(jsonl_file, "wt", encoding="utf-8") as f:
+            for sample in samples:
+                sample['graded_list'] = eval_grades['eval'][sample['task_id']]['graded_list']
+                f.write(json.dumps(sample) + "\n")
+
+        # moving eval file to ensure metrics are recomputed
+        shutil.move(results_file, results_prefix + '_eval_results-saved.json')
@@ -1,16 +1,58 @@
-def preprocess_code(generation_dict: dict):
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def preprocess_code(generation_dict: dict, language="python"):
     completion = generation_dict['generation']
     completion = completion.strip()
     completion = completion.replace("\r", "")
-    if '```' in completion:
-        if '```python' in completion:
-            def_line = completion.index('```python') + len('```python')
+
+    ##### To handle code generation by reasoning models
+    # check for <think> and </think> tags
+    if "<think>" in completion:
+        if "</think>" in completion:
+            # thinking trace completed, solution is after the trace
+            match = re.search(r"</think>\s*(.*)", completion, re.DOTALL)
+            completion = match.group(1).strip() if match else None
         else:
-            def_line = completion.index('```') + len('```')
+            completion = None
+
+    if completion is None:
+        generation_dict["completion"] = ""  # no valid solution generated
+        return generation_dict
+    #####
+
+    start_with_lang_tag = f'```{language}'
+    generic_start_end_tag = f'```'
+
+    if start_with_lang_tag in completion:
+        def_line = completion.index(start_with_lang_tag) + len(start_with_lang_tag)
+        completion = completion[def_line:].strip()
+        try:
+            next_line = completion.index(generic_start_end_tag)
+            completion = completion[:next_line].strip()
+        except:
+            print(completion)
+            print("================\n")
+
+    elif generic_start_end_tag in completion:
+        def_line = completion.index(generic_start_end_tag) + len(generic_start_end_tag)
         completion = completion[def_line:].strip()
-        completion = completion.replace('```python', '')
         try:
-            next_line = completion.index('```')
+            next_line = completion.index(generic_start_end_tag)
             completion = completion[:next_line].strip()
         except:
             print(completion)
 
@@ -26,6 +26,7 @@
 from tqdm import tqdm
 
 from nemo_skills.code_execution.sandbox import get_sandbox
+from nemo_skills.evaluation.code_evaluators.livecodebench import eval_livecodebench
 from nemo_skills.evaluation.constants import JUDGE_MODEL
 from nemo_skills.evaluation.math_grader import batch_evaluate_results, extract_answer
 from nemo_skills.inference.server.model import get_model
@@ -34,6 +35,7 @@
 
 LOG = logging.getLogger(get_logger_name(__file__))
 
+
 # TODO: split into multiple files
 
 
@@ -484,6 +486,7 @@ def string_match_part_single(preds, refs):
     'lean4-statement': eval_lean4_statement,
     'multichoice': eval_mcq,
     'ruler': eval_ruler,
+    'livecodebench': eval_livecodebench,
 }
 
 
 
@@ -27,3 +27,14 @@ def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
     def update(self, predictions):
         super().update(predictions)
         self._compute_pass_at_k(predictions=predictions)
+
+
+class LiveCodeBenchMetrics(BaseMetrics):
+    """Metrics for LiveCodeBench predictions that carry a ``graded_list`` entry."""
+
+    def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
+        """Report the first element of ``graded_list`` as the ``accuracy`` score."""
+        first_grade = prediction['graded_list'][0]
+        return {"accuracy": first_grade}
+
+    def update(self, predictions):
+        """Run the base-class update, then the pass@k computation, on ``predictions``."""
+        super().update(predictions)
+        self._compute_pass_at_k(predictions=predictions)