Skip to content

Commit ab260a1

Browse files
committed
feat: add KoIFEval Korean instruction-following benchmark
Add allganize/IFEval-Ko (342 samples) as a Korean instruction-following benchmark. Mirrors the structure of the existing English ifeval setup.

The actual evaluator (Korean instruction checkers, sentence splitting, language detection, etc.) lives in a separate pip-installable package: https://github.com/bzantium/koifeval (Apache-2.0, derived from google-research/instruction_following_eval and allganize/IFEval-Ko, with attribution in LICENSE and NOTICE).

Changes here:
- nemo_skills/dataset/koifeval/: dataset prepare and __init__ (METRICS_TYPE: if).
- nemo_skills/evaluation/evaluator/koifeval.py: thin subprocess wrapper that invokes 'python -m koifeval.evaluation_main' and merges strict / loose results back into the generation jsonl. Mirrors ifeval.py byte-for-byte except for the cmd line.
- nemo_skills/evaluation/evaluator/__init__.py: register the koifeval eval_type.
- dockerfiles/Dockerfile.nemo-skills: pip install koifeval pinned to commit 7a27889c0cf285e60b1a7b4795211846dc328aaf.

Validated parity with the previous in-repo implementation on a Qwen3-4B IFEval-Ko output (342 samples): all strict_eval and loose_eval fields match byte-for-byte; prompt_strict=70.18, prompt_loose=73.39 unchanged.

Signed-off-by: bzantium <ryumin93@gmail.com>
1 parent 982cf70 commit ab260a1

5 files changed

Lines changed: 116 additions & 0 deletions

File tree

dockerfiles/Dockerfile.nemo-skills

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ RUN pip install langdetect absl-py immutabledict nltk ipython && \
5353
python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); \
5454
nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger_eng'); download('en_core_web_sm')"
5555

56+
# koifeval (Korean adaptation of google-research/instruction_following_eval)
57+
RUN pip install --no-cache-dir "koifeval @ git+https://github.com/bzantium/koifeval.git@7a27889c0cf285e60b1a7b4795211846dc328aaf"
58+
5659
# we aren't copying main nemo_skills folder as it will always be mounted from host
5760
# but we do want to install all requirements in the container directly
5861
RUN mkdir -p /opt/NeMo-Skills/requirements /opt/NeMo-Skills/core
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# settings that define how evaluation should be done by default (all can be changed from cmdline)
16+
17+
# Metrics type used to score this benchmark. NOTE(review): "if" is presumably
# the same instruction-following metrics used by the English ifeval dataset —
# confirm against the metrics registry.
METRICS_TYPE = "if"
18+
# Default generation overrides: use the generic/default prompt config, set
# generation_key to "response" (presumably the field holding the model output —
# confirm), and select the "koifeval" eval_type for scoring.
GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response ++eval_type=koifeval"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import json
16+
from pathlib import Path
17+
18+
from datasets import load_dataset
19+
20+
if __name__ == "__main__":
    # The prepared split is written next to this script as test.jsonl.
    target_dir = Path(__file__).absolute().parent
    target_dir.mkdir(exist_ok=True)
    output_file = str(target_dir / "test.jsonl")

    ds = load_dataset("allganize/IFEval-Ko", split="train")

    # Keep every original field and expose the prompt under "question" as well.
    records = [{**row, "question": row["prompt"]} for row in ds]

    # ensure_ascii=False keeps the text readable in the jsonl instead of
    # \u-escaping every non-ASCII character.
    with open(output_file, "wt", encoding="utf-8") as fout:
        fout.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in records)

nemo_skills/evaluation/evaluator/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"evalplus": "nemo_skills.evaluation.evaluator.code:eval_evalplus",
3030
"if": "nemo_skills.evaluation.evaluator.ifeval:eval_if",
3131
"ifbench": "nemo_skills.evaluation.evaluator.ifbench:eval_ifbench",
32+
"koifeval": "nemo_skills.evaluation.evaluator.koifeval:eval_koif",
3233
"bfcl": "nemo_skills.evaluation.evaluator.bfcl:eval_bfcl",
3334
"multichoice": "nemo_skills.evaluation.evaluator.mcq:eval_mcq",
3435
"ruler": "nemo_skills.evaluation.evaluator.ruler:eval_ruler",
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import json
16+
import logging
17+
import shutil
18+
import subprocess
19+
from pathlib import Path
20+
21+
from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig
22+
from nemo_skills.utils import get_logger_name
23+
24+
LOG = logging.getLogger(get_logger_name(__file__))
25+
26+
27+
def eval_koif(cfg):
    """Score a KoIFEval generation file and merge the results back into it.

    Runs ``python -m koifeval.evaluation_main`` as a subprocess on
    ``cfg.input_file`` (passed both as the input data and as the response
    data), then reads the ``eval_results_loose.jsonl`` and
    ``eval_results_strict.jsonl`` files written to a temporary directory next
    to the input file. Each result row is stored on the sample at the same
    position under the ``loose_eval`` / ``strict_eval`` key, and the input
    file is rewritten in place.

    Args:
        cfg: Mapping of evaluator options, expanded into
            ``BaseEvaluatorConfig``. Only ``input_file`` is read here.

    Raises:
        subprocess.CalledProcessError: If the evaluator exits with a non-zero
            status.
        ValueError: If a results file does not contain exactly one row per
            sample; the input file is left untouched in that case.
    """
    cfg = BaseEvaluatorConfig(**cfg)
    jsonl_file = cfg.input_file
    jsonl_path = Path(jsonl_file).resolve()
    output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp"
    output_dir.mkdir(parents=True, exist_ok=True)
    try:
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters are passed through verbatim instead of being split
        # or interpreted.
        cmd = [
            "python",
            "-m",
            "koifeval.evaluation_main",
            f"--input_data={jsonl_file}",
            f"--input_response_data={jsonl_file}",
            f"--output_dir={output_dir}",
        ]
        subprocess.run(cmd, check=True)

        # fusing eval metrics back into the generation file
        with open(jsonl_file, "rt", encoding="utf-8") as f:
            samples = [json.loads(line) for line in f]

        # loose first, then strict, so the key order in the output is stable
        result_files = (
            ("eval_results_loose.jsonl", "loose_eval"),
            ("eval_results_strict.jsonl", "strict_eval"),
        )
        for results_name, key in result_files:
            with open(output_dir / results_name, "rt", encoding="utf-8") as f:
                eval_results = [json.loads(line) for line in f]
            # Results are matched to samples by position; a length mismatch
            # would otherwise leave trailing samples silently unscored.
            if len(eval_results) != len(samples):
                raise ValueError(
                    f"{results_name} has {len(eval_results)} rows but {jsonl_file} has {len(samples)} samples"
                )
            for sample, eval_result in zip(samples, eval_results):
                sample[key] = eval_result

        # NOTE(review): json.dumps defaults to ensure_ascii=True, so non-ASCII
        # text is written \u-escaped; kept as-is to leave the output unchanged.
        with open(jsonl_file, "wt", encoding="utf-8") as f:
            for sample in samples:
                f.write(json.dumps(sample) + "\n")
    finally:
        # removing temporary metric directory to avoid reusing it, including
        # when the evaluator or the merge step fails
        shutil.rmtree(output_dir)

0 commit comments

Comments
 (0)