feat: add KoIFEval Korean instruction-following benchmark

bzantium · bzantium · commit ab260a159d06 · 2026-05-07T11:09:55.000+09:00
Add allganize/IFEval-Ko (342 samples) as a Korean instruction-following benchmark. Mirrors the structure of the existing English ifeval setup. The actual evaluator (Korean instruction checkers, sentence splitting, language detection, etc.) lives in a separate pip-installable package: https://github.com/bzantium/koifeval (Apache-2.0, derived from google-research/instruction_following_eval and allganize/IFEval-Ko, with attribution in LICENSE and NOTICE). Changes here: - nemo_skills/dataset/koifeval/: dataset prepare and __init__ (METRICS_TYPE: if). - nemo_skills/evaluation/evaluator/koifeval.py: thin subprocess wrapper that invokes 'python -m koifeval.evaluation_main' and merges strict / loose results back into the generation jsonl. Mirrors ifeval.py byte-for-byte except for the cmd line. - nemo_skills/evaluation/evaluator/__init__.py: register the koifeval eval_type. - dockerfiles/Dockerfile.nemo-skills: pip install koifeval pinned to commit 7a27889c0cf285e60b1a7b4795211846dc328aaf. Validated parity with the previous in-repo implementation on a Qwen3-4B IFEval-Ko output (342 samples): all strict_eval and loose_eval fields match byte-for-byte; prompt_strict=70.18, prompt_loose=73.39 unchanged. Signed-off-by: bzantium <ryumin93@gmail.com>
diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills
@@ -53,6 +53,9 @@ RUN pip install langdetect absl-py immutabledict nltk ipython && \
     python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); \
     nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger_eng'); download('en_core_web_sm')"
 
+# koifeval (Korean adaptation of google-research/instruction_following_eval)
+# Installed from git at a full commit SHA, so the koifeval source does not change between image builds.
+RUN pip install --no-cache-dir "koifeval @ git+https://github.com/bzantium/koifeval.git@7a27889c0cf285e60b1a7b4795211846dc328aaf"
+
 # we aren't copying main nemo_skills folder as it will always be mounted from host
 # but we do want to install all requirements in the container directly
 RUN mkdir -p /opt/NeMo-Skills/requirements /opt/NeMo-Skills/core
diff --git a/nemo_skills/dataset/koifeval/__init__.py b/nemo_skills/dataset/koifeval/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# settings that define how evaluation should be done by default (all can be changed from cmdline)
+
+# NOTE(review): "if" presumably selects the same metrics aggregation the English ifeval benchmark uses -- confirm
+METRICS_TYPE = "if"
+# eval_type=koifeval is the key under which eval_koif is registered in nemo_skills/evaluation/evaluator/__init__.py
+GENERATION_ARGS = "++prompt_config=generic/default ++generation_key=response ++eval_type=koifeval"
diff --git a/nemo_skills/dataset/koifeval/prepare.py b/nemo_skills/dataset/koifeval/prepare.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from pathlib import Path
+
+from datasets import load_dataset
+
+if __name__ == "__main__":
+    # test.jsonl is written into the directory that contains this script
+    data_dir = Path(__file__).absolute().parent
+    data_dir.mkdir(exist_ok=True)
+    output_file = str(data_dir / "test.jsonl")
+
+    ds = load_dataset("allganize/IFEval-Ko", split="train")
+
+    # every record keeps all of its original fields and additionally exposes the prompt under "question"
+    data = [{**entry, "question": entry["prompt"]} for entry in ds]
+
+    # ensure_ascii=False keeps the Korean text as-is instead of \uXXXX escapes
+    with open(output_file, "wt", encoding="utf-8") as fout:
+        fout.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in data)
diff --git a/nemo_skills/evaluation/evaluator/__init__.py b/nemo_skills/evaluation/evaluator/__init__.py
@@ -29,6 +29,7 @@
     "evalplus": "nemo_skills.evaluation.evaluator.code:eval_evalplus",
     "if": "nemo_skills.evaluation.evaluator.ifeval:eval_if",
     "ifbench": "nemo_skills.evaluation.evaluator.ifbench:eval_ifbench",
+    "koifeval": "nemo_skills.evaluation.evaluator.koifeval:eval_koif",
     "bfcl": "nemo_skills.evaluation.evaluator.bfcl:eval_bfcl",
     "multichoice": "nemo_skills.evaluation.evaluator.mcq:eval_mcq",
     "ruler": "nemo_skills.evaluation.evaluator.ruler:eval_ruler",
diff --git a/nemo_skills/evaluation/evaluator/koifeval.py b/nemo_skills/evaluation/evaluator/koifeval.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import shutil
+import subprocess
+from pathlib import Path
+
+from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig
+from nemo_skills.utils import get_logger_name
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+def eval_koif(cfg):
+    """Score a generation file with koifeval and merge the results back into it.
+
+    Runs ``python -m koifeval.evaluation_main`` on ``cfg.input_file`` (the same
+    file is passed as both the prompt data and the response data), then stores
+    the per-sample loose and strict results on every record under the
+    ``loose_eval`` / ``strict_eval`` keys and rewrites the file in place.
+
+    Args:
+        cfg: Keyword arguments for ``BaseEvaluatorConfig``; only ``input_file``
+            (path to the generation jsonl) is read here.
+
+    Raises:
+        subprocess.CalledProcessError: If the koifeval process exits non-zero.
+        ValueError: If a results file does not hold exactly one line per sample.
+    """
+    cfg = BaseEvaluatorConfig(**cfg)
+    jsonl_file = cfg.input_file
+    jsonl_path = Path(jsonl_file).resolve()
+    output_dir = jsonl_path.parent / f"{jsonl_path.stem}_metrics_tmp"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        # Argument list instead of a shell string: paths containing spaces or
+        # shell metacharacters reach koifeval unchanged and are never interpreted.
+        cmd = [
+            "python",
+            "-m",
+            "koifeval.evaluation_main",
+            f"--input_data={jsonl_file}",
+            f"--input_response_data={jsonl_file}",
+            f"--output_dir={output_dir}",
+        ]
+        subprocess.run(cmd, check=True)
+
+        # fusing eval metrics back into the generation file
+        with open(jsonl_file, "rt", encoding="utf-8") as f:
+            samples = [json.loads(line) for line in f]
+
+        for key, results_name in (
+            ("loose_eval", "eval_results_loose.jsonl"),
+            ("strict_eval", "eval_results_strict.jsonl"),
+        ):
+            with open(output_dir / results_name, "rt", encoding="utf-8") as f:
+                eval_results = [json.loads(line) for line in f]
+            # zip() stops at the shorter input, so a mismatch would otherwise
+            # silently leave some samples without results
+            if len(eval_results) != len(samples):
+                raise ValueError(
+                    f"{results_name} has {len(eval_results)} results but {jsonl_file} has {len(samples)} samples"
+                )
+            for sample, eval_result in zip(samples, eval_results):
+                sample[key] = eval_result
+
+        with open(jsonl_file, "wt", encoding="utf-8") as f:
+            for sample in samples:
+                # ensure_ascii=False keeps Korean text readable instead of \uXXXX escapes
+                f.write(json.dumps(sample, ensure_ascii=False) + "\n")
+    finally:
+        # removing temporary metric directory to avoid reusing it, also when evaluation fails
+        shutil.rmtree(output_dir)