Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions examples/eval_multi_task/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Multi-Task Evaluation Example

## Configuring `multi_task.yaml`
- `eval.defaults` defines inference parameters shared by every dataset entry. Override them inside an individual dataset block if needed.
- `eval.datasets` enumerates the datasets to evaluate. Each entry should specify:
- `name`: a short identifier that appears in logs and dashboards.
- `path`: the path to the dataset JSONL file.
- `rm_type`: which reward function to use for scoring.
- `n_samples_per_eval_prompt`: how many candidate completions to generate per prompt.

## IFBench Notes
- When `ifbench` is used, `slime/rollout/rm_hub/ifbench.py` automatically prepares the scoring environment, so no additional manual setup is required beyond providing the dataset path.
12 changes: 0 additions & 12 deletions examples/eval_multi_task/gpqa-dev.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." &>/dev/null && pwd)"
source "${REPO_ROOT}/scripts/models/qwen3-4B.sh"
EVAL_CONFIG_PATH="${REPO_ROOT}/examples/eval_multi_task/gpqa-dev.yaml"
EVAL_CONFIG_PATH="${REPO_ROOT}/examples/eval_multi_task/multi_task.yaml"

CKPT_ARGS=(
--hf-checkpoint /root/Qwen3-4B
Expand Down Expand Up @@ -98,7 +98,7 @@ OPTIMIZER_ARGS=(
WANDB_ARGS=(
--use-wandb
--wandb-project eval
--wandb-group gpqa
--wandb-group multi_task
--wandb-key ${WANDB_KEY}
)

Expand Down
17 changes: 17 additions & 0 deletions examples/eval_multi_task/multi_task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
eval:
defaults:
max_response_len: 16384
top_p: 0.7
datasets:
- name: aime
path: /root/aime-2024/aime-2024.jsonl
rm_type: deepscaler
n_samples_per_eval_prompt: 16
- name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa
path: /root/gpqa/gpqa_eval.jsonl
rm_type: gpqa
n_samples_per_eval_prompt: 2
- name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench
path: /root/ifbench/IFBench_eval.jsonl
rm_type: ifbench
n_samples_per_eval_prompt: 1
6 changes: 6 additions & 0 deletions examples/eval_multi_task/requirements_ifbench.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
emoji
nltk
spacy==3.7.4
syllapy
numpy==1.26.4
immutabledict
3 changes: 3 additions & 0 deletions slime/rollout/rm_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .deepscaler import get_deepscaler_rule_based_reward
from .f1 import f1_score
from .gpqa import compute_gpqa_reward
from .ifbench import compute_ifbench_reward
from .math_dapo_utils import compute_score as compute_score_dapo
from .math_utils import extract_answer as extract_boxed_answer
from .math_utils import grade_answer_verl
Expand Down Expand Up @@ -54,6 +55,8 @@ async def async_rm(args, sample: Sample, **kwargs):
return f1_score(response, label)[0]
elif rm_type == "gpqa":
return compute_gpqa_reward(response, label, metadata=metadata)
elif rm_type == "ifbench":
return compute_ifbench_reward(response, label, metadata=metadata)
elif rm_type:
raise NotImplementedError(f"Rule-based RM for {rm_type} is not implemented.")
else:
Expand Down
169 changes: 169 additions & 0 deletions slime/rollout/rm_hub/ifbench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
from __future__ import annotations

import importlib
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union

logger = logging.getLogger(__name__)

_WORKSPACE_ROOT = Path(__file__).resolve().parents[3]
_WORKSPACE_PARENT = _WORKSPACE_ROOT.parent
_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements_ifbench.txt"


def _ensure_ifbench_repo() -> Path:
    """Clone the IFBench repo if needed and ensure it is available on sys.path.

    Returns:
        Path to the local IFBench checkout (sibling of the workspace root).

    Raises:
        ImportError: if the repository is missing and cloning it fails.
    """

    repo_path = _WORKSPACE_PARENT / "IFBench"

    if not repo_path.exists():
        clone_cmd = ["git", "clone", "https://github.com/allenai/IFBench.git", str(repo_path)]
        try:
            subprocess.run(clone_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as exc:
            # A failed/interrupted clone can leave a partial directory behind;
            # if we kept it, `repo_path.exists()` would be True on the next call
            # and the broken checkout would never be re-cloned. Remove it so a
            # later attempt can retry from scratch.
            shutil.rmtree(repo_path, ignore_errors=True)
            raise ImportError(
                "Unable to automatically clone IFBench. Please clone "
                "https://github.com/allenai/IFBench.git into the repo root."
            ) from exc

    # Make the checkout importable in this process...
    repo_str = str(repo_path)
    if repo_str not in sys.path:
        sys.path.insert(0, repo_str)

    # ...and in any subprocess that inherits our environment.
    current_pythonpath = os.environ.get("PYTHONPATH")
    if current_pythonpath is None:
        os.environ["PYTHONPATH"] = repo_str
    elif repo_str not in current_pythonpath.split(os.pathsep):
        os.environ["PYTHONPATH"] = os.pathsep.join([repo_str, current_pythonpath])

    return repo_path


def _ensure_ifbench_dependencies(repo_path: Path) -> None:
    """Best-effort, one-time install of IFBench's Python requirements.

    A sentinel file inside the checkout records a successful install so that
    subsequent imports skip the (slow) pip invocation entirely.
    """

    if not _LOCAL_IFBENCH_REQUIREMENTS.exists():
        logger.debug(
            "Local IFBench requirements file not found at %s; skipping install.",
            _LOCAL_IFBENCH_REQUIREMENTS,
        )
        return

    sentinel = repo_path / ".deps_installed"
    if sentinel.exists():
        return

    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-r", str(_LOCAL_IFBENCH_REQUIREMENTS)],
            check=True,
        )
    except Exception as exc:
        # Best effort only: a failed install is logged, not fatal — the
        # subsequent import attempt will surface any truly missing package.
        logger.warning("Failed to install IFBench dependencies automatically: %s", exc)
    else:
        sentinel.write_text("installed\n")


def _load_evaluation_lib():
    """Import IFBench's ``evaluation_lib``, installing its deps on first failure."""
    repo_path = _ensure_ifbench_repo()
    for attempt in (0, 1):
        try:
            return importlib.import_module("evaluation_lib")
        except ImportError:
            if attempt:
                # Second failure: dependencies were installed and the import
                # still failed — propagate the real error.
                raise
            _ensure_ifbench_dependencies(repo_path)


# NOTE: importing this module has heavy side effects on first use — it may
# git-clone the IFBench repo and pip-install its requirements, so the first
# import can require network access.
evaluation_lib = _load_evaluation_lib()
InputExample = evaluation_lib.InputExample  # per-example container type from the official library


# Type aliases for readability.
JsonDict = Dict[str, Any]
KwargsDict = Dict[str, Optional[Union[str, int, float]]]


def _normalize_instruction_ids(raw_ids: Sequence[Any]) -> List[str]:
"""Ensure instruction identifiers are clean strings."""

normalized: List[str] = []
for entry in raw_ids or []:
if entry is None:
continue
text = str(entry).strip()
if not text:
continue
normalized.append(text)
return normalized


def _coerce_kwargs_list(
raw_kwargs: Any,
num_instructions: int,
) -> List[KwargsDict]:
"""Convert stored kwargs into the list structure expected by IFBench."""

if isinstance(raw_kwargs, list):
processed: List[KwargsDict] = []
for entry in raw_kwargs:
if isinstance(entry, dict):
processed.append(dict(entry))
else:
processed.append({})
elif isinstance(raw_kwargs, dict):
processed = [dict(raw_kwargs) for _ in range(num_instructions)]
else:
processed = [{} for _ in range(num_instructions)]

if len(processed) < num_instructions:
tail = processed[-1] if processed else {}
processed.extend([dict(tail) for _ in range(num_instructions - len(processed))])
elif len(processed) > num_instructions:
processed = processed[:num_instructions]

# Remove explicit None values to match official preprocessing.
sanitized: List[KwargsDict] = []
for entry in processed:
sanitized.append({k: v for k, v in entry.items() if v is not None})
return sanitized


def _build_input_example(metadata: JsonDict) -> Optional[InputExample]:
    """Build an IFBench ``InputExample`` from sample metadata.

    Returns ``None`` when the metadata carries no usable instruction
    identifiers (scoring is impossible without them).
    """

    instruction_ids = _normalize_instruction_ids(metadata.get("instruction_id_list") or [])
    if not instruction_ids:
        logger.debug("Missing instruction identifiers in metadata: %s", metadata)
        return None

    raw_prompt = metadata.get("prompt_text")
    prompt_text = "" if raw_prompt is None else str(raw_prompt)

    kwargs_list = _coerce_kwargs_list(metadata.get("kwargs"), len(instruction_ids))

    return InputExample(
        key=int(metadata.get("record_id") or 0),
        instruction_id_list=instruction_ids,
        prompt=prompt_text,
        kwargs=kwargs_list,
    )


def compute_ifbench_reward(response: str, label: Any, metadata: Optional[JsonDict] = None) -> float:
    """Score a model response using the official IFBench rules.

    Returns 1.0 when the response strictly follows every instruction listed in
    ``metadata``, 0.0 otherwise (including when metadata/response is missing).
    ``label`` is accepted for interface compatibility but unused — IFBench
    scoring is rule-based.
    """

    if metadata is None:
        logger.debug("No metadata provided for IFBench scoring.")
        return 0.0
    if response is None:
        return 0.0

    example = _build_input_example(metadata)
    if example is None:
        return 0.0

    # The strict checker looks up the response by the example's prompt text.
    responses_by_prompt = {example.prompt: str(response or "")}
    result = evaluation_lib.test_instruction_following_strict(example, responses_by_prompt)
    return 1.0 if result.follow_all_instructions else 0.0