diff --git a/examples/eval_multi_task/README.md b/examples/eval_multi_task/README.md
new file mode 100644
index 000000000..0bf61ec72
--- /dev/null
+++ b/examples/eval_multi_task/README.md
@@ -0,0 +1,12 @@
+# Multi-Task Evaluation Example
+
+## Configuring `multi_task.yaml`
+- `eval.defaults` defines inference parameters shared by every dataset entry. Override them inside an individual dataset block if needed.
+- `eval.datasets` enumerates the datasets to evaluate. Each entry should specify:
+  - `name`: a short identifier that appears in logs and dashboards.
+  - `path`: the path to the dataset JSONL file.
+  - `rm_type`: which reward function to use for scoring.
+  - `n_samples_per_eval_prompt`: how many candidate completions to generate per prompt.
+
+## IFBench Notes
+- When `ifbench` is used, `slime/rollout/rm_hub/ifbench.py` automatically prepares the scoring environment on first import (cloning the IFBench repository and installing its requirements), so no manual setup is required beyond providing the dataset path.
diff --git a/examples/eval_multi_task/gpqa-dev.yaml b/examples/eval_multi_task/gpqa-dev.yaml
deleted file mode 100644
index 2592b93b6..000000000
--- a/examples/eval_multi_task/gpqa-dev.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-eval:
-  defaults:
-    n_samples_per_eval_prompt: 2
-    max_response_len: 16384
-    top_p: 0.7
-  datasets:
-    - name: aime
-      path: /root/aime-2024/aime-2024.jsonl
-      rm_type: deepscaler
-    - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa
-      path: /root/gpqa/gpqa_eval.jsonl
-      rm_type: gpqa
diff --git a/examples/eval_multi_task/gpqa-dev.sh b/examples/eval_multi_task/multi_task.sh
similarity index 97%
rename from examples/eval_multi_task/gpqa-dev.sh
rename to examples/eval_multi_task/multi_task.sh
index 8591fe394..9a69dcbc1 100644
--- a/examples/eval_multi_task/gpqa-dev.sh
+++ b/examples/eval_multi_task/multi_task.sh
@@ -26,7 +26,7 @@ echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." &>/dev/null && pwd)"
 source "${REPO_ROOT}/scripts/models/qwen3-4B.sh"
-EVAL_CONFIG_PATH="${REPO_ROOT}/examples/eval_multi_task/gpqa-dev.yaml"
+EVAL_CONFIG_PATH="${REPO_ROOT}/examples/eval_multi_task/multi_task.yaml"
 
 CKPT_ARGS=(
     --hf-checkpoint /root/Qwen3-4B
@@ -98,7 +98,7 @@ OPTIMIZER_ARGS=(
 WANDB_ARGS=(
     --use-wandb
     --wandb-project eval
-    --wandb-group gpqa
+    --wandb-group multi_task
     --wandb-key ${WANDB_KEY}
 )
diff --git a/examples/eval_multi_task/multi_task.yaml b/examples/eval_multi_task/multi_task.yaml
new file mode 100644
index 000000000..83ae67f8a
--- /dev/null
+++ b/examples/eval_multi_task/multi_task.yaml
@@ -0,0 +1,17 @@
+eval:
+  defaults:
+    max_response_len: 16384
+    top_p: 0.7
+  datasets:
+    - name: aime
+      path: /root/aime-2024/aime-2024.jsonl
+      rm_type: deepscaler
+      n_samples_per_eval_prompt: 16
+    - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa
+      path: /root/gpqa/gpqa_eval.jsonl
+      rm_type: gpqa
+      n_samples_per_eval_prompt: 2
+    - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench
+      path: /root/ifbench/IFBench_eval.jsonl
+      rm_type: ifbench
+      n_samples_per_eval_prompt: 1
diff --git a/examples/eval_multi_task/requirements_ifbench.txt b/examples/eval_multi_task/requirements_ifbench.txt
new file mode 100644
index 000000000..4e9b607fb
--- /dev/null
+++ b/examples/eval_multi_task/requirements_ifbench.txt
@@ -0,0 +1,6 @@
+emoji
+nltk
+spacy==3.7.4
+syllapy
+numpy==1.26.4
+immutabledict
\ No newline at end of file
diff --git a/slime/rollout/rm_hub/__init__.py b/slime/rollout/rm_hub/__init__.py
index 5395da095..cb5245b0a 100644
--- a/slime/rollout/rm_hub/__init__.py
+++ b/slime/rollout/rm_hub/__init__.py
@@ -9,6 +9,7 @@ from .deepscaler import get_deepscaler_rule_based_reward
 from .f1 import f1_score
 from .gpqa import compute_gpqa_reward
+from .ifbench import compute_ifbench_reward
 from .math_dapo_utils import compute_score as compute_score_dapo
 from .math_utils import extract_answer as extract_boxed_answer
 from .math_utils import grade_answer_verl
@@ -54,6 +55,8 @@ async def async_rm(args, sample: Sample, **kwargs):
         return f1_score(response, label)[0]
     elif rm_type == "gpqa":
         return compute_gpqa_reward(response, label, metadata=metadata)
+    elif rm_type == "ifbench":
+        return compute_ifbench_reward(response, label, metadata=metadata)
     elif rm_type:
         raise NotImplementedError(f"Rule-based RM for {rm_type} is not implemented.")
     else:
diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py
new file mode 100644
index 000000000..39d083e70
--- /dev/null
+++ b/slime/rollout/rm_hub/ifbench.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import importlib
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+logger = logging.getLogger(__name__)
+
+_WORKSPACE_ROOT = Path(__file__).resolve().parents[3]
+_WORKSPACE_PARENT = _WORKSPACE_ROOT.parent
+_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements_ifbench.txt"
+
+
+def _ensure_ifbench_repo() -> Path:
+    """Clone the IFBench repo if needed and ensure it is available on sys.path."""
+
+    repo_path = _WORKSPACE_PARENT / "IFBench"
+
+    if not repo_path.exists():
+        clone_cmd = ["git", "clone", "https://github.com/allenai/IFBench.git", str(repo_path)]
+        try:
+            subprocess.run(clone_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        except Exception as exc:
+            raise ImportError(
+                "Unable to automatically clone IFBench. Please clone "
+                "https://github.com/allenai/IFBench.git next to this repository."
+            ) from exc
+
+    repo_str = str(repo_path)
+    if repo_str not in sys.path:
+        sys.path.insert(0, repo_str)
+
+    current_pythonpath = os.environ.get("PYTHONPATH")
+    if current_pythonpath is None:
+        os.environ["PYTHONPATH"] = repo_str
+    elif repo_str not in current_pythonpath.split(os.pathsep):
+        os.environ["PYTHONPATH"] = os.pathsep.join([repo_str, current_pythonpath])
+
+    return repo_path
+
+
+def _ensure_ifbench_dependencies(repo_path: Path) -> None:
+    """Install IFBench requirements once, guarded by a sentinel file in the repo."""
+
+    requirements_file = _LOCAL_IFBENCH_REQUIREMENTS
+
+    if not requirements_file.exists():
+        logger.debug("Local IFBench requirements file not found at %s; skipping install.", requirements_file)
+        return
+
+    sentinel = repo_path / ".deps_installed"
+    if sentinel.exists():
+        return
+
+    install_cmd = [sys.executable, "-m", "pip", "install", "-r", str(requirements_file)]
+    try:
+        subprocess.run(install_cmd, check=True)
+    except Exception as exc:
+        logger.warning("Failed to install IFBench dependencies automatically: %s", exc)
+    else:
+        sentinel.write_text("installed\n")
+
+
+def _load_evaluation_lib():
+    repo_path = _ensure_ifbench_repo()
+    try:
+        return importlib.import_module("evaluation_lib")
+    except ImportError:
+        _ensure_ifbench_dependencies(repo_path)
+        return importlib.import_module("evaluation_lib")
+
+
+evaluation_lib = _load_evaluation_lib()
+InputExample = evaluation_lib.InputExample
+
+
+JsonDict = Dict[str, Any]
+KwargsDict = Dict[str, Optional[Union[str, int, float]]]
+
+
+def _normalize_instruction_ids(raw_ids: Sequence[Any]) -> List[str]:
+    """Ensure instruction identifiers are clean strings."""
+
+    normalized: List[str] = []
+    for entry in raw_ids or []:
+        if entry is None:
+            continue
+        text = str(entry).strip()
+        if not text:
+            continue
+        normalized.append(text)
+    return normalized
+
+
+def _coerce_kwargs_list(
+    raw_kwargs: Any,
+    num_instructions: int,
+) -> List[KwargsDict]:
+    """Convert stored kwargs into the list structure expected by IFBench."""
+
+    if isinstance(raw_kwargs, list):
+        processed: List[KwargsDict] = []
+        for entry in raw_kwargs:
+            if isinstance(entry, dict):
+                processed.append(dict(entry))
+            else:
+                processed.append({})
+    elif isinstance(raw_kwargs, dict):
+        processed = [dict(raw_kwargs) for _ in range(num_instructions)]
+    else:
+        processed = [{} for _ in range(num_instructions)]
+
+    if len(processed) < num_instructions:
+        tail = processed[-1] if processed else {}
+        processed.extend([dict(tail) for _ in range(num_instructions - len(processed))])
+    elif len(processed) > num_instructions:
+        processed = processed[:num_instructions]
+
+    # Remove explicit None values to match official preprocessing.
+    sanitized: List[KwargsDict] = []
+    for entry in processed:
+        sanitized.append({k: v for k, v in entry.items() if v is not None})
+    return sanitized
+
+
+def _build_input_example(metadata: JsonDict) -> Optional[InputExample]:
+    instruction_ids = _normalize_instruction_ids(metadata.get("instruction_id_list") or [])
+    if not instruction_ids:
+        logger.debug("Missing instruction identifiers in metadata: %s", metadata)
+        return None
+
+    prompt_text = metadata.get("prompt_text")
+    if prompt_text is None:
+        prompt_text = ""
+    else:
+        prompt_text = str(prompt_text)
+
+    raw_kwargs = metadata.get("kwargs")
+    kwargs_list = _coerce_kwargs_list(raw_kwargs, len(instruction_ids))
+
+    return InputExample(
+        key=int(metadata.get("record_id") or 0),
+        instruction_id_list=instruction_ids,
+        prompt=prompt_text,
+        kwargs=kwargs_list,
+    )
+
+
+def compute_ifbench_reward(response: Optional[str], label: Any, metadata: Optional[JsonDict] = None) -> float:
+    """Score a model response using the official IFBench strict checks; ``label`` is unused."""
+
+    if metadata is None:
+        logger.debug("No metadata provided for IFBench scoring.")
+        return 0.0
+
+    if response is None:
+        return 0.0
+
+    inp = _build_input_example(metadata)
+    if inp is None:
+        return 0.0
+
+    prompt_to_response = {inp.prompt: str(response)}
+    output = evaluation_lib.test_instruction_following_strict(inp, prompt_to_response)
+    return 1.0 if output.follow_all_instructions else 0.0
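
For context, a minimal sketch of how the new reward hooks together end to end. Every value in the metadata dict below is an illustrative placeholder (the prompt, record id, and instruction id are made up, not taken from a real IFBench record); in practice the scorer reads the `instruction_id_list`, `kwargs`, `prompt_text`, and `record_id` fields of each row in `IFBench_eval.jsonl`:

```python
# Hypothetical usage sketch; importing the module triggers the automatic
# IFBench clone and dependency setup described in the README.
from slime.rollout.rm_hub.ifbench import compute_ifbench_reward

metadata = {
    "record_id": 1,
    "prompt_text": "Describe your day without using any commas.",
    # Illustrative instruction id; real ids come from the IFBench dataset.
    "instruction_id_list": ["punctuation:no_comma"],
    "kwargs": [{}],  # one kwargs dict per instruction id
}

# Returns 1.0 only if the response satisfies every listed instruction
# under IFBench's strict check, otherwise 0.0.
reward = compute_ifbench_reward(
    response="I woke up early and went for a long walk before breakfast.",
    label=None,  # unused for ifbench; constraints live in metadata
    metadata=metadata,
)
```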