Skip to content

Commit dd2887b

Browse files
zyzshishuiguapisoloroot
authored
Support OOD Eval Tasks (GPQA, IFBench) (THUDM#597)
Co-authored-by: Jiajun Li <guapisolo@gmail.com> Co-authored-by: root <root@g305.voltagepark.net>
1 parent 649c9c3 commit dd2887b

File tree

2 files changed

+172
-0
lines changed

2 files changed

+172
-0
lines changed

slime/rollout/rm_hub/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .deepscaler import get_deepscaler_rule_based_reward
1010
from .f1 import f1_score
1111
from .gpqa import compute_gpqa_reward
12+
from .ifbench import compute_ifbench_reward
1213
from .math_dapo_utils import compute_score as compute_score_dapo
1314
from .math_utils import extract_answer as extract_boxed_answer
1415
from .math_utils import grade_answer_verl
@@ -54,6 +55,8 @@ async def async_rm(args, sample: Sample, **kwargs):
5455
return f1_score(response, label)[0]
5556
elif rm_type == "gpqa":
5657
return compute_gpqa_reward(response, label, metadata=metadata)
58+
elif rm_type == "ifbench":
59+
return compute_ifbench_reward(response, label, metadata=metadata)
5760
elif rm_type:
5861
raise NotImplementedError(f"Rule-based RM for {rm_type} is not implemented.")
5962
else:

slime/rollout/rm_hub/ifbench.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
from __future__ import annotations
2+
3+
import importlib
4+
import logging
5+
import os
6+
import subprocess
7+
import sys
8+
from pathlib import Path
9+
from typing import Any, Dict, List, Optional, Sequence, Union
10+
11+
logger = logging.getLogger(__name__)

# This file lives at <repo>/slime/rollout/rm_hub/ifbench.py, so parents[3] is the repo root.
_WORKSPACE_ROOT = Path(__file__).resolve().parents[3]
# The IFBench checkout is expected as a sibling directory of the repo root.
_WORKSPACE_PARENT = _WORKSPACE_ROOT.parent
# Pinned requirements for the IFBench evaluator, shipped inside this repo.
_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements_ifbench.txt"
16+
17+
18+
def _ensure_ifbench_repo() -> Path:
    """Clone the IFBench repo if needed and make it importable.

    Returns the path to the checkout. Raises ImportError when the automatic
    ``git clone`` fails, so callers see a clear import-time failure.
    """
    repo_path = _WORKSPACE_PARENT / "IFBench"

    if not repo_path.exists():
        try:
            subprocess.run(
                ["git", "clone", "https://github.com/allenai/IFBench.git", str(repo_path)],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except Exception as exc:
            raise ImportError(
                "Unable to automatically clone IFBench. Please clone "
                "https://github.com/allenai/IFBench.git into the repo root."
            ) from exc

    # Expose the checkout both to this interpreter (sys.path) and to any
    # child processes (PYTHONPATH).
    repo_str = str(repo_path)
    if repo_str not in sys.path:
        sys.path.insert(0, repo_str)

    existing_pythonpath = os.environ.get("PYTHONPATH")
    if existing_pythonpath is None:
        os.environ["PYTHONPATH"] = repo_str
    elif repo_str not in existing_pythonpath.split(os.pathsep):
        os.environ["PYTHONPATH"] = os.pathsep.join([repo_str, existing_pythonpath])

    return repo_path
44+
45+
46+
def _ensure_ifbench_dependencies(repo_path: Path) -> None:
    """Install IFBench requirements the first time the module is imported.

    Best-effort: a failed pip install is logged, not raised, so the caller's
    import retry surfaces the real missing-module error.
    """
    if not _LOCAL_IFBENCH_REQUIREMENTS.exists():
        logger.debug(
            "Local IFBench requirements file not found at %s; skipping install.",
            _LOCAL_IFBENCH_REQUIREMENTS,
        )
        return

    # A sentinel file inside the checkout marks a previously completed install,
    # so pip runs at most once per clone.
    sentinel = repo_path / ".deps_installed"
    if sentinel.exists():
        return

    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-r", str(_LOCAL_IFBENCH_REQUIREMENTS)],
            check=True,
        )
    except Exception as exc:
        logger.warning("Failed to install IFBench dependencies automatically: %s", exc)
    else:
        sentinel.write_text("installed\n")
66+
67+
68+
def _load_evaluation_lib():
    """Import IFBench's ``evaluation_lib``, installing its deps on first failure."""
    repo_path = _ensure_ifbench_repo()
    for attempt in (0, 1):
        try:
            return importlib.import_module("evaluation_lib")
        except ImportError:
            if attempt:
                # Second failure: deps were already installed, re-raise.
                raise
            # First failure is usually a missing third-party dep; install and retry.
            _ensure_ifbench_dependencies(repo_path)
75+
76+
77+
# NOTE: import-time side effects — loading this module may clone the IFBench
# repo and pip-install its requirements via _load_evaluation_lib().
evaluation_lib = _load_evaluation_lib()
InputExample = evaluation_lib.InputExample  # IFBench's per-prompt input record.


# Aliases for the metadata payload shapes handed in by the dataset loader.
JsonDict = Dict[str, Any]
KwargsDict = Dict[str, Optional[Union[str, int, float]]]
83+
84+
85+
def _normalize_instruction_ids(raw_ids: Sequence[Any]) -> List[str]:
86+
"""Ensure instruction identifiers are clean strings."""
87+
88+
normalized: List[str] = []
89+
for entry in raw_ids or []:
90+
if entry is None:
91+
continue
92+
text = str(entry).strip()
93+
if not text:
94+
continue
95+
normalized.append(text)
96+
return normalized
97+
98+
99+
def _coerce_kwargs_list(
100+
raw_kwargs: Any,
101+
num_instructions: int,
102+
) -> List[KwargsDict]:
103+
"""Convert stored kwargs into the list structure expected by IFBench."""
104+
105+
if isinstance(raw_kwargs, list):
106+
processed: List[KwargsDict] = []
107+
for entry in raw_kwargs:
108+
if isinstance(entry, dict):
109+
processed.append(dict(entry))
110+
else:
111+
processed.append({})
112+
elif isinstance(raw_kwargs, dict):
113+
processed = [dict(raw_kwargs) for _ in range(num_instructions)]
114+
else:
115+
processed = [{} for _ in range(num_instructions)]
116+
117+
if len(processed) < num_instructions:
118+
tail = processed[-1] if processed else {}
119+
processed.extend([dict(tail) for _ in range(num_instructions - len(processed))])
120+
elif len(processed) > num_instructions:
121+
processed = processed[:num_instructions]
122+
123+
# Remove explicit None values to match official preprocessing.
124+
sanitized: List[KwargsDict] = []
125+
for entry in processed:
126+
sanitized.append({k: v for k, v in entry.items() if v is not None})
127+
return sanitized
128+
129+
130+
def _build_input_example(metadata: JsonDict) -> Optional[InputExample]:
    """Build an IFBench ``InputExample`` from a sample's metadata.

    Returns ``None`` when the metadata has no usable instruction ids, so the
    caller can score the sample as 0.0 instead of crashing.
    """
    instruction_ids = _normalize_instruction_ids(metadata.get("instruction_id_list") or [])
    if not instruction_ids:
        logger.debug("Missing instruction identifiers in metadata: %s", metadata)
        return None

    prompt_text = metadata.get("prompt_text")
    prompt_text = "" if prompt_text is None else str(prompt_text)

    kwargs_list = _coerce_kwargs_list(metadata.get("kwargs"), len(instruction_ids))

    # record_id may be missing or non-numeric (e.g. a UUID string); fall back
    # to 0 rather than raising ValueError/TypeError from int().
    try:
        key = int(metadata.get("record_id") or 0)
    except (TypeError, ValueError):
        key = 0

    return InputExample(
        key=key,
        instruction_id_list=instruction_ids,
        prompt=prompt_text,
        kwargs=kwargs_list,
    )
151+
152+
153+
def compute_ifbench_reward(response: str, label: Any, metadata: Optional[JsonDict] = None) -> float:
    """Score a model response using the official IFBench rules.

    Returns 1.0 when the response follows every instruction in ``metadata``
    under strict checking, otherwise 0.0. ``label`` is accepted for interface
    parity with the other rule-based RMs but is not used here.
    """
    if metadata is None:
        logger.debug("No metadata provided for IFBench scoring.")
        return 0.0
    if response is None:
        return 0.0

    example = _build_input_example(metadata)
    if example is None:
        return 0.0

    # IFBench's strict checker keys responses by the originating prompt text.
    prompt_to_response = {example.prompt: str(response or "")}
    result = evaluation_lib.test_instruction_following_strict(example, prompt_to_response)
    return 1.0 if result.follow_all_instructions else 0.0

0 commit comments

Comments
 (0)