|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import importlib |
| 4 | +import logging |
| 5 | +import os |
| 6 | +import subprocess |
| 7 | +import sys |
| 8 | +from pathlib import Path |
| 9 | +from typing import Any, Dict, List, Optional, Sequence, Union |
| 10 | + |
logger = logging.getLogger(__name__)

# Repository root, assuming this file sits three directory levels below it
# (parents[3]) — TODO confirm if this module is ever moved.
_WORKSPACE_ROOT = Path(__file__).resolve().parents[3]
# Directory that contains the workspace checkout; IFBench is cloned as a
# sibling of the workspace root.
_WORKSPACE_PARENT = _WORKSPACE_ROOT.parent
# Pinned requirements file used to install IFBench's Python dependencies.
_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements_ifbench.txt"
| 16 | + |
| 17 | + |
def _ensure_ifbench_repo() -> Path:
    """Clone the IFBench repo if needed and ensure it is available on sys.path.

    Returns:
        Path to the local IFBench checkout (``<workspace parent>/IFBench``).

    Raises:
        ImportError: If the repository is missing and cloning it fails.
    """

    repo_path = _WORKSPACE_PARENT / "IFBench"

    if not repo_path.exists():
        clone_cmd = ["git", "clone", "https://github.com/allenai/IFBench.git", str(repo_path)]
        try:
            subprocess.run(clone_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except (subprocess.CalledProcessError, OSError) as exc:
            # CalledProcessError: git exited non-zero; OSError: git binary
            # not found / not executable. Anything else should propagate.
            raise ImportError(
                "Unable to automatically clone IFBench. Please clone "
                "https://github.com/allenai/IFBench.git into the repo root."
            ) from exc

    repo_str = str(repo_path)
    # Make the checkout importable in this process ...
    if repo_str not in sys.path:
        sys.path.insert(0, repo_str)

    # ... and in any child processes that inherit our environment.
    current_pythonpath = os.environ.get("PYTHONPATH")
    if current_pythonpath is None:
        os.environ["PYTHONPATH"] = repo_str
    elif repo_str not in current_pythonpath.split(os.pathsep):
        os.environ["PYTHONPATH"] = os.pathsep.join([repo_str, current_pythonpath])

    return repo_path
| 44 | + |
| 45 | + |
def _ensure_ifbench_dependencies(repo_path: Path) -> None:
    """Install IFBench requirements the first time the module is imported.

    Best-effort: a failed install is logged as a warning, not raised, so a
    pre-provisioned environment still works. A sentinel file inside the
    checkout prevents re-running pip on every import.

    Args:
        repo_path: Local IFBench checkout; the sentinel file is written here.
    """

    requirements_file = _LOCAL_IFBENCH_REQUIREMENTS

    if not requirements_file.exists():
        logger.debug("Local IFBench requirements file not found at %s; skipping install.", requirements_file)
        return

    sentinel = repo_path / ".deps_installed"
    if sentinel.exists():
        return

    install_cmd = [sys.executable, "-m", "pip", "install", "-r", str(requirements_file)]
    try:
        subprocess.run(install_cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as exc:
        # Deliberately non-fatal: the import retry in _load_evaluation_lib
        # will surface a hard error if the dependencies truly are missing.
        logger.warning("Failed to install IFBench dependencies automatically: %s", exc)
    else:
        # Explicit encoding: Path.write_text defaults to the locale encoding.
        sentinel.write_text("installed\n", encoding="utf-8")
| 66 | + |
| 67 | + |
def _load_evaluation_lib():
    """Import IFBench's ``evaluation_lib``, bootstrapping the repo if needed.

    Returns:
        The imported ``evaluation_lib`` module object.

    Raises:
        ImportError: If the module still cannot be imported after cloning the
            repository and installing its dependencies.
    """
    repo_path = _ensure_ifbench_repo()
    try:
        return importlib.import_module("evaluation_lib")
    except ImportError:
        _ensure_ifbench_dependencies(repo_path)
        # Packages installed mid-process may not be visible to stale finder
        # caches; refresh them before retrying the import.
        importlib.invalidate_caches()
        return importlib.import_module("evaluation_lib")
| 75 | + |
| 76 | + |
# Import the official IFBench scorer once at module import time so a broken
# setup surfaces immediately rather than during the first reward computation.
evaluation_lib = _load_evaluation_lib()
InputExample = evaluation_lib.InputExample


# JSON-like metadata record as stored alongside each example.
JsonDict = Dict[str, Any]
# Keyword arguments accepted by a single IFBench instruction check.
KwargsDict = Dict[str, Optional[Union[str, int, float]]]
| 83 | + |
| 84 | + |
| 85 | +def _normalize_instruction_ids(raw_ids: Sequence[Any]) -> List[str]: |
| 86 | + """Ensure instruction identifiers are clean strings.""" |
| 87 | + |
| 88 | + normalized: List[str] = [] |
| 89 | + for entry in raw_ids or []: |
| 90 | + if entry is None: |
| 91 | + continue |
| 92 | + text = str(entry).strip() |
| 93 | + if not text: |
| 94 | + continue |
| 95 | + normalized.append(text) |
| 96 | + return normalized |
| 97 | + |
| 98 | + |
| 99 | +def _coerce_kwargs_list( |
| 100 | + raw_kwargs: Any, |
| 101 | + num_instructions: int, |
| 102 | +) -> List[KwargsDict]: |
| 103 | + """Convert stored kwargs into the list structure expected by IFBench.""" |
| 104 | + |
| 105 | + if isinstance(raw_kwargs, list): |
| 106 | + processed: List[KwargsDict] = [] |
| 107 | + for entry in raw_kwargs: |
| 108 | + if isinstance(entry, dict): |
| 109 | + processed.append(dict(entry)) |
| 110 | + else: |
| 111 | + processed.append({}) |
| 112 | + elif isinstance(raw_kwargs, dict): |
| 113 | + processed = [dict(raw_kwargs) for _ in range(num_instructions)] |
| 114 | + else: |
| 115 | + processed = [{} for _ in range(num_instructions)] |
| 116 | + |
| 117 | + if len(processed) < num_instructions: |
| 118 | + tail = processed[-1] if processed else {} |
| 119 | + processed.extend([dict(tail) for _ in range(num_instructions - len(processed))]) |
| 120 | + elif len(processed) > num_instructions: |
| 121 | + processed = processed[:num_instructions] |
| 122 | + |
| 123 | + # Remove explicit None values to match official preprocessing. |
| 124 | + sanitized: List[KwargsDict] = [] |
| 125 | + for entry in processed: |
| 126 | + sanitized.append({k: v for k, v in entry.items() if v is not None}) |
| 127 | + return sanitized |
| 128 | + |
| 129 | + |
def _build_input_example(metadata: JsonDict) -> Optional[InputExample]:
    """Translate a metadata record into an IFBench ``InputExample``.

    Returns ``None`` (after a debug log) when the record carries no usable
    instruction identifiers.
    """

    ids = _normalize_instruction_ids(metadata.get("instruction_id_list") or [])
    if not ids:
        logger.debug("Missing instruction identifiers in metadata: %s", metadata)
        return None

    raw_prompt = metadata.get("prompt_text")
    prompt = "" if raw_prompt is None else str(raw_prompt)

    kwargs_list = _coerce_kwargs_list(metadata.get("kwargs"), len(ids))

    return InputExample(
        key=int(metadata.get("record_id") or 0),
        instruction_id_list=ids,
        prompt=prompt,
        kwargs=kwargs_list,
    )
| 151 | + |
| 152 | + |
def compute_ifbench_reward(response: str, label: Any, metadata: Optional[JsonDict] = None) -> float:
    """Score a model response using the official IFBench strict rules.

    Returns 1.0 when every instruction is followed, 0.0 otherwise (including
    when metadata/response is missing or no example can be built).
    """

    if metadata is None:
        logger.debug("No metadata provided for IFBench scoring.")
        return 0.0
    if response is None:
        return 0.0

    example = _build_input_example(metadata)
    if example is None:
        return 0.0

    # The official checker looks responses up by prompt text.
    responses = {example.prompt: str(response or "")}
    result = evaluation_lib.test_instruction_following_strict(example, responses)
    return 1.0 if result.follow_all_instructions else 0.0
0 commit comments