Skip to content

Commit aa7e66a

Browse files
Add nemo skills evaluation (#989)
Co-authored-by: miles.pr.bot <miles.pr.bot@gmail.com>
1 parent f600569 commit aa7e66a

File tree

15 files changed

+1256
-10
lines changed

15 files changed

+1256
-10
lines changed

examples/eval/README.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Docs
2+
3+
## Prerequisites
4+
- A writable host directory for cached data (`/data/.cache`)
5+
- Choose descriptive container names to replace the placeholders (`<slime container name>`, `<env container name>`).
6+
7+
## 1) Prepare host network
8+
```bash
9+
docker network create skills-net
10+
```
11+
12+
## 2) Launch the slime container
13+
```bash
14+
docker run \
15+
-itd \
16+
--shm-size 32g \
17+
--gpus all \
18+
-v /data/.cache:/root/.cache \
19+
-v /dev/shm:/shm \
20+
--ipc=host \
21+
--privileged \
22+
--network skills-net \
23+
--name <slime container name> \
24+
slimerl/slime:latest \
25+
/bin/bash
26+
```
27+
28+
## 3) Launch the Skills container
29+
```bash
30+
docker run \
31+
-itd \
32+
--shm-size 32g \
33+
--gpus all \
34+
-v /data/.cache:/root/.cache \
35+
-v /dev/shm:/shm \
36+
--ipc=host \
37+
--privileged \
38+
--network skills-net \
39+
--name <env container name> \
40+
--network-alias skills_server \
41+
guapisolo/nemoskills:0.7.1 \
42+
/bin/bash
43+
```
44+
45+
## 4) Inside the Skills container
46+
Clone repos and install the Skills package:
47+
```bash
48+
git clone -b slime_skills https://github.com/guapisolo/slime.git /opt/slime
49+
git clone -b slime https://github.com/guapisolo/Skills.git /opt/Skills
50+
51+
cd /opt/Skills
52+
pip install -e .
53+
```
54+
55+
Download/prepare datasets:
56+
```bash
57+
cd /opt/Skills/nemo_skills/dataset
58+
python3 aime25/prepare.py
59+
python3 hle/prepare.py
60+
python3 arena-hard/prepare.py
61+
```
62+
63+
Start the skills server:
64+
```bash
65+
cd /opt/slime
66+
python examples/eval/nemo_skills/skills_server.py \
67+
--host 0.0.0.0 \
68+
--port 9050 \
69+
--output-root /opt/skills-eval \
70+
--config-dir examples/eval/nemo_skills/config \
71+
--cluster local_cluster \
72+
--max-concurrent-requests 512 \
73+
--openai-model-name slime-openai-model
74+
```
75+
76+
You can now connect to the server at `skills_server:9050` from within the `skills-net` Docker network. The server always proxies evaluation traffic to an OpenAI-compatible sglang router (Slime starts and manages the router), so adjust `--openai-model-name` and `--max-concurrent-requests` as needed for your deployment.

examples/eval/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Evaluation helpers and example configs."""

examples/eval/eval_delegate.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
import logging
2+
from dataclasses import dataclass, field, fields
3+
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence
4+
5+
from omegaconf import OmegaConf
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
def _first_not_none(*values: Any) -> Any:
11+
for value in values:
12+
if value is not None:
13+
return value
14+
return None
15+
16+
17+
def _pick_from_mapping(data: Optional[Mapping[str, Any]], keys: Iterable[str]) -> Any:
18+
if not data:
19+
return None
20+
for key in keys:
21+
if key in data and data[key] is not None:
22+
return data[key]
23+
return None
24+
25+
26+
@dataclass
class EvalEnvDatasetConfig:
    """Dataset-level generation parameters shared across delegate clients."""

    name: str = ""
    n_samples_per_eval_prompt: Optional[int] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    top_k: Optional[int] = None
    max_response_len: Optional[int] = None

    # Per-field lookup precedence: dataset entry -> shared defaults -> CLI args.
    # TODO: unify the config names for dataset, default, and args (advice from Tom);
    # this mapping papers over the current naming mismatch.
    FIELD_SPECS = {
        "n_samples_per_eval_prompt": {
            "dataset_keys": ("n_samples_per_eval_prompt",),
            "default_keys": ("n_samples_per_eval_prompt",),
            "arg_attrs": ("n_samples_per_eval_prompt", "n_samples_per_prompt"),
        },
        "temperature": {
            "dataset_keys": ("temperature",),
            "default_keys": ("temperature",),
            "arg_attrs": ("eval_temperature", "rollout_temperature"),
        },
        "top_p": {
            "dataset_keys": ("top_p",),
            "default_keys": ("top_p",),
            "arg_attrs": ("eval_top_p", "rollout_top_p"),
        },
        "top_k": {
            "dataset_keys": ("top_k",),
            "default_keys": ("top_k",),
            "arg_attrs": ("eval_top_k", "rollout_top_k"),
        },
        "max_response_len": {
            "dataset_keys": ("max_response_len",),
            "default_keys": ("max_response_len",),
            "arg_attrs": ("eval_max_response_len", "rollout_max_response_len"),
        },
    }

    @classmethod
    def parse(cls, args, dataset_cfg: Mapping[str, Any], defaults: Mapping[str, Any]) -> "EvalEnvDatasetConfig":
        """Merge dataset overrides with defaults/CLI settings and coerce types via OmegaConf."""
        defaults = defaults or {}
        name = str(dataset_cfg.get("name", "")).strip()
        if not name:
            raise ValueError("Each delegate dataset entry must include a non-empty `name`.")
        if ":" in name:
            raise ValueError(
                "Colon in dataset name is not allowed; use `n_samples_per_eval_prompt` to configure samples per prompt."
            )

        merged: Dict[str, Any] = {"name": name}
        for attr, spec in cls.FIELD_SPECS.items():
            candidates = [
                _pick_from_mapping(dataset_cfg, spec["dataset_keys"]),
                _pick_from_mapping(defaults, spec["default_keys"]),
            ]
            candidates.extend(getattr(args, source, None) for source in spec["arg_attrs"])
            merged[attr] = _first_not_none(*candidates)

        # Round-trip through the structured schema so values are type-coerced.
        structured = OmegaConf.merge(OmegaConf.structured(cls), OmegaConf.create(merged))
        result = OmegaConf.to_object(structured)
        return result if isinstance(result, cls) else cls(**result)

    def to_payload(self) -> Dict[str, Any]:
        """Return a JSON-serializable payload containing only the non-``None`` fields."""
        return {
            spec.name: getattr(self, spec.name)
            for spec in fields(self)
            if getattr(self, spec.name) is not None
        }
100+
101+
102+
@dataclass
class EvalEnvConfig:
    """Environment definition shared across delegate implementations."""

    name: str = ""  # environment identifier, e.g. "skills"
    url: Optional[str] = None  # endpoint of the external evaluation server
    timeout_secs: int = 3600
    max_retries: int = 1
    headers: Dict[str, Any] = field(default_factory=dict)
    defaults: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def parse(cls, raw: Mapping[str, Any], defaults: Mapping[str, Any]) -> "EvalEnvConfig":
        """Build an instance from *raw*, type-coerced through the structured schema.

        NOTE(review): *defaults* is accepted but never consulted here — confirm intended.
        """
        merged = OmegaConf.merge(OmegaConf.structured(cls), OmegaConf.create(raw or {}))
        parsed = OmegaConf.to_object(merged)
        return parsed if isinstance(parsed, cls) else cls(**parsed)
121+
122+
123+
def _rebuild_delegate_config(
    args, raw_delegate_config: Optional[Sequence[Mapping[str, Any]]], defaults: Optional[Mapping[str, Any]]
) -> List[EvalEnvConfig]:
    """Instantiate one ``EvalEnvConfig`` per recognized delegate entry.

    Entries without a name are skipped with a warning; an unknown environment
    name raises ``ValueError``.
    """
    defaults = defaults or {}
    configs: List[EvalEnvConfig] = []
    for entry in raw_delegate_config or []:
        env_name = str(entry.get("name", "")).strip().lower()
        if not env_name:
            logger.warning("Each delegate entry must include a non-empty `name`.")
            continue
        if env_name != "skills":
            raise ValueError(f"Unknown delegate environment: {env_name}")
        # Imported lazily so the skills extras are only required when configured.
        from examples.eval.nemo_skills.skills_config import build_skills_eval_env_config

        cfg = build_skills_eval_env_config(args, entry, defaults)
        if cfg is not None:
            configs.append(cfg)
    return configs
142+
143+
144+
class EvalDelegateError(RuntimeError):
    """Signals that the external evaluation server reported an error."""
146+
147+
148+
class EvalClient:
    """Abstract base for a single-environment evaluation client."""

    # Environment name this client serves; assigned in __init__.
    name: str = ""

    def __init__(self, name: str):
        self.name = name

    def evaluate(self, args, rollout_id: int) -> tuple[Dict[str, Any], Dict[str, Any]]:
        """Run one evaluation round and return ``(metrics, raw_response)``."""
        raise NotImplementedError("Subclasses must implement this method")
156+
157+
158+
def _flatten(result: Dict[str, Any], prefix: Optional[str] = None) -> Dict[str, Any]:
159+
"""Flatten nested metric dicts into slash separated keys."""
160+
flattened: Dict[str, Any] = {}
161+
for key, value in (result or {}).items():
162+
full_key = f"{prefix}/{key}" if prefix else key
163+
if isinstance(value, dict):
164+
flattened.update(_flatten(value, full_key))
165+
else:
166+
flattened[full_key] = value
167+
return flattened
168+
169+
170+
class EvalDelegateClient:
    """Aggregate multiple environment-specific delegate clients."""

    def __init__(self, delegates: Sequence[EvalClient]):
        self._delegates = list(delegates)

    @classmethod
    def maybe_create(
        cls, args, env_configs: Optional[Sequence[EvalEnvConfig]] = None
    ) -> Optional["EvalDelegateClient"]:
        """Return a client wrapping every resolvable delegate, or ``None`` if none exist."""
        if env_configs is not None:
            resolved = list(env_configs)
        else:
            resolved = getattr(args, "eval_delegate_config", None)
        if not resolved:
            return None

        router_addr = f"http://{args.sglang_router_ip}:{args.sglang_router_port}"
        clients = [
            client
            for client in (cls._create_delegate(cfg, router_addr) for cfg in resolved)
            if client is not None
        ]
        return cls(clients) if clients else None

    @staticmethod
    def _create_delegate(env_cfg: EvalEnvConfig, router_addr: str):
        """Instantiate the client for *env_cfg*, or ``None`` when unrecognized."""
        if env_cfg.name == "skills":
            # Imported lazily so the skills extras are only pulled in when configured.
            from examples.eval.nemo_skills.skills_client import SkillsEvalClient

            return SkillsEvalClient.from_config(env_cfg, router_addr)
        logger.warning("No delegate client registered for environment: %s", env_cfg.name)
        return None

    def evaluate(self, args, rollout_id: int) -> tuple[Dict[str, Any], Dict[str, Any]]:
        """Run every delegate; merge their metrics and collect raw responses by name."""
        merged_metrics: Dict[str, Any] = {}
        responses: Dict[str, Any] = {}
        for client in self._delegates:
            metrics, response = client.evaluate(args, rollout_id)
            if metrics:
                merged_metrics.update(metrics)
            responses[client.name] = response
        return merged_metrics, responses

0 commit comments

Comments
 (0)