Skip to content

Commit 40c88f1

Browse files
yonromai and AlienKevin authored
evals: drop Fray v1 launch_evaluate_with_ray scaffolding (#4453) (#4953)
## Summary - Removes the deprecated `EvaluationConfig.launch_with_ray` dispatch path from the eval tree. Net **−343 LOC** across 11 files. - No behavior change on production call-sites (they already defaulted `launch_with_ray=False`). `tests/evals/test_lm_eval.py` still runs in-process. - Part of #4453 (Migrate all supported Marin code off Ray). ## What's removed - `Evaluator.launch_evaluate_with_ray` abstract method + the module-level helper that built a v1 `JobRequest` and called `fray.v1.cluster.current_cluster().launch(...)`. - Per-evaluator `launch_evaluate_with_ray` methods across all six evaluator files. - Dead `Dependency` dataclass and dead `get_runtime_env` methods on the two lm-eval evaluators. - `EvaluationConfig.launch_with_ray: bool` field. - `_to_v1_resource_config` v2→v1 adapter in `run.py`. - All `fray.v1.*` imports in `lib/marin/src/marin/evaluation/`. `rg 'fray\.v1' lib/marin/src/marin/evaluation experiments/evals tests/evals` → 0 matches. - Intermediate `LevanterTpuEvaluator` base class (collapsed into `LevanterLmEvalEvaluator`). ## Remaining `fray.v1` surfaces Orthogonal to evals, tracked in the #4453 follow-up PR stack — see the analysis doc for the full decomposition. ## Test plan - [x] \`./infra/pre-commit.py --all-files\` — green (ruff / black / pyrefly / license headers) - [x] \`uv run --package marin --extra cpu pytest tests/test_evaluator_utils.py\` — 1 passed - [x] \`uv run --package marin --extra cpu pytest tests/test_dry_run.py\` — 74 passed, 77 skipped (infra-gated, unrelated) - [x] \`pytest tests/evals/test_lm_eval.py --collect-only -q -m tpu_ci\` — 2 collected, imports resolve without \`launch_with_ray\` - [x] **TPU smoke on v5p-8 us-east5-a:** \`pytest tests/evals/test_lm_eval.py::test_lm_eval_harness -v -m tpu_ci\` — **PASSED in 196.72s** (matches the \`eval_testing.md:192\` baseline exactly). vLLM cold-start + gsm8k × 1 + GCS upload. 
\`test_lm_eval_harness_levanter\` is skipped: known broken upstream on \`transformers\` API drift, pre-existing and not touched by this PR. --------- Co-authored-by: Kevin Li <kevinli020508@gmail.com> Co-authored-by: Romain Yon <1596570+yonromai@users.noreply.github.com>
1 parent e233ba1 commit 40c88f1

11 files changed

Lines changed: 54 additions & 403 deletions

File tree

experiments/evals/evals.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
from fray.cluster import ResourceConfig
1313
from marin.evaluation.evaluation_config import EvalTaskConfig, EvaluationConfig
14+
from marin.evaluation.evaluators.harbor_evaluator import HARBOR_EVAL_ENV_KEYS, env_vars_from_keys
1415
from marin.evaluation.run import evaluate
1516
from marin.execution.remote import remote
1617
from marin.execution.executor import (
@@ -406,7 +407,7 @@ def evaluate_harbor(
406407
dataset: Harbor dataset name (e.g., "aime", "terminal-bench", "swebench-verified")
407408
version: Dataset version (e.g., "1.0", "2.0")
408409
max_eval_instances: Limit number of tasks to run
409-
resource_config: Resource configuration for Ray
410+
resource_config: Resource configuration for direct Iris execution
410411
apply_chat_template: Whether to apply chat template (not used by Harbor)
411412
wandb_tags: Tags for W&B logging
412413
generation_params: Generation parameters (not used by Harbor)
@@ -440,12 +441,17 @@ def evaluate_harbor(
440441
}
441442
}
442443

443-
# When model_path is set, the evaluator launches a fray sub-job for vLLM serving
444-
# with the correct resources. The outer executor step runs on CPU.
444+
# When model_path is set, the evaluator launches a colocated vLLM server on
445+
# the accelerator resources. The outer executor step runs on CPU for API models.
445446
dispatch_resources = ResourceConfig.with_cpu() if model_path else resource_config
446447
return ExecutorStep(
447448
name=f"evaluation/harbor/{model_name}-{dataset}-{version}",
448-
fn=remote(evaluate, resources=dispatch_resources, pip_dependency_groups=["harbor"]),
449+
fn=remote(
450+
evaluate,
451+
resources=dispatch_resources,
452+
env_vars=env_vars_from_keys(HARBOR_EVAL_ENV_KEYS),
453+
pip_dependency_groups=["harbor"],
454+
),
449455
config=EvaluationConfig(
450456
evaluator="harbor",
451457
model_name=model_name,

lib/marin/src/marin/evaluation/evaluation_config.py

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ class EvaluationConfig:
3434

3535
resource_config: ResourceConfig
3636
"""
37-
Additional keyword arguments to pass to the Ray resources.
37+
Resources to allocate for the eval step (passed to @remote).
3838
"""
3939

4040
model_name: str | None
@@ -71,11 +71,6 @@ class EvaluationConfig:
7171
Whether to discover the latest HF checkpoint in the model path.
7272
"""
7373

74-
launch_with_ray: bool = False
75-
"""
76-
Deprecated. Eval dispatch now uses Fray @remote via the executor.
77-
"""
78-
7974
max_eval_instances: int | None = None
8075
"""
8176
Maximum number of evaluation instances to run.

lib/marin/src/marin/evaluation/evaluators/evalchemy_evaluator.py

Lines changed: 2 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -32,11 +32,10 @@
3232
import traceback
3333
from collections.abc import Sequence
3434
from typing import ClassVar
35-
from fray.v1.cluster import ResourceConfig
3635
from rigging.filesystem import filesystem as marin_filesystem
3736

3837
from marin.evaluation.evaluation_config import WANDB_PROJECT, EvalTaskConfig
39-
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig, launch_evaluate_with_ray
38+
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig
4039
from marin.inference.vllm_server import resolve_model_name_or_path
4140
from marin.evaluation.utils import is_remote_path, upload_to_gcs
4241

@@ -670,7 +669,7 @@ def _run_evalchemy_in_process(
670669
"""Run evalchemy in-process using runpy instead of a subprocess.
671670
672671
Executes the evalchemy CLI entrypoint (eval.eval) directly in the current
673-
process. This ensures that when the Ray worker dies (due to error or preemption),
672+
process. This ensures that when the worker dies (due to error or preemption),
674673
all TPU handles die with it — no orphaned subprocesses.
675674
676675
Args:
@@ -1058,34 +1057,3 @@ def evaluate(
10581057
shutil.rmtree(self.RESULTS_PATH)
10591058
if local_config_dir and os.path.exists(local_config_dir):
10601059
shutil.rmtree(local_config_dir, ignore_errors=True)
1061-
1062-
def launch_evaluate_with_ray(
1063-
self,
1064-
model: ModelConfig,
1065-
evals: Sequence[EvalTaskConfig],
1066-
output_path: str,
1067-
resource_config: ResourceConfig,
1068-
max_eval_instances: int | None = None,
1069-
wandb_tags: list[str] | None = None,
1070-
) -> None:
1071-
"""Launch evaluation on Ray cluster with TPU resources."""
1072-
env_vars = {"HF_ALLOW_CODE_EVAL": "1"}
1073-
wandb_api_key = os.environ.get("WANDB_API_KEY")
1074-
if wandb_api_key:
1075-
env_vars["WANDB_API_KEY"] = wandb_api_key
1076-
wandb_entity = os.environ.get("WANDB_ENTITY")
1077-
if wandb_entity:
1078-
env_vars["WANDB_ENTITY"] = wandb_entity
1079-
1080-
launch_evaluate_with_ray(
1081-
evaluator=self,
1082-
job_name="evalchemy-tpu-evaluation",
1083-
model=model,
1084-
evals=evals,
1085-
output_path=output_path,
1086-
resource_config=resource_config,
1087-
max_eval_instances=max_eval_instances,
1088-
wandb_tags=wandb_tags,
1089-
extras=("evalchemy", "tpu", "vllm"),
1090-
env_vars=env_vars,
1091-
)

lib/marin/src/marin/evaluation/evaluators/evaluator.py

Lines changed: 0 additions & 99 deletions
Original file line number | Diff line number | Diff line change
@@ -2,29 +2,10 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
from abc import ABC, abstractmethod
5-
from collections.abc import Sequence
65
from dataclasses import dataclass
76
from typing import Any
87

9-
from fray.v1.cluster import Entrypoint, EnvironmentConfig, JobRequest, ResourceConfig, current_cluster
10-
118
from marin.evaluation.evaluation_config import EvalTaskConfig
12-
from marin.utils import remove_tpu_lockfile_on_exit
13-
from rigging.log_setup import configure_logging as _init_logging
14-
15-
16-
@dataclass(frozen=True)
17-
class Dependency:
18-
"""Represents a Python dependency e.g., transformers==4.9.2"""
19-
20-
name: str
21-
"""The name of the dependency e.g., transformers"""
22-
23-
version: str | None = None
24-
"""The version of the dependency e.g., 4.9.2"""
25-
26-
def __str__(self):
27-
return f"{self.name}=={self.version}" if self.version else self.name
289

2910

3011
@dataclass
@@ -57,29 +38,6 @@ class ModelConfig:
5738

5839

5940
class Evaluator(ABC):
60-
@abstractmethod
61-
def launch_evaluate_with_ray(
62-
self,
63-
model: ModelConfig,
64-
evals: list[EvalTaskConfig],
65-
output_path: str,
66-
resource_config: ResourceConfig,
67-
max_eval_instances: int | None = None,
68-
wandb_tags: list[str] | None = None,
69-
) -> None:
70-
"""
71-
Launches the evaluation run with Ray.
72-
73-
Args:
74-
model (ModelConfig): The model configuration of the model we want to evaluate
75-
evals (List[EvalTaskConfig]): The list of evaluations to run.
76-
output_path (str): The path to save the evaluation results.
77-
max_eval_instances (int | None): The maximum number of evaluation instances to run.
78-
step (ExecutorStep | None): The step to evaluate. Used to get the config for the model and the trainer.
79-
wandb_tags (list[str] | None): The tags to add to the wandb run.
80-
"""
81-
pass
82-
8341
@abstractmethod
8442
def evaluate(
8543
self,
@@ -91,60 +49,3 @@ def evaluate(
9149
) -> None:
9250
"""What to run to evaluate."""
9351
pass
94-
95-
96-
def launch_evaluate_with_ray(
97-
*,
98-
evaluator: Evaluator,
99-
job_name: str,
100-
model: ModelConfig,
101-
evals: list[EvalTaskConfig],
102-
output_path: str,
103-
resource_config: ResourceConfig,
104-
max_eval_instances: int | None = None,
105-
wandb_tags: list[str] | None = None,
106-
extras: Sequence[str] = (),
107-
pip_packages: Sequence[str] = (),
108-
env_vars: dict[str, str] | None = None,
109-
configure_logging: bool = True,
110-
max_retries_failure: int = 0,
111-
max_retries_preemption: int = 1000,
112-
) -> None:
113-
"""Launch an evaluator on the Ray/Fray cluster."""
114-
115-
def launch(
116-
model: ModelConfig,
117-
evals: list[EvalTaskConfig],
118-
output_path: str,
119-
max_eval_instances: int | None = None,
120-
wandb_tags: list[str] | None = None,
121-
) -> None:
122-
if configure_logging:
123-
import logging
124-
125-
_init_logging(level=logging.INFO)
126-
evaluator.evaluate(model, evals, output_path, max_eval_instances, wandb_tags)
127-
128-
def _run() -> None:
129-
with remove_tpu_lockfile_on_exit():
130-
launch(model, evals, output_path, max_eval_instances, wandb_tags)
131-
132-
if resource_config is None:
133-
resource_config = ResourceConfig()
134-
135-
job_request = JobRequest(
136-
name=job_name,
137-
entrypoint=Entrypoint.from_callable(_run),
138-
resources=resource_config,
139-
environment=EnvironmentConfig.create(
140-
extras=list(extras),
141-
pip_packages=list(pip_packages),
142-
env_vars=env_vars,
143-
),
144-
max_retries_failure=max_retries_failure,
145-
max_retries_preemption=max_retries_preemption,
146-
)
147-
148-
cluster = current_cluster()
149-
job_id = cluster.launch(job_request)
150-
cluster.wait(job_id, raise_on_failure=True)

lib/marin/src/marin/evaluation/evaluators/harbor_evaluator.py

Lines changed: 21 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -24,13 +24,12 @@
2424
from pathlib import Path
2525
from typing import Any
2626

27-
from fray.v1.cluster import ResourceConfig
2827
from rigging.filesystem import open_url
2928

3029
from marin.evaluation.evaluation_config import EvalTaskConfig
31-
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig, launch_evaluate_with_ray
30+
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig
3231
from marin.evaluation.utils import download_from_gcs, is_remote_path, upload_to_gcs
33-
from marin.inference.vllm_server import VLLM_NATIVE_PIP_PACKAGES, VllmEnvironment, resolve_vllm_mode
32+
from marin.inference.vllm_server import VllmEnvironment
3433
from marin.utils import fsspec_exists, fsspec_glob
3534

3635
logger = logging.getLogger(__name__)
@@ -45,6 +44,24 @@
4544
"output_cost_per_token": 0.0,
4645
}
4746

47+
HARBOR_EVAL_ENV_KEYS = (
48+
"WANDB_API_KEY",
49+
"WANDB_ENTITY",
50+
"WANDB_PROJECT",
51+
"HF_TOKEN",
52+
"ANTHROPIC_API_KEY",
53+
"OPENAI_API_KEY",
54+
"DAYTONA_API_KEY",
55+
"E2B_API_KEY",
56+
"MODAL_API_KEY",
57+
"TPU_CI",
58+
"MARIN_PREFIX",
59+
"MARIN_VLLM_MODE",
60+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN",
61+
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION",
62+
"VLLM_TPU_SKIP_PRECOMPILE",
63+
)
64+
4865

4966
def _sanitize_hosted_vllm_canonical_name(name: str) -> str:
5067
"""Return a Harbor-safe canonical name for `hosted_vllm/<canonical>`.
@@ -68,7 +85,7 @@ def _sanitize_hosted_vllm_canonical_name(name: str) -> str:
6885
return candidate
6986

7087

71-
def _env_vars_from_keys(keys: list[str]) -> dict[str, str]:
88+
def env_vars_from_keys(keys: list[str] | tuple[str, ...]) -> dict[str, str]:
7289
env_vars: dict[str, str] = {}
7390
for key in keys:
7491
value = os.environ.get(key)
@@ -261,73 +278,6 @@ def evaluate(
261278
version=version,
262279
)
263280

264-
def launch_evaluate_with_ray(
265-
self,
266-
model: ModelConfig,
267-
evals: list[EvalTaskConfig],
268-
output_path: str,
269-
resource_config: ResourceConfig,
270-
max_eval_instances: int | None = None,
271-
wandb_tags: list[str] | None = None,
272-
) -> None:
273-
"""Launch Harbor evaluation with Fray.
274-
275-
For local models (`model.path` is set), this runs on the provided TPU/GPU
276-
resources so vLLM can serve the model. For API models it runs in-process.
277-
"""
278-
279-
if model.path is None:
280-
self.evaluate(
281-
model=model,
282-
evals=evals,
283-
output_path=output_path,
284-
max_eval_instances=max_eval_instances,
285-
wandb_tags=wandb_tags,
286-
)
287-
return
288-
289-
mode_str = resolve_vllm_mode(None)
290-
pip_packages = VLLM_NATIVE_PIP_PACKAGES if mode_str == "native" else ()
291-
292-
env_vars = _env_vars_from_keys(
293-
[
294-
"WANDB_API_KEY",
295-
"WANDB_ENTITY",
296-
"WANDB_PROJECT",
297-
"HF_TOKEN",
298-
"ANTHROPIC_API_KEY",
299-
"OPENAI_API_KEY",
300-
"DAYTONA_API_KEY",
301-
"E2B_API_KEY",
302-
"MODAL_API_KEY",
303-
"TPU_CI",
304-
"MARIN_PREFIX",
305-
"MARIN_VLLM_MODE",
306-
"VLLM_ALLOW_LONG_MAX_MODEL_LEN",
307-
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION",
308-
"VLLM_TPU_SKIP_PRECOMPILE",
309-
]
310-
)
311-
env_vars.setdefault("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "1")
312-
env_vars.setdefault("VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION", "1")
313-
env_vars.setdefault("VLLM_TPU_SKIP_PRECOMPILE", "1")
314-
315-
launch_evaluate_with_ray(
316-
evaluator=self,
317-
job_name="harbor-vllm-eval",
318-
model=model,
319-
evals=evals,
320-
output_path=output_path,
321-
resource_config=resource_config,
322-
max_eval_instances=max_eval_instances,
323-
wandb_tags=wandb_tags,
324-
extras=("harbor", "tpu", "vllm"),
325-
pip_packages=pip_packages,
326-
env_vars=env_vars,
327-
max_retries_failure=0,
328-
max_retries_preemption=10,
329-
)
330-
331281
def _run_eval_inner(
332282
self,
333283
model_name: str,

lib/marin/src/marin/evaluation/evaluators/levanter_lm_eval_evaluator.py

Lines changed: 9 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -15,29 +15,20 @@
1515
from levanter.trainer import TrainerConfig
1616

1717
from marin.evaluation.evaluation_config import EvalTaskConfig, convert_to_levanter_task_config
18-
from marin.evaluation.evaluators.evaluator import ModelConfig
19-
from marin.evaluation.evaluators.levanter_tpu_evaluator import LevanterTpuEvaluator
20-
from fray.v1.cluster.ray.deps import build_runtime_env_for_packages
18+
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig
2119

2220
logger = logging.getLogger(__name__)
2321

2422

25-
class LevanterLmEvalEvaluator(LevanterTpuEvaluator):
26-
"""For `Evaluator`s that runs inference with Levanter's Lm Eval Harness on TPUs."""
23+
class LevanterLmEvalEvaluator(Evaluator):
24+
"""Runs inference with Levanter's Lm Eval Harness on TPUs."""
2725

28-
def get_runtime_env(self) -> dict:
29-
"""
30-
Returns the runtime environment to run the evaluator on the Ray cluster.
31-
"""
32-
return build_runtime_env_for_packages(
33-
extra=["eval", "tpu"],
34-
pip_packages=["statsmodels==0.14.4"],
35-
env_vars={
36-
"TOKENIZERS_PARALLELISM": "false",
37-
"HF_DATASETS_TRUST_REMOTE_CODE": "1",
38-
"HF_ALLOW_CODE_EVAL": "1",
39-
},
40-
)
26+
@staticmethod
27+
def model_name_or_path(model: ModelConfig) -> str:
28+
"""Return a reference Levanter can read without staging to local disk."""
29+
if model.path is None:
30+
return model.name
31+
return model.path
4132

4233
def evaluate(
4334
self,

0 commit comments

Comments (0)