Skip to content

Commit 40c88f1

Browse files
yonromai and AlienKevin authored
evals: drop Fray v1 launch_evaluate_with_ray scaffolding (#4453) (#4953)
## Summary - Removes the deprecated `EvaluationConfig.launch_with_ray` dispatch path from the eval tree. Net **−343 LOC** across 11 files. - No behavior change on production call-sites (they already defaulted `launch_with_ray=False`). `tests/evals/test_lm_eval.py` still runs in-process. - Part of #4453 (Migrate all supported Marin code off Ray). ## What's removed - `Evaluator.launch_evaluate_with_ray` abstract method + the module-level helper that built a v1 `JobRequest` and called `fray.v1.cluster.current_cluster().launch(...)`. - Per-evaluator `launch_evaluate_with_ray` methods across all six evaluator files. - Dead `Dependency` dataclass and dead `get_runtime_env` methods on the two lm-eval evaluators. - `EvaluationConfig.launch_with_ray: bool` field. - `_to_v1_resource_config` v2→v1 adapter in `run.py`. - All `fray.v1.*` imports in `lib/marin/src/marin/evaluation/`. `rg 'fray\.v1' lib/marin/src/marin/evaluation experiments/evals tests/evals` → 0 matches. - Intermediate `LevanterTpuEvaluator` base class (collapsed into `LevanterLmEvalEvaluator`). ## Remaining `fray.v1` surfaces Orthogonal to evals, tracked in the #4453 follow-up PR stack — see the analysis doc for the full decomposition. ## Test plan - [x] \`./infra/pre-commit.py --all-files\` — green (ruff / black / pyrefly / license headers) - [x] \`uv run --package marin --extra cpu pytest tests/test_evaluator_utils.py\` — 1 passed - [x] \`uv run --package marin --extra cpu pytest tests/test_dry_run.py\` — 74 passed, 77 skipped (infra-gated, unrelated) - [x] \`pytest tests/evals/test_lm_eval.py --collect-only -q -m tpu_ci\` — 2 collected, imports resolve without \`launch_with_ray\` - [x] **TPU smoke on v5p-8 us-east5-a:** \`pytest tests/evals/test_lm_eval.py::test_lm_eval_harness -v -m tpu_ci\` — **PASSED in 196.72s** (matches the \`eval_testing.md:192\` baseline exactly). vLLM cold-start + gsm8k × 1 + GCS upload. 
\`test_lm_eval_harness_levanter\` is skipped: known broken upstream on \`transformers\` API drift, pre-existing and not touched by this PR. --------- Co-authored-by: Kevin Li <kevinli020508@gmail.com> Co-authored-by: Romain Yon <1596570+yonromai@users.noreply.github.com>
1 parent e233ba1 commit 40c88f1

11 files changed

Lines changed: 54 additions & 403 deletions

File tree

experiments/evals/evals.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
from fray.cluster import ResourceConfig
1313
from marin.evaluation.evaluation_config import EvalTaskConfig, EvaluationConfig
14+
from marin.evaluation.evaluators.harbor_evaluator import HARBOR_EVAL_ENV_KEYS, env_vars_from_keys
1415
from marin.evaluation.run import evaluate
1516
from marin.execution.remote import remote
1617
from marin.execution.executor import (
@@ -406,7 +407,7 @@ def evaluate_harbor(
406407
dataset: Harbor dataset name (e.g., "aime", "terminal-bench", "swebench-verified")
407408
version: Dataset version (e.g., "1.0", "2.0")
408409
max_eval_instances: Limit number of tasks to run
409-
resource_config: Resource configuration for Ray
410+
resource_config: Resource configuration for direct Iris execution
410411
apply_chat_template: Whether to apply chat template (not used by Harbor)
411412
wandb_tags: Tags for W&B logging
412413
generation_params: Generation parameters (not used by Harbor)
@@ -440,12 +441,17 @@ def evaluate_harbor(
440441
}
441442
}
442443

443-
# When model_path is set, the evaluator launches a fray sub-job for vLLM serving
444-
# with the correct resources. The outer executor step runs on CPU.
444+
# When model_path is set, the evaluator launches a colocated vLLM server on
445+
# the accelerator resources. The outer executor step runs on CPU for API models.
445446
dispatch_resources = ResourceConfig.with_cpu() if model_path else resource_config
446447
return ExecutorStep(
447448
name=f"evaluation/harbor/{model_name}-{dataset}-{version}",
448-
fn=remote(evaluate, resources=dispatch_resources, pip_dependency_groups=["harbor"]),
449+
fn=remote(
450+
evaluate,
451+
resources=dispatch_resources,
452+
env_vars=env_vars_from_keys(HARBOR_EVAL_ENV_KEYS),
453+
pip_dependency_groups=["harbor"],
454+
),
449455
config=EvaluationConfig(
450456
evaluator="harbor",
451457
model_name=model_name,

lib/marin/src/marin/evaluation/evaluation_config.py

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ class EvaluationConfig:
3434

3535
resource_config: ResourceConfig
3636
"""
37-
Additional keyword arguments to pass to the Ray resources.
37+
Resources to allocate for the eval step (passed to @remote).
3838
"""
3939

4040
model_name: str | None
@@ -71,11 +71,6 @@ class EvaluationConfig:
7171
Whether to discover the latest HF checkpoint in the model path.
7272
"""
7373

74-
launch_with_ray: bool = False
75-
"""
76-
Deprecated. Eval dispatch now uses Fray @remote via the executor.
77-
"""
78-
7974
max_eval_instances: int | None = None
8075
"""
8176
Maximum number of evaluation instances to run.

lib/marin/src/marin/evaluation/evaluators/evalchemy_evaluator.py

Lines changed: 2 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -32,11 +32,10 @@
3232
import traceback
3333
from collections.abc import Sequence
3434
from typing import ClassVar
35-
from fray.v1.cluster import ResourceConfig
3635
from rigging.filesystem import filesystem as marin_filesystem
3736

3837
from marin.evaluation.evaluation_config import WANDB_PROJECT, EvalTaskConfig
39-
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig, launch_evaluate_with_ray
38+
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig
4039
from marin.inference.vllm_server import resolve_model_name_or_path
4140
from marin.evaluation.utils import is_remote_path, upload_to_gcs
4241

@@ -670,7 +669,7 @@ def _run_evalchemy_in_process(
670669
"""Run evalchemy in-process using runpy instead of a subprocess.
671670
672671
Executes the evalchemy CLI entrypoint (eval.eval) directly in the current
673-
process. This ensures that when the Ray worker dies (due to error or preemption),
672+
process. This ensures that when the worker dies (due to error or preemption),
674673
all TPU handles die with it — no orphaned subprocesses.
675674
676675
Args:
@@ -1058,34 +1057,3 @@ def evaluate(
10581057
shutil.rmtree(self.RESULTS_PATH)
10591058
if local_config_dir and os.path.exists(local_config_dir):
10601059
shutil.rmtree(local_config_dir, ignore_errors=True)
1061-
1062-
def launch_evaluate_with_ray(
1063-
self,
1064-
model: ModelConfig,
1065-
evals: Sequence[EvalTaskConfig],
1066-
output_path: str,
1067-
resource_config: ResourceConfig,
1068-
max_eval_instances: int | None = None,
1069-
wandb_tags: list[str] | None = None,
1070-
) -> None:
1071-
"""Launch evaluation on Ray cluster with TPU resources."""
1072-
env_vars = {"HF_ALLOW_CODE_EVAL": "1"}
1073-
wandb_api_key = os.environ.get("WANDB_API_KEY")
1074-
if wandb_api_key:
1075-
env_vars["WANDB_API_KEY"] = wandb_api_key
1076-
wandb_entity = os.environ.get("WANDB_ENTITY")
1077-
if wandb_entity:
1078-
env_vars["WANDB_ENTITY"] = wandb_entity
1079-
1080-
launch_evaluate_with_ray(
1081-
evaluator=self,
1082-
job_name="evalchemy-tpu-evaluation",
1083-
model=model,
1084-
evals=evals,
1085-
output_path=output_path,
1086-
resource_config=resource_config,
1087-
max_eval_instances=max_eval_instances,
1088-
wandb_tags=wandb_tags,
1089-
extras=("evalchemy", "tpu", "vllm"),
1090-
env_vars=env_vars,
1091-
)

lib/marin/src/marin/evaluation/evaluators/evaluator.py

Lines changed: 0 additions & 99 deletions
Original file line number | Diff line number | Diff line change
@@ -2,29 +2,10 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
from abc import ABC, abstractmethod
5-
from collections.abc import Sequence
65
from dataclasses import dataclass
76
from typing import Any
87

9-
from fray.v1.cluster import Entrypoint, EnvironmentConfig, JobRequest, ResourceConfig, current_cluster
10-
118
from marin.evaluation.evaluation_config import EvalTaskConfig
12-
from marin.utils import remove_tpu_lockfile_on_exit
13-
from rigging.log_setup import configure_logging as _init_logging
14-
15-
16-
@dataclass(frozen=True)
17-
class Dependency:
18-
"""Represents a Python dependency e.g., transformers==4.9.2"""
19-
20-
name: str
21-
"""The name of the dependency e.g., transformers"""
22-
23-
version: str | None = None
24-
"""The version of the dependency e.g., 4.9.2"""
25-
26-
def __str__(self):
27-
return f"{self.name}=={self.version}" if self.version else self.name
289

2910

3011
@dataclass
@@ -57,29 +38,6 @@ class ModelConfig:
5738

5839

5940
class Evaluator(ABC):
60-
@abstractmethod
61-
def launch_evaluate_with_ray(
62-
self,
63-
model: ModelConfig,
64-
evals: list[EvalTaskConfig],
65-
output_path: str,
66-
resource_config: ResourceConfig,
67-
max_eval_instances: int | None = None,
68-
wandb_tags: list[str] | None = None,
69-
) -> None:
70-
"""
71-
Launches the evaluation run with Ray.
72-
73-
Args:
74-
model (ModelConfig): The model configuration of the model we want to evaluate
75-
evals (List[EvalTaskConfig]): The list of evaluations to run.
76-
output_path (str): The path to save the evaluation results.
77-
max_eval_instances (int | None): The maximum number of evaluation instances to run.
78-
step (ExecutorStep | None): The step to evaluate. Used to get the config for the model and the trainer.
79-
wandb_tags (list[str] | None): The tags to add to the wandb run.
80-
"""
81-
pass
82-
8341
@abstractmethod
8442
def evaluate(
8543
self,
@@ -91,60 +49,3 @@ def evaluate(
9149
) -> None:
9250
"""What to run to evaluate."""
9351
pass
94-
95-
96-
def launch_evaluate_with_ray(
97-
*,
98-
evaluator: Evaluator,
99-
job_name: str,
100-
model: ModelConfig,
101-
evals: list[EvalTaskConfig],
102-
output_path: str,
103-
resource_config: ResourceConfig,
104-
max_eval_instances: int | None = None,
105-
wandb_tags: list[str] | None = None,
106-
extras: Sequence[str] = (),
107-
pip_packages: Sequence[str] = (),
108-
env_vars: dict[str, str] | None = None,
109-
configure_logging: bool = True,
110-
max_retries_failure: int = 0,
111-
max_retries_preemption: int = 1000,
112-
) -> None:
113-
"""Launch an evaluator on the Ray/Fray cluster."""
114-
115-
def launch(
116-
model: ModelConfig,
117-
evals: list[EvalTaskConfig],
118-
output_path: str,
119-
max_eval_instances: int | None = None,
120-
wandb_tags: list[str] | None = None,
121-
) -> None:
122-
if configure_logging:
123-
import logging
124-
125-
_init_logging(level=logging.INFO)
126-
evaluator.evaluate(model, evals, output_path, max_eval_instances, wandb_tags)
127-
128-
def _run() -> None:
129-
with remove_tpu_lockfile_on_exit():
130-
launch(model, evals, output_path, max_eval_instances, wandb_tags)
131-
132-
if resource_config is None:
133-
resource_config = ResourceConfig()
134-
135-
job_request = JobRequest(
136-
name=job_name,
137-
entrypoint=Entrypoint.from_callable(_run),
138-
resources=resource_config,
139-
environment=EnvironmentConfig.create(
140-
extras=list(extras),
141-
pip_packages=list(pip_packages),
142-
env_vars=env_vars,
143-
),
144-
max_retries_failure=max_retries_failure,
145-
max_retries_preemption=max_retries_preemption,
146-
)
147-
148-
cluster = current_cluster()
149-
job_id = cluster.launch(job_request)
150-
cluster.wait(job_id, raise_on_failure=True)

lib/marin/src/marin/evaluation/evaluators/harbor_evaluator.py

Lines changed: 21 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -24,13 +24,12 @@
2424
from pathlib import Path
2525
from typing import Any
2626

27-
from fray.v1.cluster import ResourceConfig
2827
from rigging.filesystem import open_url
2928

3029
from marin.evaluation.evaluation_config import EvalTaskConfig
31-
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig, launch_evaluate_with_ray
30+
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig
3231
from marin.evaluation.utils import download_from_gcs, is_remote_path, upload_to_gcs
33-
from marin.inference.vllm_server import VLLM_NATIVE_PIP_PACKAGES, VllmEnvironment, resolve_vllm_mode
32+
from marin.inference.vllm_server import VllmEnvironment
3433
from marin.utils import fsspec_exists, fsspec_glob
3534

3635
logger = logging.getLogger(__name__)
@@ -45,6 +44,24 @@
4544
"output_cost_per_token": 0.0,
4645
}
4746

47+
HARBOR_EVAL_ENV_KEYS = (
48+
"WANDB_API_KEY",
49+
"WANDB_ENTITY",
50+
"WANDB_PROJECT",
51+
"HF_TOKEN",
52+
"ANTHROPIC_API_KEY",
53+
"OPENAI_API_KEY",
54+
"DAYTONA_API_KEY",
55+
"E2B_API_KEY",
56+
"MODAL_API_KEY",
57+
"TPU_CI",
58+
"MARIN_PREFIX",
59+
"MARIN_VLLM_MODE",
60+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN",
61+
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION",
62+
"VLLM_TPU_SKIP_PRECOMPILE",
63+
)
64+
4865

4966
def _sanitize_hosted_vllm_canonical_name(name: str) -> str:
5067
"""Return a Harbor-safe canonical name for `hosted_vllm/<canonical>`.
@@ -68,7 +85,7 @@ def _sanitize_hosted_vllm_canonical_name(name: str) -> str:
6885
return candidate
6986

7087

71-
def _env_vars_from_keys(keys: list[str]) -> dict[str, str]:
88+
def env_vars_from_keys(keys: list[str] | tuple[str, ...]) -> dict[str, str]:
7289
env_vars: dict[str, str] = {}
7390
for key in keys:
7491
value = os.environ.get(key)
@@ -261,73 +278,6 @@ def evaluate(
261278
version=version,
262279
)
263280

264-
def launch_evaluate_with_ray(
265-
self,
266-
model: ModelConfig,
267-
evals: list[EvalTaskConfig],
268-
output_path: str,
269-
resource_config: ResourceConfig,
270-
max_eval_instances: int | None = None,
271-
wandb_tags: list[str] | None = None,
272-
) -> None:
273-
"""Launch Harbor evaluation with Fray.
274-
275-
For local models (`model.path` is set), this runs on the provided TPU/GPU
276-
resources so vLLM can serve the model. For API models it runs in-process.
277-
"""
278-
279-
if model.path is None:
280-
self.evaluate(
281-
model=model,
282-
evals=evals,
283-
output_path=output_path,
284-
max_eval_instances=max_eval_instances,
285-
wandb_tags=wandb_tags,
286-
)
287-
return
288-
289-
mode_str = resolve_vllm_mode(None)
290-
pip_packages = VLLM_NATIVE_PIP_PACKAGES if mode_str == "native" else ()
291-
292-
env_vars = _env_vars_from_keys(
293-
[
294-
"WANDB_API_KEY",
295-
"WANDB_ENTITY",
296-
"WANDB_PROJECT",
297-
"HF_TOKEN",
298-
"ANTHROPIC_API_KEY",
299-
"OPENAI_API_KEY",
300-
"DAYTONA_API_KEY",
301-
"E2B_API_KEY",
302-
"MODAL_API_KEY",
303-
"TPU_CI",
304-
"MARIN_PREFIX",
305-
"MARIN_VLLM_MODE",
306-
"VLLM_ALLOW_LONG_MAX_MODEL_LEN",
307-
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION",
308-
"VLLM_TPU_SKIP_PRECOMPILE",
309-
]
310-
)
311-
env_vars.setdefault("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "1")
312-
env_vars.setdefault("VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION", "1")
313-
env_vars.setdefault("VLLM_TPU_SKIP_PRECOMPILE", "1")
314-
315-
launch_evaluate_with_ray(
316-
evaluator=self,
317-
job_name="harbor-vllm-eval",
318-
model=model,
319-
evals=evals,
320-
output_path=output_path,
321-
resource_config=resource_config,
322-
max_eval_instances=max_eval_instances,
323-
wandb_tags=wandb_tags,
324-
extras=("harbor", "tpu", "vllm"),
325-
pip_packages=pip_packages,
326-
env_vars=env_vars,
327-
max_retries_failure=0,
328-
max_retries_preemption=10,
329-
)
330-
331281
def _run_eval_inner(
332282
self,
333283
model_name: str,

lib/marin/src/marin/evaluation/evaluators/levanter_lm_eval_evaluator.py

Lines changed: 9 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -15,29 +15,20 @@
1515
from levanter.trainer import TrainerConfig
1616

1717
from marin.evaluation.evaluation_config import EvalTaskConfig, convert_to_levanter_task_config
18-
from marin.evaluation.evaluators.evaluator import ModelConfig
19-
from marin.evaluation.evaluators.levanter_tpu_evaluator import LevanterTpuEvaluator
20-
from fray.v1.cluster.ray.deps import build_runtime_env_for_packages
18+
from marin.evaluation.evaluators.evaluator import Evaluator, ModelConfig
2119

2220
logger = logging.getLogger(__name__)
2321

2422

25-
class LevanterLmEvalEvaluator(LevanterTpuEvaluator):
26-
"""For `Evaluator`s that runs inference with Levanter's Lm Eval Harness on TPUs."""
23+
class LevanterLmEvalEvaluator(Evaluator):
24+
"""Runs inference with Levanter's Lm Eval Harness on TPUs."""
2725

28-
def get_runtime_env(self) -> dict:
29-
"""
30-
Returns the runtime environment to run the evaluator on the Ray cluster.
31-
"""
32-
return build_runtime_env_for_packages(
33-
extra=["eval", "tpu"],
34-
pip_packages=["statsmodels==0.14.4"],
35-
env_vars={
36-
"TOKENIZERS_PARALLELISM": "false",
37-
"HF_DATASETS_TRUST_REMOTE_CODE": "1",
38-
"HF_ALLOW_CODE_EVAL": "1",
39-
},
40-
)
26+
@staticmethod
27+
def model_name_or_path(model: ModelConfig) -> str:
28+
"""Return a reference Levanter can read without staging to local disk."""
29+
if model.path is None:
30+
return model.name
31+
return model.path
4132

4233
def evaluate(
4334
self,

0 commit comments

Comments (0)