Commit a4991d2

Migrate resource configuration to Fray (#2154)
This migrates our various accelerator & resource configurations to use Fray. Naturally, we had at least three ways to specify these in the past. This doesn't remove all of the duplication, but it gets us most of the way there. Execution is still handled via Ray as usual; this change just replaces our various `V6_TPU_STRICT_PACK` etc. constants with Fray equivalents. All training & evaluation jobs now use a Fray `ResourceConfig` to specify the accelerator type, number of slices, and so on. We also port the FLOPs calculation over to Fray, since it was duplicated in a few places. As before, training jobs rely on the usual `ray_tpu` logic to pack TPU workers properly. For evaluation jobs, instead of the various Ray-specific helpers, we now inject strict packing via the `scheduling_strategy` helper when launching the evaluation jobs themselves.
1 parent 7f6e5ba commit a4991d2
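The unified shape the commit message describes can be sketched as follows. This is an illustration only: `MiniResourceConfig` and its fields are hypothetical stand-ins, not the actual `fray.cluster.ResourceConfig` implementation; only the factory-method names (`with_tpu`, `with_gpu`, `with_cpu`) mirror the ones that appear in the diffs.

```python
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class MiniResourceConfig:
    """Hypothetical stand-in for fray.cluster.ResourceConfig (illustration only)."""

    device: str                  # "cpu", "gpu", or "tpu"
    variant: str | None = None   # e.g. "v6e-8" or "H100"
    count: int = 1               # GPU count, or TPU slice count

    @classmethod
    def with_tpu(cls, tpu_type: str, slice_count: int = 1) -> MiniResourceConfig:
        return cls(device="tpu", variant=tpu_type, count=slice_count)

    @classmethod
    def with_gpu(cls, gpu_type: str | None = None, count: int = 1) -> MiniResourceConfig:
        # gpu_type=None would mean "auto-detect" in the real API
        return cls(device="gpu", variant=gpu_type, count=count)

    @classmethod
    def with_cpu(cls) -> MiniResourceConfig:
        return cls(device="cpu")


cfg = MiniResourceConfig.with_tpu("v6e-8", slice_count=2)
print(cfg.device, cfg.variant, cfg.count)  # tpu v6e-8 2
```

The point of the pattern is that one frozen value type covers CPU, GPU, and multislice TPU, so training and evaluation code paths can share a single resource spec instead of parallel constant tables.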

File tree

132 files changed (+1198 −1550 lines)


README.md

Lines changed: 3 additions & 2 deletions
@@ -53,11 +53,12 @@ You can check out the [full script](https://github.com/marin-community/marin/blo
 <!--marin-example-start-->
 
 ```python
+from fray.cluster import ResourceConfig
+
 from experiments.defaults import default_tokenize, default_train
 from experiments.llama import llama3_tokenizer, llama_nano
 from experiments.simple_train_config import SimpleTrainConfig
 from marin.execution.executor import executor_main
-from marin.resources import CpuOnlyConfig
 
 # 1. Choose a dataset
 tinystories_hf_id = "roneneldan/TinyStories"
@@ -72,7 +73,7 @@ tinystories_tokenized = default_tokenize(
 # 3. Define training configuration
 nano_train_config = SimpleTrainConfig(
     # Here we define the hardware resources we need.
-    resources=CpuOnlyConfig(num_cpus=1),
+    resources=ResourceConfig.with_cpu(),
     train_batch_size=4,
     num_train_steps=100,
     # set hyperparameters

docs/references/resource-config.md

Lines changed: 31 additions & 7 deletions
@@ -1,17 +1,41 @@
 # Hardware Resource Configuration
 
 Marin uses Ray for scheduling and resource management. Ray provides a flexible resource model that allows you to specify
-the resources that a task requires. In Marin, we specify a few wrapper types for common hardware configurations.
+the resources that a task requires. The `fray` library provides unified resource configuration types.
 
+## ResourceConfig
 
-## CPU-Only
+The main entry point for resource configuration. Use the static factory methods to create configurations:
 
-::: marin.resources.CpuOnlyConfig
+```python
+from fray.cluster import ResourceConfig
 
-## GPU
+# TPU configuration
+tpu_config = ResourceConfig.with_tpu("v4-8")
+tpu_multislice = ResourceConfig.with_tpu("v4-8", slice_count=2)
 
-::: marin.resources.GpuConfig
+# GPU configuration
+gpu_config = ResourceConfig.with_gpu("H100", count=8)
+gpu_auto = ResourceConfig.with_gpu()  # auto-detect GPU type
 
-## TPU
+# CPU-only configuration
+cpu_config = ResourceConfig.with_cpu()
+```
 
-::: marin.resources.TpuPodConfig
+::: fray.cluster.base.ResourceConfig
+
+## Device Configurations
+
+These are the underlying device types wrapped by `ResourceConfig`:
+
+### CPU
+
+::: fray.cluster.base.CpuConfig
+
+### GPU
+
+::: fray.cluster.base.GpuConfig
+
+### TPU
+
+::: fray.cluster.base.TpuConfig

docs/tutorials/first-experiment.md

Lines changed: 7 additions & 7 deletions
@@ -78,16 +78,16 @@ For this tutorial, we will use the `SimpleTrainConfig` class from `experiments.s
 This class defines basic training configuration that is sufficient for most experiments.
 
 !!! info "Training Configuration for Different Accelerators"
-    You need to provide the appropriate resource configuration based on your hardware setup. Marin supports different accelerator types through various [resource configurations](../references/resource-config.md). The `CpuOnlyConfig` is one such resource configuration that requests a certain number of CPUs. Other resource configurations include `GpuConfig` for requesting GPUs and `TpuPodConfig` for requesting TPUs.
+    You need to provide the appropriate resource configuration based on your hardware setup. Marin supports different accelerator types through [`ResourceConfig`](../references/resource-config.md) factory methods.
 
 === "CPU"
     ```python
-    from marin.resources import CpuOnlyConfig
+    from fray.cluster import ResourceConfig
     from experiments.simple_train_config import SimpleTrainConfig
 
     nano_train_config = SimpleTrainConfig(
         # Here we define the hardware resources we need.
-        resources=CpuOnlyConfig(num_cpus=1),
+        resources=ResourceConfig.with_cpu(),
         train_batch_size=4,
         num_train_steps=100,
         # set hyperparameters
@@ -100,12 +100,12 @@ This class defines basic training configuration that is sufficient for most expe
 
 === "GPU"
     ```python
-    from marin.resources import GpuConfig
+    from fray.cluster import ResourceConfig
     from experiments.simple_train_config import SimpleTrainConfig
 
     nano_train_config = SimpleTrainConfig(
         # Here we define the hardware resources we need.
-        resources=GpuConfig(gpu_count=1),
+        resources=ResourceConfig.with_gpu(count=1),
         train_batch_size=32,
         num_train_steps=100,
         # set hyperparameters
@@ -116,12 +116,12 @@ This class defines basic training configuration that is sufficient for most expe
 
 === "TPU"
     ```python
-    from marin.resources import TpuPodConfig
+    from fray.cluster import ResourceConfig
     from experiments.simple_train_config import SimpleTrainConfig
 
     nano_train_config = SimpleTrainConfig(
         # Here we define the hardware resources we need.
-        resources=TpuPodConfig(tpu_type="v4-8"),
+        resources=ResourceConfig.with_tpu("v4-8"),
         train_batch_size=4,
         num_train_steps=100,
         # set hyperparameters

docs/tutorials/run-alpaca-eval.md

Lines changed: 4 additions & 4 deletions
@@ -12,8 +12,8 @@ This tutorial shows how to configure and launch the Alpaca evaluation pipeline i
 The default evaluation script for alpaca is `experiments/evals/run_alpaca_eval.py`), if for some reason you want to make your own script import:
 
 ```python
+from fray.cluster import ResourceConfig
 from experiments.evals.engine_configs import DEFAULT_VLLM_ENGINE_KWARGS
-from experiments.evals.resource_configs import SINGLE_TPU_V6E_8
 from experiments.evals.evals import evaluate_alpaca_eval
 from marin.execution.executor import ExecutorMainConfig, executor_main
 ```
@@ -22,9 +22,9 @@ from marin.execution.executor import ExecutorMainConfig, executor_main
 
 ```python
 # nodryrun
+from fray.cluster import ResourceConfig
 from experiments.evals.engine_configs import DEFAULT_VLLM_ENGINE_KWARGS
 from experiments.evals.evals import evaluate_alpaca_eval
-from experiments.evals.resource_configs import SINGLE_TPU_V6E_8
 from marin.execution.executor import ExecutorMainConfig, executor_main
 
 # Retry any failed steps by default
@@ -34,7 +34,7 @@ steps = [
     evaluate_alpaca_eval(
         model_name="my_alpaca_model_eval",  # Name for logging / W&B
         model_path="path/to/your/model/checkpoint/hf/",  # HF checkpoint directory
-        resource_config=SINGLE_TPU_V6E_8,  # E.g., TPU v6e-8; choose GPU/TPU config
+        resource_config=ResourceConfig.with_tpu("v6e-8"),  # E.g., TPU v6e-8; choose GPU/TPU config
         engine_kwargs=DEFAULT_VLLM_ENGINE_KWARGS,  # vLLM backend parameters
 
         # IMPORTANT: stop_token_ids must include the eos_token_id of your HF model.
@@ -65,7 +65,7 @@ if __name__ == "__main__":
 |----------------------|----------------------|-------------|
 | model_name | `str` | Name for experiment tracking through executor framework. |
 | model_path | `str` | Path on GCP or URL to HF-format model checkpoint. |
-| resource_config | `ResourceConfig` | Hardware spec (e.g. `SINGLE_TPU_V6E_8`). |
+| resource_config | `ResourceConfig` | Hardware spec (e.g. `ResourceConfig.with_tpu("v6e-8")`). |
 | engine_kwargs | `dict \| None` | vLLM engine settings (e.g. batch size, sequence length). |
 | max_eval_instances | `int \| None` | Limits the number of examples to evaluate; `None` = all. |
 | temperature | `float` | Sampling temperature. |

docs/tutorials/run-lm-evals.md

Lines changed: 8 additions & 8 deletions
@@ -25,7 +25,7 @@ from experiments.evals.task_configs import (
 )
 
 # Hardware / executor
-from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8
+from fray.cluster import ResourceConfig
 from marin.execution.executor import executor_main
 from marin.execution.executor import ExecutorMainConfig  # for retry logic
 ```
@@ -36,8 +36,8 @@ Run the canonical CORE_TASKS (subset of DCLM tasks) via LM Evaluation Harness:
 
 ```python
 # run_mcqa_eval.py
+from fray.cluster import ResourceConfig
 from experiments.evals.evals import default_eval
-from experiments.evals.resource_configs import SINGLE_TPU_V4_8
 from marin.execution.executor import executor_main
 
 # Example: evaluate a standalone checkpoint
@@ -46,7 +46,7 @@ model_path = "gs://marin-us-east5/gcsfuse_mount/perplexity-models/llama-200m"
 # This creates an ExecutorStep that runs CORE_TASKS
 core_evals_step = default_eval(
     step=model_path,
-    resource_config=SINGLE_TPU_V4_8,
+    resource_config=ResourceConfig.with_tpu("v4-8"),
     # Optional: override the task set:
     # evals=CORE_TASKS_PLUS_MMLU,
     # max_eval_instances=100,
@@ -65,8 +65,8 @@ Use `default_key_evals` to run a collection of generation tasks (`KEY_GENERATION
 
 ```python
 # run_key_evals.py (see 1:18:experiments/evals/run_key_evals.py)
+from fray.cluster import ResourceConfig
 from experiments.evals.evals import default_key_evals
-from experiments.evals.resource_configs import SINGLE_TPU_V6E_8
 from marin.execution.executor import executor_main
 
 # Point to your checkpoint or a training ExecutorStep
@@ -78,7 +78,7 @@ model_path = "gs://marin-us-east5/gcsfuse_mount/perplexity-models/llama-200m"
 # 3) Alpaca eval
 key_steps = default_key_evals(
     step=model_path,
-    resource_config=SINGLE_TPU_V6E_8,
+    resource_config=ResourceConfig.with_tpu("v6e-8"),
     model_name="my_key_evals",
     # max_eval_instances=50,
 )
@@ -104,7 +104,7 @@ from experiments.evals.evals import evaluate_alpaca_eval
 alpaca_step = evaluate_alpaca_eval(
     model_name="my_model",
     model_path="...",
-    resource_config=SINGLE_TPU_V6E_8,
+    resource_config=ResourceConfig.with_tpu("v6e-8"),
     engine_kwargs=DEFAULT_VLLM_ENGINE_KWARGS,
     stop_token_ids=[<YOUR_EOS_TOKEN_ID>],  # must match your HF model's eos_token_id
 )
@@ -117,8 +117,8 @@ alpaca_step = evaluate_alpaca_eval(
 If you want fine‐grained control over which tasks to run:
 
 ```python
+from fray.cluster import ResourceConfig
 from experiments.evals.evals import evaluate_lm_evaluation_harness
-from experiments.evals.resource_configs import SINGLE_TPU_V4_8
 from marin.evaluation.evaluation_config import EvalTaskConfig
 from marin.execution.executor import executor_main
 
@@ -132,7 +132,7 @@ custom_step = evaluate_lm_evaluation_harness(
     model_name="custom_eval",
     model_path="...",
     evals=custom_tasks,
-    resource_config=SINGLE_TPU_V4_8,
+    resource_config=ResourceConfig.with_tpu("v4-8"),
     max_eval_instances=200,
 )
 

experiments/anneal_config.py

Lines changed: 3 additions & 17 deletions
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
+from fray.cluster import ResourceConfig
 from marin.execution import InputName
 from marin.processing.tokenize.data_configs import LMMixtureDatasetConfig
-from marin.resources import ResourceConfig, TpuPodConfig
 
 
 @dataclass(frozen=True)
@@ -54,7 +54,7 @@ class AnnealConfig:
 
     # Hardware related
     # The number of TPUs to use, type of TPU, and the number of pods to use.
-    resources: ResourceConfig = TpuPodConfig(tpu_type="v4-128", slice_count=2)  # noqa: RUF009
+    resources: ResourceConfig = field(default_factory=lambda: ResourceConfig.with_tpu("v4-128", slice_count=2))
 
     # Checkpoint related
     # The number of steps between saving checkpoints. Larger values will save checkpoints more frequently.
@@ -64,17 +64,3 @@ class AnnealConfig:
     # This argument is used in the default_train. If set to True, the validation set is Paloma.
     # If set to False, we will not calculate validation loss.
     use_default_validation: bool = True
-
-    @property
-    def tpu_type(self) -> str | None:
-        """For backward compatibility."""
-        if isinstance(self.resources, TpuPodConfig):
-            return self.resources.tpu_type
-        return None
-
-    @property
-    def node_count(self) -> int:
-        """For backward compatibility."""
-        if isinstance(self.resources, TpuPodConfig):
-            return self.resources.slice_count
-        return 1
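A side note on the `field(default_factory=...)` pattern this commit adopts for dataclass defaults: a default written as a direct constructor call (as in the old `TpuPodConfig(...)` default, hence the old `# noqa: RUF009`) is evaluated once at class-definition time and shared across every instance, whereas `default_factory` builds a fresh value per instance. A minimal self-contained sketch; the `Widget` and `JobConfig` names are hypothetical, for illustration only:

```python
from dataclasses import dataclass, field


@dataclass(frozen=True)
class Widget:
    name: str = "w"


@dataclass(frozen=True)
class JobConfig:
    # A direct default like `widget: Widget = Widget()` would be built once at
    # class-definition time and shared by every JobConfig instance; ruff flags
    # that pattern as RUF009. default_factory runs per instance instead.
    widget: Widget = field(default_factory=lambda: Widget())


a = JobConfig()
b = JobConfig()
print(a.widget == b.widget)   # True: equal by value (dataclass __eq__)
print(a.widget is b.widget)   # False: distinct objects, one per instance
```

For frozen, immutable configs the shared instance is mostly harmless, but using `default_factory` consistently keeps the linter quiet and stays safe if a config type ever grows mutable state.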

experiments/cooldown_quality.py

Lines changed: 5 additions & 5 deletions
@@ -23,16 +23,16 @@
 and determine their relative contributions to model performance.
 """
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 
 from experiments.anneal_config import AnnealConfig
-from experiments.pretraining_datasets.dclm import dclm_components_llama3
 from experiments.defaults import default_anneal
 from experiments.pretraining_datasets import tokenize_dolma
+from experiments.pretraining_datasets.dclm import dclm_components_llama3
+from fray.cluster import ResourceConfig
 from marin.execution.executor import ExecutorStep
-from marin.processing.tokenize.data_configs import TokenizerStep, lm_mixture_data_config, PermutationType
-from marin.resources import TpuPodConfig
+from marin.processing.tokenize.data_configs import PermutationType, TokenizerStep, lm_mixture_data_config
 
 
 @dataclass(frozen=True)
@@ -48,7 +48,7 @@ class QualityAblationConfig:
 
     # Training parameters
     num_anneal_tokens: int = 50_000_000_000
-    resources: TpuPodConfig = TpuPodConfig(tpu_type="v5litepod-128")  # noqa: RUF009
+    resources: ResourceConfig = field(default_factory=lambda: ResourceConfig.with_tpu("v5litepod-128"))
 
     # Naming
     model_name_prefix: str = "8b-quality-eval"

experiments/datashop/datashop_runner.py

Lines changed: 3 additions & 3 deletions
@@ -39,7 +39,7 @@
     default_train_quality_model,
 )
 from experiments.evals.evals import default_eval
-from experiments.evals.resource_configs import SINGLE_TPU_V6E_8, TPU_V6E_8_STRICT_PACK, ResourceConfig
+from fray.cluster import ResourceConfig
 from experiments.evals.task_configs import MMLU_5_SHOT
 from marin.datashop.pipeline import CorpusContent
 from marin.execution.executor import executor_main
@@ -73,7 +73,7 @@ class DatashopRunnerConfig:
     pretraining_data_path_name: str = "datashop-dclm-pretraining-subset"
 
     # How to schedule the TPUs (what hardware to use and how to pack them) specifically for labeling
-    labeler_resource_config: ResourceConfig = field(default_factory=lambda: TPU_V6E_8_STRICT_PACK)
+    labeler_resource_config: ResourceConfig = field(default_factory=lambda: ResourceConfig.with_tpu("v6e-8"))
 
     # What hardware to use for training the final model
     training_tpu_type: str = "v6e-128"
@@ -100,7 +100,7 @@ class DatashopRunnerConfig:
     consolidate_config_kwargs: dict | None = None
 
     # What hardware to use for evaluating the model
-    eval_resource_config: ResourceConfig = field(default_factory=lambda: SINGLE_TPU_V6E_8)
+    eval_resource_config: ResourceConfig = field(default_factory=lambda: ResourceConfig.with_tpu("v6e-8"))
 
 
 class DatashopRunner:

experiments/datashop/default_configs.py

Lines changed: 1 addition & 4 deletions
@@ -67,10 +67,7 @@
 
 default_inference_config_kwargs = {
     "model_type": "gte",
-    "runtime": RuntimeConfig(
-        memory_limit_gb=12,
-        resources={"TPU": 1},
-    ),
+    "runtime": RuntimeConfig(memory_limit_gb=12, resources={"TPU": 1}),
     "task": TaskConfig(max_in_flight=500),
     "filetype": "jsonl.zst",
     "classifier_kwargs": {"max_length": 512},
