marin-community
diff --git a/.github/workflows/marin-canary-ferry-coreweave.yaml b/.github/workflows/marin-canary-ferry-coreweave.yaml
Lines changed: 11 additions & 0 deletions
diff --git a/experiments/defaults.py b/experiments/defaults.py
Lines changed: 1 addition & 1 deletion
diff --git a/experiments/ferries/canary_ferry.py b/experiments/ferries/canary_ferry.py
Lines changed: 44 additions & 4 deletions
diff --git a/experiments/grug/moe/model.py b/experiments/grug/moe/model.py
Lines changed: 7 additions & 3 deletions
diff --git a/experiments/tutorials/train_tiny_sweep_tpu.py b/experiments/tutorials/train_tiny_sweep_tpu.py
Lines changed: 2 additions & 1 deletion
diff --git a/lib/levanter/pyproject.toml b/lib/levanter/pyproject.toml
Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,15 @@ on:
         type: number
         default: 1
         required: false
+      attention_implementation:
+        description: 'GPU attention backend'
+        type: choice
+        options:
+          - gpu_fa4_cute
+          - gpu_fa4_thd
+          - reference
+        default: gpu_fa4_cute
+        required: false
 
 permissions:
   contents: read   # actions/checkout
@@ -48,6 +57,7 @@ jobs:
     env:
       RUN_ID: canary-gpu-${{ github.run_id }}-${{ github.run_attempt }}
       CANARY_ACCELERATOR: gpu
+      CANARY_ATTENTION_IMPLEMENTATION: ${{ github.event_name == 'workflow_dispatch' && inputs.attention_implementation || 'gpu_fa4_cute' }}
       CANARY_BATCH_SIZE: "16"
       CANARY_GPU_REPLICAS: ${{ github.event_name == 'workflow_dispatch' && format('{0}', inputs.gpu_replicas) || '1' }}
       # TODO(#5524): remove this override once Levanter profiler stop is
@@ -123,6 +133,7 @@ jobs:
             -e MARIN_PREFIX s3://marin-na/marin/ \
             -e RUN_ID "$RUN_ID" \
             -e CANARY_ACCELERATOR "$CANARY_ACCELERATOR" \
+            -e CANARY_ATTENTION_IMPLEMENTATION "$CANARY_ATTENTION_IMPLEMENTATION" \
             -e CANARY_BATCH_SIZE "$CANARY_BATCH_SIZE" \
             -e CANARY_GPU_REPLICAS "$CANARY_GPU_REPLICAS" \
             -e CANARY_PROFILER_NUM_STEPS "$CANARY_PROFILER_NUM_STEPS" \
 
@@ -48,12 +48,12 @@
     lm_mixture_data_config,
 )
 from marin.processing.tokenize.tokenize import HfTokenizeConfig, TokenizeConfigBase
+from marin.training.run_environment import extras_for_resources
 from marin.training.training import (
     TrainDpoOnPodConfig,
     TrainLmOnPodConfig,
     bake_output_path,
     check_train_config_paths,
-    extras_for_resources,
     impute_run_id,
     resolve_training_env,
     run_levanter_train_dpo,
 
@@ -8,6 +8,7 @@
 to the Iris container. workflow_dispatch inputs override CANARY_TARGET_TOKENS.
 
     CANARY_ACCELERATOR   tpu | gpu
+    CANARY_ATTENTION_IMPLEMENTATION gpu-only attention backend: gpu_fa4_cute (default) | gpu_fa4_thd | reference
     CANARY_TPU_TYPE      tpu-only comma-separated slice types, primary first (default v5p-8,v4-8)
     CANARY_BATCH_SIZE    per-device batch size
     CANARY_CACHE_COPY_MAX_WORKERS gpu-only cache-copy worker cap
@@ -26,11 +27,15 @@
     RUN_ID               unique run identifier
 """
 
+import dataclasses
 import datetime
 import os
+from typing import cast
 
 from fray.cluster import ResourceConfig
 from levanter.callbacks.profiler import ProfilerConfig
+from levanter.data.text import DatasetComponent
+from levanter.grug.attention import GrugAttentionImplementation
 from levanter.optim import AdamConfig
 from levanter.tracker.json_logger import JsonLoggerConfig
 from levanter.tracker.wandb import WandbConfig
@@ -62,6 +67,13 @@
     ema_beta=None,
     log_every=1,
 )
+_GPU_FA4_CUTE_ATTENTION: GrugAttentionImplementation = "gpu_fa4_cute"
+_GPU_FA4_THD_ATTENTION: GrugAttentionImplementation = "gpu_fa4_thd"
+_GPU_ATTENTION_IMPLEMENTATIONS: tuple[GrugAttentionImplementation, ...] = (
+    "reference",
+    _GPU_FA4_CUTE_ATTENTION,
+    _GPU_FA4_THD_ATTENTION,
+)
 
 # Compute budget passed to the heuristic when CANARY_HIDDEN_DIM scales the model.
 # Only the model *shape* (from hidden_dim) is used here; the budget-derived batch
@@ -130,10 +142,37 @@ def _build_step_from_env() -> ExecutorStep:
         else:
             model, _, _, _ = build_from_heuristic(budget=_HEURISTIC_BUDGET, hidden_dim=hidden_dim)
 
+        attention_implementation = os.environ.get("CANARY_ATTENTION_IMPLEMENTATION", _GPU_FA4_CUTE_ATTENTION)
+        if attention_implementation not in _GPU_ATTENTION_IMPLEMENTATIONS:
+            raise ValueError(
+                f"Unknown CANARY_ATTENTION_IMPLEMENTATION={attention_implementation!r}, expected one of "
+                f"{_GPU_ATTENTION_IMPLEMENTATIONS}"
+            )
+        attention_implementation = cast(GrugAttentionImplementation, attention_implementation)
+        model = dataclasses.replace(
+            model,
+            attention_implementation=attention_implementation,
+            # The THD backend only handles full causal windows. Grug's short mask uses
+            # sliding_window // 2, so a 2x seq_len window makes both masks full-length.
+            sliding_window=(
+                model.max_seq_len * 2 if attention_implementation == _GPU_FA4_THD_ATTENTION else model.sliding_window
+            ),
+        )
+
         batch_size = env_int("CANARY_BATCH_SIZE", 32)
         target_tokens = env_int("CANARY_TARGET_TOKENS", batch_size * model.max_seq_len * 50)
 
         data = slimpajama_6b_data()
+        if attention_implementation == _GPU_FA4_THD_ATTENTION:
+            data = dataclasses.replace(
+                data,
+                components={
+                    name: (
+                        dataclasses.replace(component, pack=1) if isinstance(component, DatasetComponent) else component
+                    )
+                    for name, component in data.components.items()
+                },
+            )
         resources = ResourceConfig.with_gpu(
             gpu_type,
             count=gpu_count,
@@ -142,16 +181,17 @@ def _build_step_from_env() -> ExecutorStep:
             disk="256g",
             replicas=gpu_replicas,
         )
-        name = f"canary-ferry-cw-{gpu_type.lower()}x{gpu_count}-r{gpu_replicas}-d{hidden_dim}"
-        wandb_group = f"canary-ferry-moe-gpu-{gpu_type.lower()}-r{gpu_replicas}"
-        wandb_tags = ["canary", "ferry", "grug", "moe", "gpu", gpu_type.lower()]
+        attention_tag = attention_implementation.removeprefix("gpu_")
+        name = f"canary-ferry-cw-{gpu_type.lower()}x{gpu_count}-r{gpu_replicas}-d{hidden_dim}-{attention_tag}"
+        wandb_group = f"canary-ferry-moe-gpu-{gpu_type.lower()}-r{gpu_replicas}-{attention_tag}"
+        wandb_tags = ["canary", "ferry", "grug", "moe", "gpu", gpu_type.lower(), f"d{hidden_dim}", attention_tag]
         eval_config = None
 
     num_steps = env_int("CANARY_STEPS", target_tokens // (batch_size * model.max_seq_len))
     if num_steps <= 0:
         raise ValueError(
             f"CANARY_STEPS={num_steps} invalid; set CANARY_STEPS or CANARY_TARGET_TOKENS high enough for "
-            f"batch_size={batch_size} x seq_len={GRUG_MOE_TRIAL_MODEL.max_seq_len}"
+            f"batch_size={batch_size} x seq_len={model.max_seq_len}"
         )
     if os.environ.get("CANARY_TRACKER", "wandb").lower() == "json_logger":
         tracker = JsonLoggerConfig(logger_name=os.environ.get("CANARY_JSON_LOGGER", "canary_ferry.metrics"))
 
@@ -69,6 +69,10 @@ def _batch_reshard(x: jax.Array) -> jax.Array:
     return reshard(x, _batch_spec())
 
 
+def _layer_attention_masks(mask: AttentionMask, *, sliding_window: int) -> tuple[AttentionMask, AttentionMask]:
+    return mask.with_sliding_window(sliding_window // 2), mask.with_sliding_window(sliding_window)
+
+
 @dataclass(frozen=True)
 class GrugModelConfig:
     """Hyperparameters for the grug MoE transformer.
@@ -518,9 +522,9 @@ def __call__(
         hidden = self.token_embed.at[token_ids].get(out_sharding=batch_spec)
         hidden = self.embed_gated_norm(self.embed_norm(hidden))
 
-        segment_ids = mask.segment_ids if isinstance(mask, AttentionMask) else None
-        short_mask = AttentionMask(is_causal=True, sliding_window=cfg.sliding_window // 2, segment_ids=segment_ids)
-        long_mask = AttentionMask(is_causal=True, sliding_window=cfg.sliding_window, segment_ids=segment_ids)
+        if not isinstance(mask, AttentionMask):
+            mask = AttentionMask.causal()
+        short_mask, long_mask = _layer_attention_masks(mask, sliding_window=cfg.sliding_window)
 
         moe_router_stats: list[dict[str, jax.Array]] = []
         for i, block in enumerate(self.blocks):
 
@@ -18,7 +18,8 @@
 from levanter.main.train_lm import TrainLmConfig
 from marin.execution.sweep import SweepTarget, claim_and_run
 from marin.execution.types import versioned
-from marin.training.training import extras_for_resources, resolve_training_env
+from marin.training.run_environment import extras_for_resources
+from marin.training.training import resolve_training_env
 
 from experiments.defaults import _run_training_on_worker, prepare_lm_train
 from experiments.evals.task_configs import CORE_TASKS
 
@@ -84,6 +84,9 @@ gpu = [
   "nvidia-cublas>=13.2.0.9; sys_platform == 'linux'",
   # Preserve the CoreWeave H100 all-to-all guard under CUDA 13.
   "nvidia-nccl-cu13>=2.28.3; sys_platform == 'linux'",
+  # FA4 CuTe/THD attention backends.
+  "nvidia-cutlass-dsl[cu13]>=4.5.2,<4.6; sys_platform == 'linux'",
+  "flash-attn-4[cu13]>=4.0.0b16,<4.1; sys_platform == 'linux'",
   # Optional raw Sonic MoE gather/combine backend. Keep jax-triton exact:
   # 0.3.0 is missing the CUDA backend API needed by our current stack.
   "jax-triton==0.3.1; sys_platform == 'linux'",