marin-community
diff --git a/‎.github/workflows/marin-datakit-smoke.yaml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/marin-datakit-smoke.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/zephyr-shuffle-itest.yaml‎
Lines changed: 178 additions & 0 deletions b/‎.github/workflows/zephyr-shuffle-itest.yaml‎
Lines changed: 178 additions & 0 deletions
diff --git a/‎.pyrefly-baseline.json‎
Lines changed: 1 addition & 49 deletions b/‎.pyrefly-baseline.json‎
Lines changed: 1 addition & 49 deletions
diff --git a/‎infra/status-page/web/src/components/FerryPanel.tsx‎
Lines changed: 73 additions & 4 deletions b/‎infra/status-page/web/src/components/FerryPanel.tsx‎
Lines changed: 73 additions & 4 deletions
diff --git a/‎lib/iris/src/iris/cluster/constraints.py‎
Lines changed: 5 additions & 8 deletions b/‎lib/iris/src/iris/cluster/constraints.py‎
Lines changed: 5 additions & 8 deletions
@@ -22,7 +22,7 @@ jobs:
       FERRY_STATUS_PATH: gs://marin-tmp-us-central1/ttl=1d/ci/datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
       WANDB_ENTITY: marin-community
       WANDB_PROJECT: marin
-      IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
+      IRIS_CONFIG: lib/iris/examples/marin.yaml
       IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
 
     steps:
@@ -69,6 +69,7 @@ jobs:
           JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
             job run --no-wait \
             --memory=2G --disk=4G --cpu=1 --extra=cpu \
+            --priority production \
             -e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
             -e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
             -e WANDB_ENTITY "$WANDB_ENTITY" \
 
@@ -0,0 +1,178 @@
+name: Zephyr - Shuffle Integration Tests
+
+# Exercises the zephyr scatter/reduce shuffle at roughly 10 GB (with the
+# default 64 input shards) across 4 scenarios (uniform/skew × small/large
+# items) by submitting iris jobs to marin-dev. Each scenario runs as its own
+# matrix leg, which submits one iris job and polls it until it reaches a
+# terminal state.
+
+on:
+  # Manual only for now — baseline Parquet shuffle OOMs on skew scenarios,
+  # so a scheduled run would fail until the zstd-chunk shuffle format lands.
+  # Add a cron once the shuffle format change is in main.
+  workflow_dispatch:
+    inputs:
+      num_input_shards:
+        description: Input shard count (default 64)
+        required: false
+        default: '64'
+
+# Token permissions granted to every job in this workflow.
+# NOTE(review): the auth step in this workflow passes `credentials_json`
+# (a service-account key) rather than workload identity federation, so
+# `id-token: write` may be unnecessary — confirm before removing.
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  # One job definition, expanded into one leg per matrix scenario below.
+  shuffle-itest:
+    runs-on: ubuntu-latest
+    # Upper bound for the whole leg, including the status-polling loop.
+    timeout-minutes: 180
+    # One concurrency group per scenario: a newer run cancels the in-flight
+    # leg for the same scenario.
+    # NOTE(review): no step in this workflow stops the submitted iris job when
+    # the leg is cancelled, so the job presumably keeps running — confirm.
+    concurrency:
+      group: zephyr-shuffle-itest-${{ matrix.scenario }}
+      cancel-in-progress: true
+
+    strategy:
+      # Let every scenario finish even when another leg fails.
+      fail-fast: false
+      matrix:
+        # Per-shard payload is items_per_shard × item_bytes: 150 MB for the
+        # *-small legs and 160 MB for the *-large legs. The uniform legs set
+        # hot_shard_frac/hot_key_pool to zero; the skew90 legs use 0.9 and 128.
+        # The values are forwarded verbatim as benchmark_shuffle.py flags.
+        include:
+          - scenario: uniform-small
+            items_per_shard: '600000'
+            item_bytes: '250'
+            hot_shard_frac: '0.0'
+            hot_key_pool: '0'
+          - scenario: uniform-large
+            items_per_shard: '160'
+            item_bytes: '1000000'
+            hot_shard_frac: '0.0'
+            hot_key_pool: '0'
+          - scenario: skew90-small
+            items_per_shard: '600000'
+            item_bytes: '250'
+            hot_shard_frac: '0.9'
+            hot_key_pool: '128'
+          - scenario: skew90-large
+            items_per_shard: '160'
+            item_bytes: '1000000'
+            hot_shard_frac: '0.9'
+            hot_key_pool: '128'
+
+    env:
+      # Unique per scenario, workflow run and re-run attempt.
+      RUN_ID: zephyr-shuffle-itest-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }}
+      IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
+      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
+      # Falls back to 64 when the dispatch input is absent or empty.
+      NUM_INPUT_SHARDS: ${{ github.event.inputs.num_input_shards || '64' }}
+
+    steps:
+      # Toolchain setup: repository checkout, Python 3.12 and uv.
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      # Later steps invoke the CLI as .venv/bin/iris, so this sync is what
+      # provides it.
+      - name: Install dependencies
+        run: uv sync --all-packages --extra=cpu --no-default-groups
+
+      # Authenticates with a service-account key stored as a repository secret.
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}
+
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+        with:
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
+
+      # Generates a throwaway key tagged with run id, attempt and scenario, and
+      # registers it with OS Login while impersonating the controller service
+      # account. The 6h TTL outlasts the 180-minute job timeout, so the key
+      # expires on its own if the cleanup step never runs.
+      # Presumably the iris CLI uses this key to reach the cluster over SSH —
+      # TODO confirm.
+      - name: Set up OS Login SSH key
+        run: |
+          mkdir -p ~/.ssh
+          ssh-keygen -t rsa -b 4096 -f ~/.ssh/google_compute_engine -N "" -q -C "gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.scenario }}"
+          chmod 600 ~/.ssh/google_compute_engine
+          gcloud compute os-login ssh-keys add \
+            --key-file ~/.ssh/google_compute_engine.pub \
+            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
+            --ttl=6h
+
+      # Submits the benchmark as a detached iris job (--no-wait) and exports
+      # its id as the `job_id` step output for the polling and diagnostic steps.
+      - name: Submit shuffle benchmark
+        id: submit
+        shell: bash -l {0}
+        run: |
+          # A custom `shell:` template does not get GitHub's default
+          # `-eo pipefail`, so enable it here. Without it a failed submission
+          # would still write an empty job_id and let this step pass.
+          set -eo pipefail
+          JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
+            job run --no-wait --priority production \
+            --memory=2G --disk=8G --cpu=1 --extra=cpu \
+            -e SMOKE_RUN_ID "$RUN_ID" \
+            -- python lib/zephyr/tests/benchmark_shuffle.py \
+                 --num-input-shards "$NUM_INPUT_SHARDS" \
+                 --items-per-shard "${{ matrix.items_per_shard }}" \
+                 --item-bytes "${{ matrix.item_bytes }}" \
+                 --num-keys 50000 \
+                 --max-workers 4 --worker-cpu 1 --worker-ram 8g \
+                 --hot-shard-frac "${{ matrix.hot_shard_frac }}" \
+                 --hot-key-pool "${{ matrix.hot_key_pool }}" \
+                 --repeat 3 \
+                 --label "$RUN_ID")
+          # Guard against a zero exit status that produced no id on stdout.
+          if [ -z "$JOB_ID" ]; then
+            echo "::error::iris job run did not return a job id"
+            exit 1
+          fi
+          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
+          echo "Submitted job: $JOB_ID"
+
+      # Polls the submitted iris job every 30 seconds until it reaches a
+      # terminal state. Succeeds only on JOB_STATE_SUCCEEDED.
+      - name: Wait for shuffle benchmark
+        shell: bash -l {0}
+        run: |
+          JOB_ID="${{ steps.submit.outputs.job_id }}"
+          # Tolerate a few consecutive empty lookups (for example a transient
+          # CLI or network failure) before declaring the job missing, so one
+          # blip does not abort a poll that may run for hours.
+          MAX_EMPTY_LOOKUPS=5
+          empty_lookups=0
+          echo "Polling job status: $JOB_ID"
+          while true; do
+            STATE=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
+              job list --json --prefix "$JOB_ID" \
+              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
+            case "$STATE" in
+              JOB_STATE_SUCCEEDED)
+                echo "Job succeeded"
+                exit 0
+                ;;
+              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
+                empty_lookups=0
+                echo "$(date -u +%H:%M:%S) Job state: $STATE"
+                sleep 30
+                ;;
+              "")
+                empty_lookups=$((empty_lookups + 1))
+                if [ "$empty_lookups" -ge "$MAX_EMPTY_LOOKUPS" ]; then
+                  echo "Job not found: $JOB_ID"
+                  exit 1
+                fi
+                echo "$(date -u +%H:%M:%S) No state returned for $JOB_ID (lookup $empty_lookups/$MAX_EMPTY_LOOKUPS); retrying"
+                sleep 30
+                ;;
+              *)
+                echo "Job finished with state: $STATE"
+                # Select by exact id so jobs that merely share the prefix are
+                # not dumped alongside the one being polled.
+                .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
+                  job list --json --prefix "$JOB_ID" \
+                  | jq --arg id "$JOB_ID" '.[] | select(.job_id == $id) | {job_id, state, error}' || true
+                exit 1
+                ;;
+            esac
+          done
+
+      # Runs only when every earlier step succeeded. Requests the job logs with
+      # --max-lines 200 and keeps only lines containing "RESULT:". CLI errors
+      # are discarded (stderr goes to /dev/null), and the fallback echo keeps
+      # the step green when grep matches nothing.
+      - name: Print benchmark results
+        if: success()
+        shell: bash -l {0}
+        run: |
+          JOB_ID="${{ steps.submit.outputs.job_id }}"
+          .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
+            job logs "$JOB_ID" --max-lines 200 2>/dev/null \
+            | grep "RESULT:" || echo "No RESULT lines found"
+
+      # Best-effort diagnostics when any earlier step failed. Both commands
+      # discard stderr and end in `|| true`, so they cannot fail the step
+      # themselves. JOB_ID is empty when the failure happened before the
+      # submit step produced an id.
+      - name: Capture failure diagnostics
+        if: failure()
+        shell: bash -l {0}
+        run: |
+          JOB_ID="${{ steps.submit.outputs.job_id }}"
+          echo "=== Job summary ==="
+          .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
+            job summary "$JOB_ID" 2>/dev/null || true
+          echo "=== Recent logs ==="
+          .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
+            job logs "$JOB_ID" --max-lines 100 2>/dev/null | tail -60 || true
+
+      # Always runs, including after failure or cancellation, to deregister the
+      # per-run key added in the OS Login setup step. `|| true` keeps a cleanup
+      # failure from changing the job result.
+      - name: Remove OS Login SSH key
+        if: always()
+        run: |
+          gcloud compute os-login ssh-keys remove \
+            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
+            --key-file ~/.ssh/google_compute_engine.pub || true
@@ -1824,54 +1824,6 @@
       "concise_description": "Redundant cast: `Mapping[str, Any]` is the same type as `Mapping[str, Any]`",
       "severity": "warn"
     },
-    {
-      "line": 76,
-      "column": 9,
-      "stop_line": 103,
-      "stop_column": 22,
-      "path": "lib/marin/src/marin/rl/math_utils.py",
-      "code": -2,
-      "name": "bad-assignment",
-      "description": "`int` is not assignable to `int` (caused by inconsistent types when breaking cycles)",
-      "concise_description": "`int` is not assignable to `int` (caused by inconsistent types when breaking cycles)",
-      "severity": "error"
-    },
-    {
-      "line": 89,
-      "column": 25,
-      "stop_line": 89,
-      "stop_column": 57,
-      "path": "lib/marin/src/marin/rl/math_utils.py",
-      "code": -2,
-      "name": "unsupported-operation",
-      "description": "`+` is not supported between `None` and `Literal[1]`\n  Argument `None` is not assignable to parameter `value` with type `int` in function `int.__radd__`",
-      "concise_description": "`+` is not supported between `None` and `Literal[1]`",
-      "severity": "error"
-    },
-    {
-      "line": 550,
-      "column": 13,
-      "stop_line": 550,
-      "stop_column": 21,
-      "path": "lib/marin/src/marin/rl/math_utils.py",
-      "code": -2,
-      "name": "bad-assignment",
-      "description": "`float` is not assignable to variable `x` with type `str`",
-      "concise_description": "`float` is not assignable to variable `x` with type `str`",
-      "severity": "error"
-    },
-    {
-      "line": 586,
-      "column": 16,
-      "stop_line": 586,
-      "stop_column": 20,
-      "path": "lib/marin/src/marin/rl/math_utils.py",
-      "code": -2,
-      "name": "bad-return",
-      "description": "Returned type `None` is not assignable to declared return type `str`",
-      "concise_description": "Returned type `None` is not assignable to declared return type `str`",
-      "severity": "error"
-    },
     {
       "line": 199,
       "column": 15,
@@ -2125,4 +2077,4 @@
       "severity": "error"
     }
   ]
-}
+}
@@ -23,6 +23,51 @@ function runAppearance(run: FerryRun): { className: string; style?: CSSPropertie
   }
 }
 
+// Flag runs whose wall time is at least SLOW_RUN_STDDEV_THRESHOLD standard
+// deviations longer than the mean of the preceding successful runs. Uses up
+// to SLOW_RUN_MAX samples, but requires at least SLOW_RUN_MIN so the baseline
+// doesn't collapse to noise on new/sparse workflows. Successful only —
+// failures/cancels/timeouts have unrepresentative wall times (early exits,
+// hangs) and would poison the baseline. history[0] is the most recent run,
+// so "prior" means higher indices.
+// Fewest prior successful samples needed before a baseline is produced.
+const SLOW_RUN_MIN = 3;
+// Most prior successful samples folded into the baseline.
+const SLOW_RUN_MAX = 7;
+// Number of standard deviations added to the mean to form the threshold.
+const SLOW_RUN_STDDEV_THRESHOLD = 1;
+
+// Result of the baseline computation for a single run.
+interface SlowRunBaseline {
+  // mean + SLOW_RUN_STDDEV_THRESHOLD × stddev of the sampled durationSeconds.
+  threshold: number;
+  // How many prior durations went into the mean and stddev.
+  sampleSize: number;
+}
+
+// Builds the slow-run baseline for history[index] from the entries after it
+// in the array. Only entries with conclusion "success" and a non-null
+// durationSeconds are sampled, up to SLOW_RUN_MAX of them. Returns null when
+// fewer than SLOW_RUN_MIN samples are available.
+function slowRunBaseline(history: FerryRun[], index: number): SlowRunBaseline | null {
+  const samples: number[] = [];
+  let cursor = index + 1;
+  while (cursor < history.length && samples.length < SLOW_RUN_MAX) {
+    const { conclusion, durationSeconds } = history[cursor];
+    cursor += 1;
+    if (conclusion !== "success" || durationSeconds === null) continue;
+    samples.push(durationSeconds);
+  }
+
+  const sampleSize = samples.length;
+  if (sampleSize < SLOW_RUN_MIN) {
+    return null;
+  }
+
+  let total = 0;
+  for (const d of samples) {
+    total = total + d;
+  }
+  const mean = total / sampleSize;
+
+  // Population variance: squared deviations divided by the sample count.
+  let squaredDeviations = 0;
+  for (const d of samples) {
+    squaredDeviations = squaredDeviations + (d - mean) ** 2;
+  }
+  const stddev = Math.sqrt(squaredDeviations / sampleSize);
+
+  // A zero stddev (identical prior durations) leaves the threshold at the
+  // mean, so any strictly longer run still counts as slow.
+  const threshold = mean + SLOW_RUN_STDDEV_THRESHOLD * stddev;
+  return { threshold, sampleSize };
+}
+
+// True when history[index] has a recorded duration that strictly exceeds the
+// threshold from slowRunBaseline; false when either the duration or the
+// baseline is unavailable.
+function isSlowRun(history: FerryRun[], index: number): boolean {
+  const { durationSeconds } = history[index];
+  if (durationSeconds === null) {
+    return false;
+  }
+  const cutoff = slowRunBaseline(history, index)?.threshold;
+  return cutoff !== undefined && durationSeconds > cutoff;
+}
+
 function formatDuration(seconds: number | null): string {
   if (seconds === null) return "—";
   if (seconds < 60) return `${seconds}s`;
@@ -96,18 +141,42 @@ function WorkflowCard({ wf }: { wf: FerryWorkflowStatus }) {
               so all 30 fit on a ~340px phone content area without
               wrapping to a second row. */}
           <div className="mt-3 flex gap-px sm:gap-1">
-            {wf.history.map((run) => {
+            {wf.history.map((run, i) => {
               const a = runAppearance(run);
+              const slow = isSlowRun(wf.history, i);
+              const baseline = slow ? slowRunBaseline(wf.history, i) : null;
               return (
                 <a
                   key={run.id}
                   href={run.url}
                   target="_blank"
                   rel="noreferrer"
-                  title={`${run.shaShort} · ${run.conclusion ?? run.status} · ${formatRelative(run.startedAt)}`}
-                  className={`h-5 w-2 rounded-sm sm:w-2.5 ${a.className} hover:ring-2 hover:ring-slate-400`}
+                  className={`group relative h-5 w-2 rounded-sm sm:w-2.5 ${a.className} hover:ring-2 hover:ring-slate-400`}
                   style={a.style}
-                />
+                >
+                  {slow && (
+                    <span
+                      aria-label="slow run"
+                      className="pointer-events-none absolute -right-0.5 -top-1 font-bold leading-none text-amber-300"
+                      style={{ fontSize: "10px", textShadow: "0 0 2px #0f172a, 0 0 2px #0f172a" }}
+                    >
+                      !
+                    </span>
+                  )}
+                  <div className="pointer-events-none absolute bottom-full left-1/2 z-10 mb-2 hidden -translate-x-1/2 whitespace-nowrap rounded border border-slate-700 bg-slate-950/95 px-2 py-1 text-xs text-slate-200 shadow-lg group-hover:block">
+                    <div className="font-mono text-slate-300">{run.shaShort}</div>
+                    <div className="text-slate-400">
+                      {run.conclusion ?? run.status} · {formatRelative(run.startedAt)}
+                    </div>
+                    <div>wall time: {formatDuration(run.durationSeconds)}</div>
+                    {slow && baseline !== null && (
+                      <div className="text-amber-300">
+                        slow · prior {baseline.sampleSize} successful runs mean+1σ ≈{" "}
+                        {formatDuration(Math.round(baseline.threshold))}
+                      </div>
+                    )}
+                  </div>
+                </a>
               );
             })}
           </div>
 
@@ -67,14 +67,11 @@ def get_device_type_enum(device: job_pb2.DeviceConfig) -> DeviceType:
 
 
 def get_device_type(device: job_pb2.DeviceConfig) -> str:
-    """Extract device type from DeviceConfig."""
-    if device.HasField("cpu"):
-        return "cpu"
-    if device.HasField("gpu"):
-        return "gpu"
-    if device.HasField("tpu"):
-        return "tpu"
-    return "cpu"
+    """Extract device type string from DeviceConfig.
+
+    Delegates to get_device_type_enum() to avoid duplicating the dispatch logic.
+
+    Returns:
+        The ``.value`` of the enum member returned by get_device_type_enum().
+    """
+    # NOTE(review): the removed implementation returned "cpu" when none of the
+    # cpu/gpu/tpu fields was set — confirm get_device_type_enum() falls back
+    # the same way.
+    return get_device_type_enum(device).value
 
 
 def get_device_variant(device: job_pb2.DeviceConfig) -> str | None: