Skip to content

Commit 0204cbe

Browse files
committed
Merge remote-tracking branch 'origin/main' into agent/20260414-fix-4746
2 parents 1550e8c + 5eb5b60 commit 0204cbe

18 files changed

Lines changed: 920 additions & 1275 deletions

File tree

.github/workflows/marin-datakit-smoke.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
FERRY_STATUS_PATH: gs://marin-tmp-us-central1/ttl=1d/ci/datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
2323
WANDB_ENTITY: marin-community
2424
WANDB_PROJECT: marin
25-
IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
25+
IRIS_CONFIG: lib/iris/examples/marin.yaml
2626
IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
2727

2828
steps:
@@ -69,6 +69,7 @@ jobs:
6969
JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
7070
job run --no-wait \
7171
--memory=2G --disk=4G --cpu=1 --extra=cpu \
72+
--priority production \
7273
-e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
7374
-e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
7475
-e WANDB_ENTITY "$WANDB_ENTITY" \
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
name: Zephyr - Shuffle Integration Tests
2+
3+
# Exercises the zephyr scatter/reduce shuffle at 10 GB across 4 scenarios
4+
# (uniform/skew × small/large items) by submitting iris jobs to marin-dev.
5+
# Each scenario runs as its own matrix leg and polls the iris job to a
6+
# terminal state.
7+
8+
on:
9+
# Manual only for now — baseline Parquet shuffle OOMs on skew scenarios,
10+
# so a scheduled run would fail until the zstd-chunk shuffle format lands.
11+
# Add a cron once the shuffle format change is in main.
12+
workflow_dispatch:
13+
inputs:
14+
num_input_shards:
15+
description: Input shard count (default 64)
16+
required: false
17+
default: '64'
18+
19+
permissions:
20+
contents: read
21+
id-token: write
22+
23+
jobs:
24+
shuffle-itest:
25+
runs-on: ubuntu-latest
26+
timeout-minutes: 180
27+
concurrency:
28+
group: zephyr-shuffle-itest-${{ matrix.scenario }}
29+
cancel-in-progress: true
30+
31+
strategy:
32+
fail-fast: false
33+
matrix:
34+
include:
35+
- scenario: uniform-small
36+
items_per_shard: '600000'
37+
item_bytes: '250'
38+
hot_shard_frac: '0.0'
39+
hot_key_pool: '0'
40+
- scenario: uniform-large
41+
items_per_shard: '160'
42+
item_bytes: '1000000'
43+
hot_shard_frac: '0.0'
44+
hot_key_pool: '0'
45+
- scenario: skew90-small
46+
items_per_shard: '600000'
47+
item_bytes: '250'
48+
hot_shard_frac: '0.9'
49+
hot_key_pool: '128'
50+
- scenario: skew90-large
51+
items_per_shard: '160'
52+
item_bytes: '1000000'
53+
hot_shard_frac: '0.9'
54+
hot_key_pool: '128'
55+
56+
env:
57+
RUN_ID: zephyr-shuffle-itest-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }}
58+
IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
59+
IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
60+
NUM_INPUT_SHARDS: ${{ github.event.inputs.num_input_shards || '64' }}
61+
62+
steps:
63+
- name: Checkout code
64+
uses: actions/checkout@v4
65+
66+
- name: Set up Python 3.12
67+
uses: actions/setup-python@v5
68+
with:
69+
python-version: "3.12"
70+
71+
- name: Install uv
72+
uses: astral-sh/setup-uv@v7
73+
with:
74+
enable-cache: true
75+
76+
- name: Install dependencies
77+
run: uv sync --all-packages --extra=cpu --no-default-groups
78+
79+
- name: Authenticate to Google Cloud
80+
uses: google-github-actions/auth@v2
81+
with:
82+
credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}
83+
84+
- name: Set up Google Cloud SDK
85+
uses: google-github-actions/setup-gcloud@v2
86+
with:
87+
project_id: ${{ secrets.GCP_PROJECT_ID }}
88+
89+
- name: Set up OS Login SSH key
90+
run: |
91+
mkdir -p ~/.ssh
92+
ssh-keygen -t rsa -b 4096 -f ~/.ssh/google_compute_engine -N "" -q -C "gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.scenario }}"
93+
chmod 600 ~/.ssh/google_compute_engine
94+
gcloud compute os-login ssh-keys add \
95+
--key-file ~/.ssh/google_compute_engine.pub \
96+
--impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
97+
--ttl=6h
98+
99+
- name: Submit shuffle benchmark
100+
id: submit
101+
shell: bash -l {0}
102+
run: |
103+
JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
104+
job run --no-wait --priority production \
105+
--memory=2G --disk=8G --cpu=1 --extra=cpu \
106+
-e SMOKE_RUN_ID "$RUN_ID" \
107+
-- python lib/zephyr/tests/benchmark_shuffle.py \
108+
--num-input-shards "$NUM_INPUT_SHARDS" \
109+
--items-per-shard "${{ matrix.items_per_shard }}" \
110+
--item-bytes "${{ matrix.item_bytes }}" \
111+
--num-keys 50000 \
112+
--max-workers 4 --worker-cpu 1 --worker-ram 8g \
113+
--hot-shard-frac "${{ matrix.hot_shard_frac }}" \
114+
--hot-key-pool "${{ matrix.hot_key_pool }}" \
115+
--repeat 3 \
116+
--label "$RUN_ID")
117+
echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
118+
echo "Submitted job: $JOB_ID"
119+
120+
- name: Wait for shuffle benchmark
121+
shell: bash -l {0}
122+
run: |
123+
JOB_ID="${{ steps.submit.outputs.job_id }}"
124+
echo "Polling job status: $JOB_ID"
125+
while true; do
126+
STATE=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
127+
job list --json --prefix "$JOB_ID" \
128+
| jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
129+
case "$STATE" in
130+
JOB_STATE_SUCCEEDED)
131+
echo "Job succeeded"
132+
exit 0
133+
;;
134+
JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
135+
echo "$(date -u +%H:%M:%S) Job state: $STATE"
136+
sleep 30
137+
;;
138+
"")
139+
echo "Job not found: $JOB_ID"
140+
exit 1
141+
;;
142+
*)
143+
echo "Job finished with state: $STATE"
144+
.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
145+
job list --json --prefix "$JOB_ID" \
146+
| jq --arg id "$JOB_ID" '.[] | {job_id, state, error}' || true
147+
exit 1
148+
;;
149+
esac
150+
done
151+
152+
- name: Print benchmark results
153+
if: success()
154+
shell: bash -l {0}
155+
run: |
156+
JOB_ID="${{ steps.submit.outputs.job_id }}"
157+
.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
158+
job logs "$JOB_ID" --max-lines 200 2>/dev/null \
159+
| grep "RESULT:" || echo "No RESULT lines found"
160+
161+
- name: Capture failure diagnostics
162+
if: failure()
163+
shell: bash -l {0}
164+
run: |
165+
JOB_ID="${{ steps.submit.outputs.job_id }}"
166+
echo "=== Job summary ==="
167+
.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
168+
job summary "$JOB_ID" 2>/dev/null || true
169+
echo "=== Recent logs ==="
170+
.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
171+
job logs "$JOB_ID" --max-lines 100 2>/dev/null | tail -60 || true
172+
173+
- name: Remove OS Login SSH key
174+
if: always()
175+
run: |
176+
gcloud compute os-login ssh-keys remove \
177+
--impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
178+
--key-file ~/.ssh/google_compute_engine.pub || true

.pyrefly-baseline.json

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,54 +1824,6 @@
18241824
"concise_description": "Redundant cast: `Mapping[str, Any]` is the same type as `Mapping[str, Any]`",
18251825
"severity": "warn"
18261826
},
1827-
{
1828-
"line": 76,
1829-
"column": 9,
1830-
"stop_line": 103,
1831-
"stop_column": 22,
1832-
"path": "lib/marin/src/marin/rl/math_utils.py",
1833-
"code": -2,
1834-
"name": "bad-assignment",
1835-
"description": "`int` is not assignable to `int` (caused by inconsistent types when breaking cycles)",
1836-
"concise_description": "`int` is not assignable to `int` (caused by inconsistent types when breaking cycles)",
1837-
"severity": "error"
1838-
},
1839-
{
1840-
"line": 89,
1841-
"column": 25,
1842-
"stop_line": 89,
1843-
"stop_column": 57,
1844-
"path": "lib/marin/src/marin/rl/math_utils.py",
1845-
"code": -2,
1846-
"name": "unsupported-operation",
1847-
"description": "`+` is not supported between `None` and `Literal[1]`\n Argument `None` is not assignable to parameter `value` with type `int` in function `int.__radd__`",
1848-
"concise_description": "`+` is not supported between `None` and `Literal[1]`",
1849-
"severity": "error"
1850-
},
1851-
{
1852-
"line": 550,
1853-
"column": 13,
1854-
"stop_line": 550,
1855-
"stop_column": 21,
1856-
"path": "lib/marin/src/marin/rl/math_utils.py",
1857-
"code": -2,
1858-
"name": "bad-assignment",
1859-
"description": "`float` is not assignable to variable `x` with type `str`",
1860-
"concise_description": "`float` is not assignable to variable `x` with type `str`",
1861-
"severity": "error"
1862-
},
1863-
{
1864-
"line": 586,
1865-
"column": 16,
1866-
"stop_line": 586,
1867-
"stop_column": 20,
1868-
"path": "lib/marin/src/marin/rl/math_utils.py",
1869-
"code": -2,
1870-
"name": "bad-return",
1871-
"description": "Returned type `None` is not assignable to declared return type `str`",
1872-
"concise_description": "Returned type `None` is not assignable to declared return type `str`",
1873-
"severity": "error"
1874-
},
18751827
{
18761828
"line": 199,
18771829
"column": 15,
@@ -2125,4 +2077,4 @@
21252077
"severity": "error"
21262078
}
21272079
]
2128-
}
2080+
}

infra/status-page/web/src/components/FerryPanel.tsx

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,51 @@ function runAppearance(run: FerryRun): { className: string; style?: CSSPropertie
2323
}
2424
}
2525

26+
// Flag runs whose wall time is at least SLOW_RUN_STDDEV_THRESHOLD standard
27+
// deviations longer than the mean of the preceding successful runs. Uses up
28+
// to SLOW_RUN_MAX samples, but requires at least SLOW_RUN_MIN so the baseline
29+
// doesn't collapse to noise on new/sparse workflows. Successful only —
30+
// failures/cancels/timeouts have unrepresentative wall times (early exits,
31+
// hangs) and would poison the baseline. history[0] is the most recent run,
32+
// so "prior" means higher indices.
33+
const SLOW_RUN_MIN = 3;
34+
const SLOW_RUN_MAX = 7;
35+
const SLOW_RUN_STDDEV_THRESHOLD = 1;
36+
37+
interface SlowRunBaseline {
38+
threshold: number;
39+
sampleSize: number;
40+
}
41+
42+
function slowRunBaseline(history: FerryRun[], index: number): SlowRunBaseline | null {
43+
const priorDurations: number[] = [];
44+
for (let j = index + 1; j < history.length && priorDurations.length < SLOW_RUN_MAX; j++) {
45+
const prev = history[j];
46+
if (prev.conclusion === "success" && prev.durationSeconds !== null) {
47+
priorDurations.push(prev.durationSeconds);
48+
}
49+
}
50+
if (priorDurations.length < SLOW_RUN_MIN) return null;
51+
const mean = priorDurations.reduce((a, b) => a + b, 0) / priorDurations.length;
52+
const variance =
53+
priorDurations.reduce((s, x) => s + (x - mean) ** 2, 0) / priorDurations.length;
54+
const stddev = Math.sqrt(variance);
55+
// σ=0 (all prior durations identical) still yields a valid threshold at
56+
// the mean — anything strictly slower than a perfectly stable baseline is
57+
// genuinely anomalous.
58+
return {
59+
threshold: mean + SLOW_RUN_STDDEV_THRESHOLD * stddev,
60+
sampleSize: priorDurations.length,
61+
};
62+
}
63+
64+
function isSlowRun(history: FerryRun[], index: number): boolean {
65+
const run = history[index];
66+
if (run.durationSeconds === null) return false;
67+
const baseline = slowRunBaseline(history, index);
68+
return baseline !== null && run.durationSeconds > baseline.threshold;
69+
}
70+
2671
function formatDuration(seconds: number | null): string {
2772
if (seconds === null) return "—";
2873
if (seconds < 60) return `${seconds}s`;
@@ -96,18 +141,42 @@ function WorkflowCard({ wf }: { wf: FerryWorkflowStatus }) {
96141
so all 30 fit on a ~340px phone content area without
97142
wrapping to a second row. */}
98143
<div className="mt-3 flex gap-px sm:gap-1">
99-
{wf.history.map((run) => {
144+
{wf.history.map((run, i) => {
100145
const a = runAppearance(run);
146+
const slow = isSlowRun(wf.history, i);
147+
const baseline = slow ? slowRunBaseline(wf.history, i) : null;
101148
return (
102149
<a
103150
key={run.id}
104151
href={run.url}
105152
target="_blank"
106153
rel="noreferrer"
107-
title={`${run.shaShort} · ${run.conclusion ?? run.status} · ${formatRelative(run.startedAt)}`}
108-
className={`h-5 w-2 rounded-sm sm:w-2.5 ${a.className} hover:ring-2 hover:ring-slate-400`}
154+
className={`group relative h-5 w-2 rounded-sm sm:w-2.5 ${a.className} hover:ring-2 hover:ring-slate-400`}
109155
style={a.style}
110-
/>
156+
>
157+
{slow && (
158+
<span
159+
aria-label="slow run"
160+
className="pointer-events-none absolute -right-0.5 -top-1 font-bold leading-none text-amber-300"
161+
style={{ fontSize: "10px", textShadow: "0 0 2px #0f172a, 0 0 2px #0f172a" }}
162+
>
163+
!
164+
</span>
165+
)}
166+
<div className="pointer-events-none absolute bottom-full left-1/2 z-10 mb-2 hidden -translate-x-1/2 whitespace-nowrap rounded border border-slate-700 bg-slate-950/95 px-2 py-1 text-xs text-slate-200 shadow-lg group-hover:block">
167+
<div className="font-mono text-slate-300">{run.shaShort}</div>
168+
<div className="text-slate-400">
169+
{run.conclusion ?? run.status} · {formatRelative(run.startedAt)}
170+
</div>
171+
<div>wall time: {formatDuration(run.durationSeconds)}</div>
172+
{slow && baseline !== null && (
173+
<div className="text-amber-300">
174+
slow · prior {baseline.sampleSize} successful runs mean+1σ ≈{" "}
175+
{formatDuration(Math.round(baseline.threshold))}
176+
</div>
177+
)}
178+
</div>
179+
</a>
111180
);
112181
})}
113182
</div>

lib/iris/src/iris/cluster/constraints.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,11 @@ def get_device_type_enum(device: job_pb2.DeviceConfig) -> DeviceType:
6767

6868

6969
def get_device_type(device: job_pb2.DeviceConfig) -> str:
70-
"""Extract device type from DeviceConfig."""
71-
if device.HasField("cpu"):
72-
return "cpu"
73-
if device.HasField("gpu"):
74-
return "gpu"
75-
if device.HasField("tpu"):
76-
return "tpu"
77-
return "cpu"
70+
"""Extract device type string from DeviceConfig.
71+
72+
Delegates to get_device_type_enum() to avoid duplicating the dispatch logic.
73+
"""
74+
return get_device_type_enum(device).value
7875

7976

8077
def get_device_variant(device: job_pb2.DeviceConfig) -> str | None:

0 commit comments

Comments
 (0)