Skip to content

Commit c928d1c

Browse files
authored
Merge branch 'main' into grug_moe_heuristic
2 parents ceff1bb + 41825e6 commit c928d1c

File tree

278 files changed

+29188
-5030
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

278 files changed

+29188
-5030
lines changed

.agents/skills/fix-docs/SKILL.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
---
2+
name: fix-docs
3+
description: Fix markdown docs in `lib/iris`, `lib/zephyr`, and `lib/fray` to align with Marin's agent-doc principles. Use when asked to repair, modernize, or de-rot docs in those directories.
4+
---
5+
16
Your task is to fix the markdown docs within `lib/iris`, `lib/zephyr` and `lib/fray` so that they maximally comply with the principles below. Do NOT fix docs outside of the aforementioned directories.
27

38
Your output: You will dispatch sub-agents that will (1) thoroughly parse the code and the docs and (2) make all appropriate documentation changes locally. You will then combine the changes into a single local commit, inform the user of the commit, and summarize the changes you made. Under no circumstances should you push any commit to the repo without explicit approval from the user.

.github/workflows/iris-cloud-smoke-gcp.yaml

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,49 @@ jobs:
171171
if: failure()
172172
env:
173173
LOG_DIR: ${{ github.workspace }}/iris-cloud-logs
174+
LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
175+
PROJECT: ${{ secrets.GCP_PROJECT_ID }}
174176
run: |
175177
mkdir -p "$LOG_DIR"
178+
MANAGED_LABEL="iris-${LABEL_PREFIX}-managed"
179+
CONTROLLER_LABEL="iris-${LABEL_PREFIX}-controller"
180+
181+
# Always try to grab docker/host logs directly from every VM we
182+
# spun up. This works even when the controller never became
183+
# reachable (e.g. tunnel never opened, controller crashed during
184+
# boot), which is exactly the case where these logs matter most.
185+
gcloud compute instances list \
186+
--project="$PROJECT" \
187+
--filter="labels.${MANAGED_LABEL}=true OR labels.${CONTROLLER_LABEL}=true" \
188+
--format="csv[no-heading](name,zone,labels.list())" 2>/dev/null \
189+
| while IFS=, read -r name zone labels; do
190+
[ -z "$name" ] && continue
191+
role=worker
192+
case "$labels" in *"$CONTROLLER_LABEL"*) role=controller ;; esac
193+
echo "Fetching host logs from $role $name ($zone)"
194+
out="$LOG_DIR/${role}-${name}"
195+
gcloud compute ssh "$name" \
196+
--project="$PROJECT" --zone="$zone" \
197+
--impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
198+
--ssh-key-file ~/.ssh/google_compute_engine \
199+
--quiet \
200+
--command '
201+
set +e
202+
echo "=== docker ps -a ==="
203+
sudo docker ps -a
204+
for cid in $(sudo docker ps -aq); do
205+
echo "=== docker logs $cid ==="
206+
sudo docker logs --timestamps --tail 5000 "$cid" 2>&1
207+
done
208+
echo "=== startup script ==="
209+
sudo journalctl -u google-startup-scripts.service --no-pager 2>&1 | tail -n 2000
210+
echo "=== kernel/cloud-init ==="
211+
sudo journalctl -u cloud-final.service --no-pager 2>&1 | tail -n 500
212+
' > "${out}.log" 2>&1 || echo "ssh to $name failed (see ${out}.log)"
213+
done
214+
176215
if [ -z "$IRIS_CONTROLLER_URL" ]; then
177-
echo "No controller URL, skipping log collection"
216+
echo "No controller URL, skipping RPC-based log collection"
178217
exit 0
179218
fi
180219
cd lib/iris

.github/workflows/iris-coreweave-ci.yaml

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ jobs:
104104
run: |
105105
cd lib/iris && uv run --group dev iris -v \
106106
--config=examples/coreweave-ci.yaml \
107-
cluster start
107+
cluster start --fresh
108108
109109
- name: Run integration tests
110110
env:
@@ -185,9 +185,14 @@ jobs:
185185
186186
- name: Capture failure diagnostics
187187
if: failure()
188+
env:
189+
LOG_DIR: ${{ github.workspace }}/iris-cw-logs
188190
run: |
189191
export KUBECONFIG=~/.kube/coreweave-iris
190-
echo "=== Controller logs ==="
192+
mkdir -p "$LOG_DIR"
193+
194+
# Stream to the GH Actions log for quick triage…
195+
echo "=== Controller logs (tail) ==="
191196
kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true
192197
echo "=== Controller pod describe ==="
193198
kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true
@@ -196,6 +201,36 @@ jobs:
196201
echo "=== Warning events ==="
197202
kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true
198203
204+
# …and also persist per-pod logs + describe so failures in worker
205+
# containers are recoverable from the uploaded artifact, not just
206+
# the controller's view.
207+
kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers \
208+
> "$LOG_DIR/controller.log" 2>&1 || true
209+
kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers --previous \
210+
> "$LOG_DIR/controller-previous.log" 2>&1 || true
211+
kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller \
212+
> "$LOG_DIR/controller-describe.txt" 2>&1 || true
213+
214+
for pod in $(kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" -o name 2>/dev/null); do
215+
safe=$(echo "$pod" | tr '/' '-')
216+
kubectl -n "$IRIS_NAMESPACE" logs "$pod" --tail=-1 --all-containers \
217+
> "$LOG_DIR/${safe}.log" 2>&1 || true
218+
kubectl -n "$IRIS_NAMESPACE" describe "$pod" \
219+
> "$LOG_DIR/${safe}-describe.txt" 2>&1 || true
220+
done
221+
222+
kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' \
223+
> "$LOG_DIR/events.txt" 2>&1 || true
224+
225+
- name: Upload failure diagnostics
226+
if: failure()
227+
uses: actions/upload-artifact@v4
228+
with:
229+
name: iris-cw-ci-logs
230+
path: iris-cw-logs/
231+
retention-days: 14
232+
if-no-files-found: ignore
233+
199234
- name: Set commit status to result
200235
if: always() && github.event_name == 'issue_comment'
201236
env:

.github/workflows/iris-dev-restart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ name: Iris - Dev Cluster Daily Restart
22

33
on:
44
schedule:
5-
# Daily at 06:00 UTC
6-
- cron: "0 6 * * *"
5+
# Daily at 05:00 UTC — staggered before canary ferry (06:00 UTC)
6+
- cron: "0 5 * * *"
77
workflow_dispatch:
88

99
permissions:

.github/workflows/marin-canary-ferry-cw.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ jobs:
5858
enable-cache: true
5959

6060
- name: Install dependencies
61-
run: uv sync --all-packages --extra=cpu --no-default-groups
61+
run: uv sync --all-packages --extra=cpu --extra=controller --no-default-groups
6262

6363
- name: Write CoreWeave kubeconfig
6464
run: |
@@ -195,7 +195,7 @@ jobs:
195195
Read .agents/skills/canary-triage/SKILL.md and follow it.
196196
claude_args: |
197197
--model opus
198-
--max-turns 50
198+
--max-turns 500
199199
--allowedTools "Bash(kubectl:*),Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
200200
env:
201201
CANARY_LANE: gpu

.github/workflows/marin-canary-ferry.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ jobs:
165165
Read .agents/skills/canary-triage/SKILL.md and follow it.
166166
claude_args: |
167167
--model opus
168-
--max-turns 50
168+
--max-turns 500
169169
--allowedTools "Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
170170
env:
171171
CANARY_LANE: tpu

.github/workflows/marin-datakit-smoke.yaml

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
cancel-in-progress: true
2020
env:
2121
SMOKE_RUN_ID: datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}
22-
# MARIN_PREFIX is defaulted by the ferry entrypoint to marin_temp_bucket(ttl_days=1).
22+
FERRY_STATUS_PATH: gs://marin-tmp-us-central1/ttl=1d/ci/datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
2323
WANDB_ENTITY: marin-community
2424
WANDB_PROJECT: marin
2525
IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
@@ -70,6 +70,7 @@ jobs:
7070
job run --no-wait \
7171
--memory=2G --disk=4G --cpu=1 --extra=cpu \
7272
-e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
73+
-e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
7374
-e WANDB_ENTITY "$WANDB_ENTITY" \
7475
-e WANDB_PROJECT "$WANDB_PROJECT" \
7576
-e WANDB_API_KEY "$WANDB_API_KEY" \
@@ -113,12 +114,24 @@ jobs:
113114
esac
114115
done
115116
117+
- name: Read ferry status
118+
id: ferry_status
119+
shell: bash -l {0}
120+
run: |
121+
PREFIX=$(.venv/bin/python -c "
122+
import json
123+
from rigging.filesystem import url_to_fs
124+
fs, _ = url_to_fs('$FERRY_STATUS_PATH')
125+
with fs.open('$FERRY_STATUS_PATH') as f:
126+
print(json.load(f)['marin_prefix'])
127+
")
128+
echo "marin_prefix=$PREFIX" >> "$GITHUB_OUTPUT"
129+
echo "Ferry output prefix: $PREFIX"
130+
116131
- name: Validate datakit smoke outputs
117132
shell: bash -l {0}
118133
env:
119-
SMOKE_RUN_ID: ${{ env.SMOKE_RUN_ID }}
120-
# MARIN_PREFIX intentionally unset — validate script defaults via marin_temp_bucket,
121-
# matching the ferry entrypoint default.
134+
MARIN_PREFIX: ${{ steps.ferry_status.outputs.marin_prefix }}
122135
run: .venv/bin/python scripts/datakit/validate_ferry_outputs.py
123136

124137
- name: Capture failure diagnostics
@@ -143,7 +156,7 @@ jobs:
143156
Read .agents/skills/canary-triage/SKILL.md and follow it.
144157
claude_args: |
145158
--model opus
146-
--max-turns 50
159+
--max-turns 500
147160
--allowedTools "Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
148161
env:
149162
CANARY_LANE: datakit-smoke
.github/workflows/marin-infra-dashboard.yaml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
name: Marin - Infra Dashboard
2+
on:
3+
push:
4+
branches:
5+
- main
6+
pull_request:
7+
8+
9+
concurrency:
10+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
11+
cancel-in-progress: true
12+
13+
jobs:
14+
changes:
15+
runs-on: ubuntu-latest
16+
permissions:
17+
contents: read
18+
pull-requests: read
19+
outputs:
20+
should_run: ${{ steps.filter.outputs.relevant }}
21+
steps:
22+
- uses: actions/checkout@v4
23+
- uses: dorny/paths-filter@v3
24+
id: filter
25+
with:
26+
filters: |
27+
relevant:
28+
- 'infra/status-page/**'
29+
- '.github/workflows/marin-infra-dashboard.yaml'
30+
31+
build:
32+
needs: changes
33+
if: needs.changes.outputs.should_run == 'true'
34+
runs-on: ubuntu-latest
35+
timeout-minutes: 10
36+
defaults:
37+
run:
38+
working-directory: infra/status-page
39+
steps:
40+
- name: Checkout code
41+
uses: actions/checkout@v4
42+
43+
- name: Set up Node.js 20
44+
uses: actions/setup-node@v4
45+
with:
46+
node-version: '20'
47+
cache: 'npm'
48+
cache-dependency-path: infra/status-page/package-lock.json
49+
50+
- name: Install dependencies
51+
run: npm ci
52+
53+
- name: Lint
54+
run: npm run lint
55+
56+
- name: Typecheck
57+
run: npm run typecheck
58+
59+
- name: Build
60+
run: npm run build

experiments/defaults.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ def default_tokenize(
206206
*,
207207
sample_count: int | VersionedValue[int] | None = None,
208208
is_validation: bool = False,
209+
levanter_batch_size: int | None = None,
210+
resources: ResourceConfig | None = None,
211+
worker_resources: ResourceConfig | None = None,
209212
) -> ExecutorStep:
210213
"""
211214
Tokenizes a dataset using the specified tokenizer and Levanter's tokenization infrastructure.
@@ -228,6 +231,11 @@ def default_tokenize(
228231
An ExecutorStep that represents the tokenized dataset.
229232
"""
230233

234+
# Common kwargs for config constructors
235+
extra_kwargs: dict = {}
236+
if worker_resources is not None:
237+
extra_kwargs["worker_resources"] = worker_resources
238+
231239
# sniff out if it's a HuggingFace dataset
232240
if isinstance(dataset, HfDatasetSpec):
233241
config = HfTokenizeConfig(
@@ -237,6 +245,8 @@ def default_tokenize(
237245
tokenizer=ensure_versioned(tokenizer),
238246
format=format,
239247
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
248+
levanter_batch_size=levanter_batch_size,
249+
**extra_kwargs,
240250
)
241251
elif (
242252
isinstance(dataset, str)
@@ -250,6 +260,8 @@ def default_tokenize(
250260
tokenizer=ensure_versioned(tokenizer),
251261
format=format,
252262
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
263+
levanter_batch_size=levanter_batch_size,
264+
**extra_kwargs,
253265
)
254266
else:
255267
config = TokenizeConfig(
@@ -259,14 +271,16 @@ def default_tokenize(
259271
tokenizer=ensure_versioned(tokenizer),
260272
format=format,
261273
sample_count=ensure_versioned(sample_count) if sample_count is not None else None,
274+
levanter_batch_size=levanter_batch_size,
275+
**extra_kwargs,
262276
)
263277

264278
return ExecutorStep(
265279
name=os.path.join("tokenized", name),
266280
description=f"Tokenize raw text using the {tokenizer} tokenizer.",
267281
fn=remote(
268282
tokenize,
269-
resources=ResourceConfig.with_cpu(cpu=4, ram="16g", disk="10g"),
283+
resources=resources or ResourceConfig.with_cpu(cpu=4, ram="16g", disk="10g"),
270284
pip_dependency_groups=["cpu"],
271285
env_vars={
272286
"TRANSFORMERS_NO_TORCH": "1",

experiments/ferries/datakit_ferry.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@
77
Output paths are placed under ``$MARIN_PREFIX/datakit-smoke/$SMOKE_RUN_ID/...``.
88
"""
99

10+
import json
1011
import logging
1112
import os
1213

13-
from rigging.filesystem import marin_temp_bucket
14+
from rigging.filesystem import marin_temp_bucket, url_to_fs
1415
from rigging.log_setup import configure_logging
1516

1617
from fray import ResourceConfig
@@ -109,14 +110,30 @@ def build_steps(run_id: str) -> list[StepSpec]:
109110
return [downloaded, normalized, deduped, consolidated, tokenized]
110111

111112

113+
def _write_status(status: str, marin_prefix: str) -> None:
114+
"""Write ferry run status to FERRY_STATUS_PATH if set."""
115+
status_path = os.environ.get("FERRY_STATUS_PATH")
116+
if not status_path:
117+
return
118+
payload = json.dumps({"status": status, "marin_prefix": marin_prefix})
119+
fs, _ = url_to_fs(status_path)
120+
with fs.open(status_path, "w") as f:
121+
f.write(payload)
122+
logger.info("Wrote ferry status to %s", status_path)
123+
124+
112125
def main() -> None:
113126
configure_logging()
114127
if not os.environ.get("MARIN_PREFIX"):
115128
os.environ["MARIN_PREFIX"] = marin_temp_bucket(ttl_days=1)
116129

117-
logger.info("MARIN_PREFIX defaulted to %s", os.environ["MARIN_PREFIX"])
130+
marin_prefix = os.environ["MARIN_PREFIX"]
131+
logger.info("MARIN_PREFIX defaulted to %s", marin_prefix)
118132
run_id = os.environ["SMOKE_RUN_ID"]
133+
134+
_write_status("running", marin_prefix)
119135
StepRunner().run(build_steps(run_id))
136+
_write_status("succeeded", marin_prefix)
120137

121138

122139
if __name__ == "__main__":

0 commit comments

Comments
 (0)