# NOTE(review): the lines below are web-scrape residue (page chrome and the PR
# title repeated) that was pasted above the workflow. Commented out so the file
# parses as YAML; original text preserved:
#
# Skip to content
# Agent MoE Experiment: E=128 experts (up from E=64) #1963
# Agent MoE Experiment: E=128 experts (up from E=64)
# Agent MoE Experiment: E=128 experts (up from E=64) #1963
name: Iris - CoreWeave CI

# Triggers: PRs touching lib/iris, an authorized "/iris-ci-cw" PR comment
# (filtered in the job-level `if`), or a manual dispatch.
on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "lib/iris/**"
  issue_comment:
    types: [created]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  pull-requests: read # needed for issue_comment to access PR metadata
  statuses: write # post commit status from issue_comment trigger

# Shared concurrency group with marin-canary-ferry-cw.yaml — both rebuild/roll
# the shared iris-ci controller and submit against the shared H100 in
# US-WEST-04A. Only one run cluster-wide at a time. cancel-in-progress=false
# so a mid-flight canary is not killed by a PR firing.
concurrency:
  group: iris-coreweave-ci-shared
  cancel-in-progress: false
jobs:
  cw-ci-test:
    # Gate: run for (a) same-repo PRs (secrets are needed, so forks are
    # excluded), (b) manual dispatch, or (c) an "/iris-ci-cw" comment on a PR
    # from a MEMBER/COLLABORATOR/OWNER.
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-ci-cw') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}
- name: Set commit status to pending
if: github.event_name == 'issue_comment'
env:
GH_TOKEN: ${{ github.token }}
run: |
sha=$(git rev-parse HEAD)
gh api repos/${{ github.repository }}/statuses/"$sha" \
-f state=pending \
-f context="Iris CoreWeave CI" \
-f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "lib/iris/pyproject.toml"
- name: Write kubeconfig
run: |
mkdir -p ~/.kube
echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
chmod 600 ~/.kube/coreweave-iris
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# Delete stale worker pods so the autoscaler recreates them with fresh images.
# Nodepools (and their underlying nodes) survive — this is the "warm start".
- name: Reset worker pods
run: |
export KUBECONFIG=~/.kube/coreweave-iris
kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true
# Rebuild images and (re)start the controller. `cluster start` is fully
# idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
# and triggers a rollout restart, so both cold starts and warm restarts
# work without needing to tunnel to an existing controller first.
- name: Start controller
env:
R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
run: |
cd lib/iris && uv run --group dev iris -v \
--config=examples/coreweave-ci.yaml \
cluster start --fresh
- name: Run integration tests
env:
WANDB_MODE: disabled
WANDB_API_KEY: ""
JAX_TRACEBACK_FILTERING: off
# When set, the marin-on-iris test uploads fixtures and writes
# intermediate data to S3 (R2) so remote Zephyr pods can access them.
MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
run: |
export KUBECONFIG=~/.kube/coreweave-iris
# Wait for rollout to fully settle (old pod terminated, exactly 1 ready).
kubectl rollout status deployment/iris-controller -n "$IRIS_NAMESPACE" --timeout=120s
kubectl wait pod -n "$IRIS_NAMESPACE" -l app=iris-controller \
--for=condition=Ready --timeout=60s
LOCAL_PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('',0)); print(s.getsockname()[1]); s.close()")
kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" &
PF_PID=$!
echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"
echo "LOCAL_PORT=$LOCAL_PORT" >> "$GITHUB_ENV"
IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
# Wait for the port-forward tunnel to be usable.
HEALTHY=false
for i in $(seq 1 60); do
if ! kill -0 "$PF_PID" 2>/dev/null; then
echo "port-forward process died — restarting"
kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" &
PF_PID=$!
echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"
sleep 2
continue
fi
if curl -sf "$IRIS_CONTROLLER_URL/health" > /dev/null 2>&1; then
HEALTHY=true
break
fi
sleep 5
done
if [ "$HEALTHY" != "true" ]; then
echo "Controller did not become healthy within timeout"
exit 1
fi
uv run pytest tests/integration/iris/ \
--controller-url "$IRIS_CONTROLLER_URL" \
-v --tb=short --timeout=600 \
-o "addopts=" \
-x
- name: Run full integration pipeline
env:
WANDB_MODE: disabled
WANDB_API_KEY: ""
JAX_TRACEBACK_FILTERING: off
MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
run: |
export IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
timeout 600 uv run pytest tests/test_integration_test.py \
-m integration -o "addopts=" --timeout=600 -v -s
- name: Stop port-forward
if: always()
run: |
[ -n "$PF_PID" ] && kill "$PF_PID" 2>/dev/null || true
pkill -f "kubectl port-forward.*$IRIS_NAMESPACE" 2>/dev/null || true
- name: Capture failure diagnostics
if: failure()
env:
LOG_DIR: ${{ github.workspace }}/iris-cw-logs
run: |
export KUBECONFIG=~/.kube/coreweave-iris
mkdir -p "$LOG_DIR"
# Stream to the GH Actions log for quick triage…
echo "=== Controller logs (tail) ==="
kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true
echo "=== Controller pod describe ==="
kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true
echo "=== Worker pods ==="
kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" || true
echo "=== Warning events ==="
kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true
# …and also persist per-pod logs + describe so failures in worker
# containers are recoverable from the uploaded artifact, not just
# the controller's view.
kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers \
> "$LOG_DIR/controller.log" 2>&1 || true
kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers --previous \
> "$LOG_DIR/controller-previous.log" 2>&1 || true
kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller \
> "$LOG_DIR/controller-describe.txt" 2>&1 || true
for pod in $(kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" -o name 2>/dev/null); do
safe=$(echo "$pod" | tr '/' '-')
kubectl -n "$IRIS_NAMESPACE" logs "$pod" --tail=-1 --all-containers \
> "$LOG_DIR/${safe}.log" 2>&1 || true
kubectl -n "$IRIS_NAMESPACE" describe "$pod" \
> "$LOG_DIR/${safe}-describe.txt" 2>&1 || true
done
kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' \
> "$LOG_DIR/events.txt" 2>&1 || true
- name: Upload failure diagnostics
if: failure()
uses: actions/upload-artifact@v4
with:
name: iris-cw-ci-logs
path: iris-cw-logs/
retention-days: 14
if-no-files-found: ignore
- name: Set commit status to result
if: always() && github.event_name == 'issue_comment'
env:
GH_TOKEN: ${{ github.token }}
run: |
sha=$(git rev-parse HEAD)
if [ "${{ job.status }}" = "success" ]; then
state=success
else
state=failure
fi
gh api repos/${{ github.repository }}/statuses/"$sha" \
-f state="$state" \
-f context="Iris CoreWeave CI" \
-f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"