Merged
19 commits
1a6bc2e - research: iris controller dry-run mode analysis (rjpower, Mar 25, 2026)
5710dc5 - Add CoreWeave CI workflow for Iris PRs (rjpower, Mar 26, 2026)
87830a3 - Simplify: fix health check timeout, remove dead env var, trim comments (rjpower, Mar 26, 2026)
4c66d88 - Remove analysis document from PR (rjpower, Mar 26, 2026)
b6e9cb3 - Use `cluster start` instead of `cluster controller restart` for CI (rjpower, Mar 26, 2026)
a6f0c99 - Fix managed label selector, add fork guard, cleanup (rjpower, Mar 26, 2026)
ce841e8 - Add docker/setup-buildx-action to fix image build in CI (rjpower, Mar 26, 2026)
42c95f4 - fix: address PR review feedback on coreweave CI workflow (rjpower, Mar 26, 2026)
6e45031 - fix: use /health not /healthz for controller health check (rjpower, Mar 26, 2026)
ecdb5af - Remove test_port_allocation from integration tests (github-actions[bot], Mar 26, 2026)
2fc1593 - Fix exec_in_container for K8s direct provider (github-actions[bot], Mar 26, 2026)
ab18b7c - fix: use S3 prefix for marin-on-iris test so remote pods can access data (rjpower, Mar 26, 2026)
eb2f779 - fix: gate S3 usage on MARIN_CI_S3_PREFIX env var (rjpower, Mar 26, 2026)
e03ff4f - fix: submit executor as Iris job so child jobs inherit S3 env vars (rjpower, Mar 26, 2026)
78c6c61 - fix: only submit executor as Iris job on remote clusters (rjpower, Mar 26, 2026)
1ecd513 - fix: replace datasets.load_dataset with fsspec in read_dataset_streaming (rjpower, Mar 26, 2026)
49201af - fix: use os.makedirs for local /tmp path in classifier model download (rjpower, Mar 26, 2026)
c54fb0d - fix: remove fasttext classifier steps from integration test (rjpower, Mar 26, 2026)
f586ed8 - refactor: run marin-on-iris test as standalone script for streaming logs (rjpower, Mar 27, 2026)
202 changes: 202 additions & 0 deletions .github/workflows/iris-coreweave-ci.yaml
@@ -0,0 +1,202 @@
name: Iris - CoreWeave CI

on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "lib/iris/**"
  issue_comment:
    types: [created]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  pull-requests: read # needed for issue_comment to access PR metadata
  statuses: write # post commit status from issue_comment trigger

# Single concurrency group — only one CW CI run at a time across all PRs.
# The warm cluster is shared; concurrent runs would conflict.
concurrency:
  group: iris-coreweave-ci
  cancel-in-progress: false

jobs:
  cw-ci-test:
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-ci-cw') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      - name: Set commit status to pending
        if: github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state=pending \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Write kubeconfig
        run: |
          mkdir -p ~/.kube
          echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Delete stale worker pods so the autoscaler recreates them with fresh images.
      # Nodepools (and their underlying nodes) survive — this is the "warm start".
      - name: Reset worker pods
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true

      # Rebuild images and (re)start the controller. `cluster start` is fully
      # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
      # and triggers a rollout restart, so both cold starts and warm restarts
      # work without needing to tunnel to an existing controller first.
      - name: Start controller
        env:
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/coreweave-ci.yaml \
            cluster start

      - name: Run integration tests
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: off
          # When set, the marin-on-iris test uploads fixtures and writes
          # intermediate data to S3 (R2) so remote Zephyr pods can access them.
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc 10000:10000 &
          PF_PID=$!
          echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"

          IRIS_CONTROLLER_URL="http://localhost:10000"

          # Controller deployment is already confirmed ready by `cluster start`;
          # this just waits for the port-forward to be usable.
          HEALTHY=false
          for i in $(seq 1 60); do
            if ! kill -0 "$PF_PID" 2>/dev/null; then
              echo "port-forward process died unexpectedly"
              exit 1
            fi
            if curl -sf "$IRIS_CONTROLLER_URL/health" > /dev/null 2>&1; then
              HEALTHY=true
              break
            fi
            sleep 5
          done
          if [ "$HEALTHY" != "true" ]; then
            echo "Controller did not become healthy within timeout"
            exit 1
          fi

          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
[Review comment on lines +148 to +150, P1: Install test deps before running integration pytest]
This step invokes `uv run pytest ... --timeout=600` from the repo root, but the workflow never installs dev/test dependencies for the root workspace (unlike .github/workflows/iris-integration.yaml, which runs `uv sync ... --group dev --extra=cpu --extra=dedup` first). In this configuration, required pytest plugins/deps (notably pytest-timeout for --timeout) may be missing, so the job can fail before executing the integration suite.

            -o "addopts=" \
            -x

      - name: Run full integration pipeline
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: off
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          IRIS_CONTROLLER_URL="http://localhost:10000"
          timeout 600 uv run tests/integration/iris/run_iris_full_integration.py \
            --controller-url "$IRIS_CONTROLLER_URL"

      - name: Stop port-forward
        if: always()
        run: |
          [ -n "$PF_PID" ] && kill "$PF_PID" 2>/dev/null || true
          pkill -f "kubectl port-forward.*$IRIS_NAMESPACE" 2>/dev/null || true

      - name: Capture failure diagnostics
        if: failure()
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          echo "=== Controller logs ==="
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true
          echo "=== Controller pod describe ==="
          kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true
          echo "=== Worker pods ==="
          kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" || true
          echo "=== Warning events ==="
          kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true

      - name: Set commit status to result
        if: always() && github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          if [ "${{ job.status }}" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state="$state" \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
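The wait-for-port-forward logic in the workflow above is a generic bounded health poll: try a probe up to N times with a fixed pause, succeed on the first healthy response. A minimal Python sketch of the same pattern, useful for local debugging; the `wait_healthy` helper and its parameters are illustrative, not part of Iris:

```python
import time
from typing import Callable


def wait_healthy(
    probe: Callable[[], bool],
    attempts: int = 60,
    interval: float = 5.0,
    sleep: Callable[[float], None] = time.sleep,
) -> bool:
    """Poll `probe` up to `attempts` times, pausing `interval` seconds between tries.

    Returns True as soon as a probe succeeds, False if every attempt fails,
    mirroring the shell loop that curls /health until the port-forward is usable.
    The `sleep` parameter is injectable so tests can skip real waiting.
    """
    for _ in range(attempts):
        if probe():
            return True
        sleep(interval)
    return False
```

With `attempts=60` and `interval=5.0` this matches the workflow's worst-case wait of about five minutes before giving up.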
11 changes: 10 additions & 1 deletion .github/workflows/iris-integration.yaml
@@ -69,14 +69,23 @@ jobs:
       run: |
         uv run pytest tests/integration/iris/ \
           --controller-url "$IRIS_CONTROLLER_URL" \
-          -v --tb=short --timeout=600 \
+          -v -s --log-cli-level=INFO --tb=short --timeout=600 \
           -o "addopts=" \
           -x
       env:
         WANDB_MODE: disabled
         WANDB_API_KEY: ""
         JAX_TRACEBACK_FILTERING: off

+      - name: Run full integration pipeline
+        run: |
+          timeout 600 uv run tests/integration/iris/run_iris_full_integration.py \
+            --controller-url "$IRIS_CONTROLLER_URL"
+        env:
+          WANDB_MODE: disabled
+          WANDB_API_KEY: ""
+          JAX_TRACEBACK_FILTERING: off

       - name: Stop cluster
         if: always()
         run: kill $CLUSTER_PID 2>/dev/null || true
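The added pipeline step bounds the script with coreutils `timeout 600`, so a hung run fails rather than eating the job's remaining minutes. The same guard can be expressed from Python via `subprocess.run`'s `timeout` parameter; a small sketch under the assumption that exit code 124 (coreutils' timed-out code) is an acceptable sentinel — the `run_bounded` helper is illustrative:

```python
import subprocess


def run_bounded(cmd: list[str], limit_s: float = 600) -> int:
    """Run cmd with a wall-clock limit, like `timeout 600 <cmd>` in shell.

    subprocess.run kills the child process and raises TimeoutExpired when
    the limit is hit; we translate that into 124, coreutils' exit code for
    a timed-out command, so callers see a uniform integer result.
    """
    try:
        return subprocess.run(cmd, timeout=limit_s).returncode
    except subprocess.TimeoutExpired:
        return 124
```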
91 changes: 91 additions & 0 deletions lib/iris/examples/coreweave-ci.yaml
@@ -0,0 +1,91 @@
# Persistent CoreWeave CI cluster. Both scale groups are pinned at min=max=1
# so nodes stay warm between runs — only controller and worker pods are reset.

platform:
  label_prefix: iris-ci
  coreweave:
    region: US-WEST-04A
    namespace: iris-ci
    kubeconfig_path: ~/.kube/coreweave-iris
    object_storage_endpoint: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com

storage:
  remote_state_dir: s3://marin-na/iris/state/ci

kubernetes_provider:
  namespace: iris-ci
  default_image: ghcr.io/marin-community/iris-task:latest
  host_network: true
  cache_dir: /mnt/local/iris-cache
  controller_address: http://iris-ci-controller-svc.iris-ci.svc.cluster.local:10000

controller:
  image: ghcr.io/marin-community/iris-controller:latest
  coreweave:
    port: 10000
    service_name: iris-ci-controller-svc
    scale_group: cpu-erapids

defaults:
  autoscaler:
    evaluation_interval:
      milliseconds: 10000
    scale_up_delay:
      milliseconds: 60000
    scale_down_delay:
      milliseconds: 300000
    startup_grace_period:
      milliseconds: 1200000 # 20 min — nodes are pinned warm so this rarely fires
  task_env:
    MARIN_PREFIX: s3://marin-na/marin
  worker:
    docker_image: ghcr.io/marin-community/iris-worker:latest
    port: 10001
    cache_dir: /mnt/local/iris-cache
    runtime: kubernetes
    default_task_image: ghcr.io/marin-community/iris-task:latest

scale_groups:
  cpu-erapids:
    num_vms: 1
    resources:
      cpu: 64
      ram: 256GB
      disk: 1TB
      device_type: cpu
    preemptible: false
    worker:
      attributes:
        region: US-WEST-04A
        pool: cpu-erapids
    min_slices: 1
    max_slices: 1
    priority: 50
    slice_template:
      num_vms: 1
      coreweave:
        region: US-WEST-04A
        instance_type: cd-gp-i64-erapids

  h100-8x:
    num_vms: 1
    resources:
      cpu: 128
      ram: 2048GB
      disk: 1TB
      device_type: gpu
      device_variant: H100
      device_count: 8
    preemptible: false
    worker:
      attributes:
        region: US-WEST-04A
        pool: h100-8x
    min_slices: 1
    max_slices: 1
    priority: 100
    slice_template:
      num_vms: 1
      coreweave:
        region: US-WEST-04A
        instance_type: gd-8xh100ib-i128
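The warm-start behavior rests on the autoscaler clamping its demand-driven slice target into the `[min_slices, max_slices]` window; with both set to 1, each pool is pinned at exactly one slice no matter what the jobs request. A hypothetical illustration of that clamp (not Iris's actual autoscaler code):

```python
def desired_slices(demand: int, min_slices: int, max_slices: int) -> int:
    """Clamp a demand-driven slice target into the configured window.

    With min_slices == max_slices == 1, as in both scale groups above,
    the result is always 1, so nodes are never scaled away between runs
    and never grow beyond the single warm slice.
    """
    return max(min_slices, min(demand, max_slices))
```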
12 changes: 12 additions & 0 deletions lib/iris/src/iris/cluster/controller/service.py
@@ -1870,6 +1870,18 @@ def exec_in_container(

        task_worker_id = task.worker_id
        if not task_worker_id:
            if self._controller.has_direct_provider:
                provider = self._controller.provider
                timeout = request.timeout_seconds if request.timeout_seconds else 60
                resp = provider.exec_in_container(
                    task.task_id.to_wire(), task.current_attempt_id, list(request.command), timeout
                )
                return cluster_pb2.Controller.ExecInContainerResponse(
                    exit_code=resp.exit_code,
                    stdout=resp.stdout,
                    stderr=resp.stderr,
                    error=resp.error,
                )
            raise ConnectError(Code.FAILED_PRECONDITION, f"Task {request.task_id} not assigned to a worker")

        worker = _read_worker(self._db, task_worker_id)
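The `timeout = request.timeout_seconds if request.timeout_seconds else 60` line leans on proto3's convention that an unset scalar field reads as 0, so a zero (unset) timeout is coerced to the 60-second default while explicit positive values pass through. Isolated as a standalone sketch (the helper name is illustrative, not from the diff):

```python
def effective_timeout(timeout_seconds: int, default: int = 60) -> int:
    """Normalize a proto3 timeout field.

    proto3 scalar fields have no presence for plain ints: an unset
    timeout_seconds arrives as 0, which is falsy, so it falls back to
    the default; any explicit positive value is used as-is.
    """
    return timeout_seconds if timeout_seconds else default
```

A consequence worth noting: a caller cannot request a literal 0-second timeout through this path, which is the usual trade-off when 0 doubles as "unset".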
20 changes: 20 additions & 0 deletions lib/iris/src/iris/cluster/providers/k8s/tasks.py
@@ -693,6 +693,26 @@ def profile_task(
        except Exception as e:
            return cluster_pb2.ProfileTaskResponse(error=str(e))

    def exec_in_container(
        self,
        task_id: str,
        attempt_id: int,
        command: list[str],
        timeout_seconds: int = 60,
    ) -> cluster_pb2.Worker.ExecInContainerResponse:
        """Execute a command in a running task pod via kubectl exec."""
        pod_name = _pod_name(JobName.from_wire(task_id), attempt_id)
        effective_timeout: float | None = timeout_seconds if timeout_seconds >= 0 else None
        try:
            result = self.kubectl.exec(pod_name, command, container="task", timeout=effective_timeout)
            return cluster_pb2.Worker.ExecInContainerResponse(
                exit_code=result.returncode,
                stdout=result.stdout,
                stderr=result.stderr,
            )
        except Exception as e:
            return cluster_pb2.Worker.ExecInContainerResponse(error=str(e))

    def close(self) -> None:
        """No persistent resources to release."""
@@ -81,7 +81,7 @@ def load_model(self):

        with FileLock(lock_file):
            if not os.path.exists(success_file):
-                fs.makedirs(f"/tmp/{model_descriptor}", exist_ok=True)
+                os.makedirs(f"/tmp/{model_descriptor}", exist_ok=True)

                if is_remote_or_local_path:
                    fs.get(fs_path, local_filepath)