Log-store: narrow per-segment key pruning so reads don't need the full cap #1948
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Iris - CoreWeave CI | |
| on: | |
| pull_request: | |
| types: [opened, synchronize] | |
| paths: | |
| - "lib/iris/**" | |
| issue_comment: | |
| types: [created] | |
| workflow_dispatch: | |
| permissions: | |
| contents: read | |
| packages: write | |
| pull-requests: read # needed for issue_comment to access PR metadata | |
| statuses: write # post commit status from issue_comment trigger | |
| # Shared concurrency group with marin-canary-ferry-cw.yaml — both rebuild/roll | |
| # the shared iris-ci controller and submit against the shared H100 in | |
| # US-WEST-04A. Only one run cluster-wide at a time. cancel-in-progress=false | |
| # so a mid-flight canary is not killed by a PR firing. | |
| concurrency: | |
| group: iris-coreweave-ci-shared | |
| cancel-in-progress: false | |
| jobs: | |
| cw-ci-test: | |
| if: >- | |
| (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || | |
| github.event_name == 'workflow_dispatch' || | |
| ( | |
| github.event_name == 'issue_comment' && | |
| github.event.issue.pull_request && | |
| contains(github.event.comment.body, '/iris-ci-cw') && | |
| ( | |
| github.event.comment.author_association == 'MEMBER' || | |
| github.event.comment.author_association == 'COLLABORATOR' || | |
| github.event.comment.author_association == 'OWNER' | |
| ) | |
| ) | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 60 | |
| env: | |
| IRIS_NAMESPACE: iris-ci | |
| # Must match Labels(label_prefix).iris_managed from the cluster config | |
| IRIS_MANAGED_LABEL: iris-iris-ci-managed | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }} | |
| - name: Set commit status to pending | |
| if: github.event_name == 'issue_comment' | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| run: | | |
| sha=$(git rev-parse HEAD) | |
| gh api repos/${{ github.repository }}/statuses/"$sha" \ | |
| -f state=pending \ | |
| -f context="Iris CoreWeave CI" \ | |
| -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true | |
| - name: Set up Python 3.12 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: "3.12" | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| enable-cache: true | |
| cache-dependency-glob: "lib/iris/pyproject.toml" | |
| - name: Write kubeconfig | |
| run: | | |
| mkdir -p ~/.kube | |
| echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris | |
| chmod 600 ~/.kube/coreweave-iris | |
| - name: Set up Docker Buildx | |
| uses: docker/setup-buildx-action@v3 | |
| - name: Log in to GitHub Container Registry | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| username: ${{ github.actor }} | |
| password: ${{ secrets.GITHUB_TOKEN }} | |
| # Delete stale worker pods so the autoscaler recreates them with fresh images. | |
| # Nodepools (and their underlying nodes) survive — this is the "warm start". | |
| - name: Reset worker pods | |
| run: | | |
| export KUBECONFIG=~/.kube/coreweave-iris | |
| kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --force --grace-period=0 --ignore-not-found || true | |
| # Rebuild images and (re)start the controller. `cluster start` is fully | |
| # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service | |
| # and triggers a rollout restart, so both cold starts and warm restarts | |
| # work without needing to tunnel to an existing controller first. | |
| - name: Start controller | |
| env: | |
| R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} | |
| R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} | |
| run: | | |
| cd lib/iris && uv run --group dev iris -v \ | |
| --config=examples/coreweave-ci.yaml \ | |
| cluster start --fresh | |
| - name: Run integration tests | |
| env: | |
| WANDB_MODE: disabled | |
| WANDB_API_KEY: "" | |
| JAX_TRACEBACK_FILTERING: "off" | |
| # When set, the marin-on-iris test uploads fixtures and writes | |
| # intermediate data to S3 (R2) so remote Zephyr pods can access them. | |
| MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci | |
| AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} | |
| AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com | |
| FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}' | |
| run: | | |
| export KUBECONFIG=~/.kube/coreweave-iris | |
| # Wait for rollout to fully settle (old pod terminated, exactly 1 ready). | |
| kubectl rollout status deployment/iris-controller -n "$IRIS_NAMESPACE" --timeout=120s | |
| kubectl wait pod -n "$IRIS_NAMESPACE" -l app=iris-controller \ | |
| --for=condition=Ready --timeout=60s | |
| LOCAL_PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('',0)); print(s.getsockname()[1]); s.close()") | |
| kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" & | |
| PF_PID=$! | |
| echo "PF_PID=$PF_PID" >> "$GITHUB_ENV" | |
| echo "LOCAL_PORT=$LOCAL_PORT" >> "$GITHUB_ENV" | |
| IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}" | |
| # Wait for the port-forward tunnel to be usable. | |
| HEALTHY=false | |
| for i in $(seq 1 60); do | |
| if ! kill -0 "$PF_PID" 2>/dev/null; then | |
| echo "port-forward process died — restarting" | |
| kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" & | |
| PF_PID=$! | |
| echo "PF_PID=$PF_PID" >> "$GITHUB_ENV" | |
| sleep 2 | |
| continue | |
| fi | |
| if curl -sf "$IRIS_CONTROLLER_URL/health" > /dev/null 2>&1; then | |
| HEALTHY=true | |
| break | |
| fi | |
| sleep 5 | |
| done | |
| if [ "$HEALTHY" != "true" ]; then | |
| echo "Controller did not become healthy within timeout" | |
| exit 1 | |
| fi | |
| uv run pytest tests/integration/iris/ \ | |
| --controller-url "$IRIS_CONTROLLER_URL" \ | |
| -v --tb=short --timeout=600 \ | |
| -o "addopts=" \ | |
| -x | |
| - name: Run full integration pipeline | |
| env: | |
| WANDB_MODE: disabled | |
| WANDB_API_KEY: "" | |
| JAX_TRACEBACK_FILTERING: "off" | |
| MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci | |
| AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} | |
| AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com | |
| FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}' | |
| run: | | |
| export IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}" | |
| timeout 600 uv run pytest tests/test_integration_test.py \ | |
| -m integration -o "addopts=" --timeout=600 -v -s | |
| - name: Stop port-forward | |
| if: always() | |
| run: | | |
| [ -n "$PF_PID" ] && kill "$PF_PID" 2>/dev/null || true | |
| pkill -f "kubectl port-forward.*$IRIS_NAMESPACE" 2>/dev/null || true | |
| - name: Capture failure diagnostics | |
| if: failure() | |
| env: | |
| LOG_DIR: ${{ github.workspace }}/iris-cw-logs | |
| run: | | |
| export KUBECONFIG=~/.kube/coreweave-iris | |
| mkdir -p "$LOG_DIR" | |
| # Stream to the GH Actions log for quick triage… | |
| echo "=== Controller logs (tail) ===" | |
| kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true | |
| echo "=== Controller pod describe ===" | |
| kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true | |
| echo "=== Worker pods ===" | |
| kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" || true | |
| echo "=== Warning events ===" | |
| kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true | |
| # …and also persist per-pod logs + describe so failures in worker | |
| # containers are recoverable from the uploaded artifact, not just | |
| # the controller's view. | |
| kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers \ | |
| > "$LOG_DIR/controller.log" 2>&1 || true | |
| kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers --previous \ | |
| > "$LOG_DIR/controller-previous.log" 2>&1 || true | |
| kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller \ | |
| > "$LOG_DIR/controller-describe.txt" 2>&1 || true | |
| for pod in $(kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" -o name 2>/dev/null); do | |
| safe=$(echo "$pod" | tr '/' '-') | |
| kubectl -n "$IRIS_NAMESPACE" logs "$pod" --tail=-1 --all-containers \ | |
| > "$LOG_DIR/${safe}.log" 2>&1 || true | |
| kubectl -n "$IRIS_NAMESPACE" describe "$pod" \ | |
| > "$LOG_DIR/${safe}-describe.txt" 2>&1 || true | |
| done | |
| kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' \ | |
| > "$LOG_DIR/events.txt" 2>&1 || true | |
| - name: Upload failure diagnostics | |
| if: failure() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: iris-cw-ci-logs | |
| path: iris-cw-logs/ | |
| retention-days: 14 | |
| if-no-files-found: ignore | |
| - name: Set commit status to result | |
| if: always() && github.event_name == 'issue_comment' | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| run: | | |
| sha=$(git rev-parse HEAD) | |
| if [ "${{ job.status }}" = "success" ]; then | |
| state=success | |
| else | |
| state=failure | |
| fi | |
| gh api repos/${{ github.repository }}/statuses/"$sha" \ | |
| -f state="$state" \ | |
| -f context="Iris CoreWeave CI" \ | |
| -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" |