marin-community
diff --git a/‎.github/workflows/zephyr-shuffle-itest.yaml‎
Lines changed: 178 additions & 0 deletions b/‎.github/workflows/zephyr-shuffle-itest.yaml‎
Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
+name: Zephyr - Shuffle Integration Tests
+
+# Exercises the zephyr scatter/reduce shuffle at 10 GB across 4 scenarios
+# (uniform/skew × small/large items) by submitting iris jobs to marin-dev.
+# Each scenario runs as its own matrix leg and polls the iris job to a
+# terminal state.
+
+on:
+  # Manual trigger only for the moment: the baseline Parquet shuffle OOMs on
+  # the skew scenarios, so a scheduled run would keep failing until the
+  # zstd-chunk shuffle format lands. Add a cron once that change is in main.
+  workflow_dispatch:
+    inputs:
+      num_input_shards:
+        description: "Input shard count (default 64)"
+        required: false
+        default: "64"
+
+permissions:
+  contents: read
+  # NOTE(review): id-token: write is what OIDC / Workload Identity Federation
+  # needs; the auth step in this workflow uses a JSON key instead. Confirm
+  # whether this permission is still required.
+  id-token: write
+
+jobs:
+  shuffle-itest:
+    runs-on: ubuntu-latest
+    # Upper bound for a single matrix leg, polling time included.
+    timeout-minutes: 180
+
+    # One leg per scenario. fail-fast is off so a failing scenario does not
+    # cancel the remaining ones.
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Values are forwarded verbatim to the benchmark script's
+          # --items-per-shard, --item-bytes, --hot-shard-frac and
+          # --hot-key-pool flags in the submit step.
+          - scenario: uniform-small
+            items_per_shard: "600000"
+            item_bytes: "250"
+            hot_shard_frac: "0.0"
+            hot_key_pool: "0"
+          - scenario: uniform-large
+            items_per_shard: "160"
+            item_bytes: "1000000"
+            hot_shard_frac: "0.0"
+            hot_key_pool: "0"
+          - scenario: skew90-small
+            items_per_shard: "600000"
+            item_bytes: "250"
+            hot_shard_frac: "0.9"
+            hot_key_pool: "128"
+          - scenario: skew90-large
+            items_per_shard: "160"
+            item_bytes: "1000000"
+            hot_shard_frac: "0.9"
+            hot_key_pool: "128"
+
+    # A newer run of the same scenario cancels the one still in progress.
+    concurrency:
+      group: zephyr-shuffle-itest-${{ matrix.scenario }}
+      cancel-in-progress: true
+
+    env:
+      # Unique per scenario, workflow run and re-run attempt.
+      RUN_ID: zephyr-shuffle-itest-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }}
+      IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
+      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
+      # Falls back to 64 when the dispatch input is empty.
+      NUM_INPUT_SHARDS: ${{ github.event.inputs.num_input_shards || '64' }}
+
+    steps:
+      # Fetch the repository; later steps reference files from it
+      # (the iris config and the benchmark script path).
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      # Later steps call .venv/bin/iris from the environment synced here.
+      - name: Install dependencies
+        run: uv sync --all-packages --extra=cpu --no-default-groups
+
+      # Authenticates with the service-account JSON key stored in repo secrets.
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: "${{ secrets.IRIS_CI_GCP_SA_KEY }}"
+
+      # Provides the gcloud CLI used by the OS Login key steps.
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+        with:
+          project_id: "${{ secrets.GCP_PROJECT_ID }}"
+
+      - name: Set up OS Login SSH key
+        run: |
+          # Generate a throwaway RSA key pair whose comment identifies the
+          # workflow run, attempt and scenario.
+          KEY_PATH=~/.ssh/google_compute_engine
+          KEY_COMMENT="gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.scenario }}"
+          mkdir -p ~/.ssh
+          ssh-keygen -q -t rsa -b 4096 -N "" -C "$KEY_COMMENT" -f "$KEY_PATH"
+          chmod 600 "$KEY_PATH"
+          # Register the public key via OS Login while impersonating the iris
+          # controller service account; the 6h TTL lets it expire on its own
+          # if the cleanup step never runs.
+          gcloud compute os-login ssh-keys add \
+            --key-file "${KEY_PATH}.pub" \
+            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
+            --ttl=6h
+
+      - name: Submit shuffle benchmark
+        id: submit
+        shell: bash -l {0}
+        run: |
+          # Submits the benchmark as an iris job without waiting for it and
+          # exposes the job id as the `job_id` step output.
+          #
+          # The custom `shell:` template above is run as written, i.e. without
+          # `-e` / `-o pipefail`, so enable strict mode explicitly. Otherwise a
+          # failed submission would leave JOB_ID empty and this step would
+          # still report success.
+          set -euo pipefail
+          JOB_ID=$(.venv/bin/iris --config="$IRIS_CONFIG" \
+            job run --no-wait --priority production \
+            --memory=2G --disk=8G --cpu=1 --extra=cpu \
+            -e SMOKE_RUN_ID "$RUN_ID" \
+            -- python lib/zephyr/tests/benchmark_shuffle.py \
+                 --num-input-shards "$NUM_INPUT_SHARDS" \
+                 --items-per-shard "${{ matrix.items_per_shard }}" \
+                 --item-bytes "${{ matrix.item_bytes }}" \
+                 --num-keys 50000 \
+                 --max-workers 4 --worker-cpu 1 --worker-ram 8g \
+                 --hot-shard-frac "${{ matrix.hot_shard_frac }}" \
+                 --hot-key-pool "${{ matrix.hot_key_pool }}" \
+                 --repeat 3 \
+                 --label "$RUN_ID")
+          # A zero exit status with no id on stdout is still a failed submit.
+          if [ -z "$JOB_ID" ]; then
+            echo "::error::iris job run did not return a job id"
+            exit 1
+          fi
+          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
+          echo "Submitted job: $JOB_ID"
+
+      - name: Wait for shuffle benchmark
+        shell: bash -l {0}
+        run: |
+          # Polls the submitted iris job every 30 s until it leaves the
+          # pending/building/running states. Exits 0 only on
+          # JOB_STATE_SUCCEEDED; any other terminal state fails the step.
+          set -uo pipefail
+          JOB_ID="${{ steps.submit.outputs.job_id }}"
+          # An empty id would turn `--prefix ""` into "match every job".
+          if [ -z "$JOB_ID" ]; then
+            echo "::error::Submit step produced no job id"
+            exit 1
+          fi
+          echo "Polling job status: $JOB_ID"
+          # A lookup that fails or returns nothing is retried a few times so a
+          # single transient controller/network error does not fail a
+          # multi-hour run. The counter resets whenever the job is seen again.
+          MAX_MISSES=5
+          MISSES=0
+          while true; do
+            STATE=$(.venv/bin/iris --config="$IRIS_CONFIG" \
+              job list --json --prefix "$JOB_ID" \
+              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty') || STATE=""
+            case "$STATE" in
+              JOB_STATE_SUCCEEDED)
+                echo "Job succeeded"
+                exit 0
+                ;;
+              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
+                MISSES=0
+                echo "$(date -u +%H:%M:%S) Job state: $STATE"
+                sleep 30
+                ;;
+              "")
+                MISSES=$((MISSES + 1))
+                if [ "$MISSES" -ge "$MAX_MISSES" ]; then
+                  echo "Job not found: $JOB_ID"
+                  exit 1
+                fi
+                echo "$(date -u +%H:%M:%S) Job lookup returned nothing (attempt $MISSES/$MAX_MISSES), retrying"
+                sleep 30
+                ;;
+              *)
+                echo "Job finished with state: $STATE"
+                # Best-effort dump of this job only: --prefix can also match
+                # other jobs, so filter on the exact id.
+                .venv/bin/iris --config="$IRIS_CONFIG" \
+                  job list --json --prefix "$JOB_ID" \
+                  | jq --arg id "$JOB_ID" '.[] | select(.job_id == $id) | {job_id, state, error}' || true
+                exit 1
+                ;;
+            esac
+          done
+
+      - name: Print benchmark results
+        if: success()
+        shell: bash -l {0}
+        run: |
+          JOB_ID="${{ steps.submit.outputs.job_id }}"
+          # Show only the RESULT: lines from the fetched job logs; print a
+          # placeholder when grep matches nothing.
+          if ! .venv/bin/iris --config="$IRIS_CONFIG" \
+                 job logs "$JOB_ID" --max-lines 200 2>/dev/null \
+               | grep "RESULT:"; then
+            echo "No RESULT lines found"
+          fi
+
+      - name: Capture failure diagnostics
+        if: failure()
+        shell: bash -l {0}
+        run: |
+          # Best-effort: prints the iris job summary and the tail of its logs
+          # after an earlier step failed. Nothing here may fail the step.
+          JOB_ID="${{ steps.submit.outputs.job_id }}"
+          # Without a job id there is nothing to query, and the CLI errors
+          # would be hidden by the stderr redirects below, so say so.
+          if [ -z "$JOB_ID" ]; then
+            echo "No job id available (submission did not produce one); skipping diagnostics"
+            exit 0
+          fi
+          echo "=== Job summary ==="
+          .venv/bin/iris --config="$IRIS_CONFIG" \
+            job summary "$JOB_ID" 2>/dev/null || true
+          echo "=== Recent logs ==="
+          .venv/bin/iris --config="$IRIS_CONFIG" \
+            job logs "$JOB_ID" --max-lines 100 2>/dev/null | tail -60 || true
+
+      - name: Remove OS Login SSH key
+        if: always()
+        run: |
+          # Best-effort cleanup that runs whatever the outcome of the earlier
+          # steps; a failed removal is ignored.
+          PUB_KEY=~/.ssh/google_compute_engine.pub
+          gcloud compute os-login ssh-keys remove \
+            --key-file "$PUB_KEY" \
+            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" || true