|
| 1 | +name: Zephyr - Shuffle Integration Tests |
| 2 | + |
| 3 | +# Exercises the zephyr scatter/reduce shuffle at 10 GB across 4 scenarios |
| 4 | +# (uniform/skew × small/large items) by submitting iris jobs to marin-dev. |
| 5 | +# Each scenario runs as its own matrix leg and polls the iris job to a |
| 6 | +# terminal state. |
| 7 | + |
| 8 | +on: |
| 9 | + # Manual only for now — baseline Parquet shuffle OOMs on skew scenarios, |
| 10 | + # so a scheduled run would fail until the zstd-chunk shuffle format lands. |
| 11 | + # Add a cron once the shuffle format change is in main. |
| 12 | + workflow_dispatch: |
| 13 | + inputs: |
| 14 | + num_input_shards: # read into the job's NUM_INPUT_SHARDS env var and passed to the benchmark as --num-input-shards |
| 15 | + description: Input shard count (default 64) |
| 16 | + required: false |
| 17 | + default: '64' # the job env applies the same '64' fallback when the input is empty |
| 18 | + |
| 19 | +permissions: |
| 20 | + contents: read |
| 21 | + id-token: write # NOTE(review): the Google Cloud auth step in this workflow passes credentials_json; confirm whether an OIDC token is still required |
| 22 | + |
| 23 | +jobs: |
| 24 | + shuffle-itest: # one matrix leg per scenario; each leg submits one benchmark job and polls it |
| 25 | + runs-on: ubuntu-latest |
| 26 | + timeout-minutes: 90 # the only upper bound on the leg: the polling loop in the wait step has no timeout of its own |
| 27 | + concurrency: |
| 28 | + group: zephyr-shuffle-itest-${{ matrix.scenario }} # one group per scenario |
| 29 | + cancel-in-progress: true # NOTE(review): no step visible here stops the already-submitted iris job when a leg is cancelled — confirm that is acceptable |
| 30 | + |
| 31 | + strategy: |
| 32 | + fail-fast: false # a failing scenario does not cancel the other legs |
| 33 | + matrix: |
| 34 | + include: |
| 35 | + - scenario: uniform-small # 64 shards × 600000 items × 250 = 9.6e9 at the default shard count |
| 36 | + items_per_shard: '600000' |
| 37 | + item_bytes: '250' |
| 38 | + hot_shard_frac: '0.0' |
| 39 | + hot_key_pool: '0' |
| 40 | + - scenario: uniform-large # 64 shards × 160 items × 1000000 = 1.024e10 at the default shard count |
| 41 | + items_per_shard: '160' |
| 42 | + item_bytes: '1000000' |
| 43 | + hot_shard_frac: '0.0' |
| 44 | + hot_key_pool: '0' |
| 45 | + - scenario: skew90-small # same sizes as uniform-small, with non-zero hot-shard/hot-key settings |
| 46 | + items_per_shard: '600000' |
| 47 | + item_bytes: '250' |
| 48 | + hot_shard_frac: '0.9' # forwarded to the benchmark as --hot-shard-frac |
| 49 | + hot_key_pool: '128' # forwarded to the benchmark as --hot-key-pool |
| 50 | + - scenario: skew90-large # same sizes as uniform-large, with non-zero hot-shard/hot-key settings |
| 51 | + items_per_shard: '160' |
| 52 | + item_bytes: '1000000' |
| 53 | + hot_shard_frac: '0.9' |
| 54 | + hot_key_pool: '128' |
| 55 | + |
| 56 | + env: |
| 57 | + RUN_ID: zephyr-shuffle-itest-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }} # distinct per scenario, run and attempt; passed to the job as SMOKE_RUN_ID and --label |
| 58 | + IRIS_CONFIG: lib/iris/examples/marin-dev.yaml # given to every iris invocation via --config |
| 59 | + IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com # impersonated when adding and removing the OS Login SSH key |
| 60 | + NUM_INPUT_SHARDS: ${{ github.event.inputs.num_input_shards || '64' }} # falls back to 64 when the dispatch input is empty |
| 61 | + |
| 62 | + steps: |
| 63 | + - name: Checkout code |
| 64 | + uses: actions/checkout@v4 |
| 65 | + |
| 66 | + - name: Set up Python 3.12 |
| 67 | + uses: actions/setup-python@v5 |
| 68 | + with: |
| 69 | + python-version: "3.12" |
| 70 | + |
| 71 | + - name: Install uv |
| 72 | + uses: astral-sh/setup-uv@v7 |
| 73 | + with: |
| 74 | + enable-cache: true |
| 75 | + |
| 76 | + - name: Install dependencies # later steps invoke .venv/bin/iris, presumably created by this sync |
| 77 | + run: uv sync --all-packages --extra=cpu --no-default-groups |
| 78 | + |
| 79 | + - name: Authenticate to Google Cloud # uses a service-account key stored in the IRIS_CI_GCP_SA_KEY secret |
| 80 | + uses: google-github-actions/auth@v2 |
| 81 | + with: |
| 82 | + credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }} |
| 83 | + |
| 84 | + - name: Set up Google Cloud SDK |
| 85 | + uses: google-github-actions/setup-gcloud@v2 |
| 86 | + with: |
| 87 | + project_id: ${{ secrets.GCP_PROJECT_ID }} |
| 88 | + |
| 89 | + - name: Set up OS Login SSH key # generates a fresh passphrase-less key and registers it under the impersonated controller service account |
| 90 | + run: | |
| 91 | + mkdir -p ~/.ssh |
| 92 | + ssh-keygen -t rsa -b 4096 -f ~/.ssh/google_compute_engine -N "" -q -C "gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.scenario }}" |
| 93 | + chmod 600 ~/.ssh/google_compute_engine |
| 94 | + gcloud compute os-login ssh-keys add \ |
| 95 | + --key-file ~/.ssh/google_compute_engine.pub \ |
| 96 | + --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \ |
| 97 | + --ttl=6h # registration expires after 6h; the final cleanup step also removes the key explicitly |
| 98 | +
|
| 99 | + - name: Submit shuffle benchmark # submits the benchmark with --no-wait and exports the captured id as the job_id step output |
| 100 | + id: submit |
| 101 | + shell: bash -leo pipefail {0} # a custom shell string gets no implicit -e/pipefail; without them a failed submit still passed and exported an empty job_id |
| 102 | + run: | |
| 103 | + JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \ |
| 104 | + job run --no-wait \ |
| 105 | + --memory=2G --disk=8G --cpu=1 --extra=cpu \ |
| 106 | + -e SMOKE_RUN_ID "$RUN_ID" \ |
| 107 | + -- python lib/zephyr/tests/benchmark_shuffle.py \ |
| 108 | + --num-input-shards "$NUM_INPUT_SHARDS" \ |
| 109 | + --items-per-shard "${{ matrix.items_per_shard }}" \ |
| 110 | + --item-bytes "${{ matrix.item_bytes }}" \ |
| 111 | + --num-keys 50000 \ |
| 112 | + --max-workers 4 --worker-cpu 1 --worker-ram 8g \ |
| 113 | + --hot-shard-frac "${{ matrix.hot_shard_frac }}" \ |
| 114 | + --hot-key-pool "${{ matrix.hot_key_pool }}" \ |
| 115 | + --label "$RUN_ID") |
| 116 | + echo "job_id=${JOB_ID:?iris job run printed no job id}" >> "$GITHUB_OUTPUT" |
| 117 | + echo "Submitted job: $JOB_ID" |
| 118 | +
|
| 119 | + - name: Wait for shuffle benchmark # polls the job state every 30 s until it leaves the pending/building/running states |
| 120 | + shell: bash -l {0} |
| 121 | + run: | |
| 122 | + JOB_ID="${{ steps.submit.outputs.job_id }}" |
| 123 | + echo "Polling job status: $JOB_ID" |
| 124 | + while true; do |
| 125 | + STATE=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \ |
| 126 | + job list --json --prefix "$JOB_ID" \ |
| 127 | + | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty') # exact-id match; empty when no such job is listed |
| 128 | + case "$STATE" in |
| 129 | + JOB_STATE_SUCCEEDED) |
| 130 | + echo "Job succeeded" |
| 131 | + exit 0 |
| 132 | + ;; |
| 133 | + JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING) |
| 134 | + echo "$(date -u +%H:%M:%S) Job state: $STATE" |
| 135 | + sleep 30 |
| 136 | + ;; |
| 137 | + "") # nothing matched the exact id (a failed list call also ends up here) |
| 138 | + echo "Job not found: $JOB_ID" |
| 139 | + exit 1 |
| 140 | + ;; |
| 141 | + *) # any other state is treated as a terminal failure |
| 142 | + echo "Job finished with state: $STATE" |
| 143 | + .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \ |
| 144 | + job list --json --prefix "$JOB_ID" \ |
| 145 | + | jq --arg id "$JOB_ID" '.[] | select(.job_id == $id) | {job_id, state, error}' || true # best-effort dump restricted to this job, not every prefix match |
| 146 | + exit 1 |
| 147 | + ;; |
| 148 | + esac |
| 149 | + done |
| 150 | +
|
| 151 | + - name: Print benchmark result # prints the last RESULT: line found in the fetched job logs |
| 152 | + if: success() |
| 153 | + shell: bash -leo pipefail {0} # pipefail lets a non-matching grep reach the fallback echo; without it the pipeline reports tail's status and the fallback never runs |
| 154 | + run: | |
| 155 | + JOB_ID="${{ steps.submit.outputs.job_id }}" |
| 156 | + .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \ |
| 157 | + job logs "$JOB_ID" --max-lines 20 2>/dev/null \ |
| 158 | + | grep "RESULT:" | tail -1 || echo "No RESULT line found" |
| 159 | +
|
| 160 | + - name: Capture failure diagnostics # runs only after an earlier step failed; every command is best-effort |
| 161 | + if: failure() |
| 162 | + shell: bash -l {0} |
| 163 | + run: | |
| 164 | + JOB_ID="${{ steps.submit.outputs.job_id }}" |
| 165 | + echo "=== Job summary ===" |
| 166 | + .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \ |
| 167 | + job summary "$JOB_ID" 2>/dev/null || true |
| 168 | + echo "=== Recent logs ===" |
| 169 | + .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \ |
| 170 | + job logs "$JOB_ID" --max-lines 100 2>/dev/null | tail -60 || true # requests up to 100 lines, shows the last 60 |
| 171 | +
|
| 172 | + - name: Remove OS Login SSH key # unregisters the key added earlier, whatever the outcome of the preceding steps |
| 173 | + if: always() |
| 174 | + run: | |
| 175 | + gcloud compute os-login ssh-keys remove \ |
| 176 | + --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \ |
| 177 | + --key-file ~/.ssh/google_compute_engine.pub || true # cleanup failure must not fail the job |
0 commit comments