Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 178 additions & 0 deletions .github/workflows/zephyr-shuffle-itest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
name: Zephyr - Shuffle Integration Tests

# Runs the zephyr scatter/reduce shuffle at 10 GB in four scenarios
# (uniform/skew × small/large items). Every scenario is a separate matrix leg
# that submits an iris job to marin-dev and then polls that job until it
# reaches a terminal state.

on:
  # Manual dispatch only for now: the baseline Parquet shuffle OOMs on the
  # skew scenarios, so a scheduled run would keep failing until the zstd-chunk
  # shuffle format lands. Add a cron trigger once that change is in main.
  workflow_dispatch:
    inputs:
      num_input_shards:
        description: 'Input shard count (default 64)'
        required: false
        default: '64'

# Least privilege for GITHUB_TOKEN: the job only needs to read the repository.
# Google Cloud authentication in this workflow uses a service-account key
# (`credentials_json`), which does not request a GitHub OIDC token, so
# `id-token: write` is intentionally not granted. Re-add it only if the auth
# step is migrated to Workload Identity Federation.
permissions:
  contents: read

jobs:
  # One matrix leg per shuffle scenario. Each leg submits a benchmark job to
  # iris, polls it until it reaches a terminal state, then prints either the
  # RESULT lines (on success) or a job summary and recent logs (on failure).
  shuffle-itest:
    runs-on: ubuntu-latest
    # Upper bound for the whole leg, including the polling loop in
    # "Wait for shuffle benchmark", which has no deadline of its own.
    timeout-minutes: 180
    # A newer run of the same scenario cancels the older GitHub job.
    # NOTE(review): nothing in this workflow stops the already-submitted iris
    # job when the GitHub job is cancelled or times out — confirm whether an
    # explicit stop step is needed.
    concurrency:
      group: zephyr-shuffle-itest-${{ matrix.scenario }}
      cancel-in-progress: true

    strategy:
      # Let every scenario run to completion even when another one fails.
      fail-fast: false
      matrix:
        # All values are quoted so they reach the benchmark CLI verbatim as
        # strings (e.g. '0.0' is not re-typed to a float).
        include:
          - scenario: uniform-small
            items_per_shard: '600000'
            item_bytes: '250'
            hot_shard_frac: '0.0'
            hot_key_pool: '0'
          - scenario: uniform-large
            items_per_shard: '160'
            item_bytes: '1000000'
            hot_shard_frac: '0.0'
            hot_key_pool: '0'
          - scenario: skew90-small
            items_per_shard: '600000'
            item_bytes: '250'
            hot_shard_frac: '0.9'
            hot_key_pool: '128'
          - scenario: skew90-large
            items_per_shard: '160'
            item_bytes: '1000000'
            hot_shard_frac: '0.9'
            hot_key_pool: '128'

    env:
      # Unique per scenario, workflow run and re-run attempt; used as the
      # benchmark label and exported to the iris job as SMOKE_RUN_ID.
      RUN_ID: zephyr-shuffle-itest-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }}
      IRIS_CONFIG: lib/iris/examples/marin-dev.yaml
      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
      # Falls back to 64 when the dispatch input is empty or absent.
      NUM_INPUT_SHARDS: ${{ github.event.inputs.num_input_shards || '64' }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      # Generates a throwaway key pair and registers the public half with
      # OS Login while impersonating the iris controller service account. The
      # key is removed again by the final `always()` step; the 6h TTL is a
      # backstop in case that step never runs.
      - name: Set up OS Login SSH key
        run: |
          mkdir -p ~/.ssh
          ssh-keygen -t rsa -b 4096 -f ~/.ssh/google_compute_engine -N "" -q -C "gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.scenario }}"
          chmod 600 ~/.ssh/google_compute_engine
          gcloud compute os-login ssh-keys add \
            --key-file ~/.ssh/google_compute_engine.pub \
            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
            --ttl=6h

      - name: Submit shuffle benchmark
        id: submit
        shell: bash -l {0}
        run: |
          # A custom shell string (`bash -l {0}`) does not get GitHub's default
          # `-eo pipefail`, so enable strict mode explicitly; otherwise a
          # failed submission would still leave this step green with an empty
          # job_id output.
          set -euo pipefail

          # Reject a malformed dispatch input before spending a
          # production-priority job on it.
          if ! [[ "$NUM_INPUT_SHARDS" =~ ^[1-9][0-9]*$ ]]; then
            echo "num_input_shards must be a positive integer, got: '$NUM_INPUT_SHARDS'" >&2
            exit 1
          fi

          JOB_ID=$(.venv/bin/iris --config="$IRIS_CONFIG" \
            job run --no-wait --priority production \
            --memory=2G --disk=8G --cpu=1 --extra=cpu \
            -e SMOKE_RUN_ID "$RUN_ID" \
            -- python lib/zephyr/tests/benchmark_shuffle.py \
            --num-input-shards "$NUM_INPUT_SHARDS" \
            --items-per-shard "${{ matrix.items_per_shard }}" \
            --item-bytes "${{ matrix.item_bytes }}" \
            --num-keys 50000 \
            --max-workers 4 --worker-cpu 1 --worker-ram 8g \
            --hot-shard-frac "${{ matrix.hot_shard_frac }}" \
            --hot-key-pool "${{ matrix.hot_key_pool }}" \
            --repeat 3 \
            --label "$RUN_ID")

          # Later steps key everything off this id, so fail here, with a
          # message that points at the real cause, if none came back.
          if [[ -z "$JOB_ID" ]]; then
            echo "iris job run did not return a job id" >&2
            exit 1
          fi

          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
          echo "Submitted job: $JOB_ID"

      - name: Wait for shuffle benchmark
        shell: bash -l {0}
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the value is never parsed as shell code.
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          set -euo pipefail

          # Consecutive polls that may come back without a state (CLI error,
          # unparsable output, or job not listed) before giving up. A single
          # transient failure should not abort a multi-hour benchmark.
          MAX_LOOKUP_FAILURES=5
          lookup_failures=0

          echo "Polling job status: $JOB_ID"
          while true; do
            # `--prefix` may match more than one job, so select the exact id.
            STATE=$(.venv/bin/iris --config="$IRIS_CONFIG" \
              job list --json --prefix "$JOB_ID" \
              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty') || STATE=""
            case "$STATE" in
              JOB_STATE_SUCCEEDED)
                echo "Job succeeded"
                exit 0
                ;;
              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
                lookup_failures=0
                echo "$(date -u +%H:%M:%S) Job state: $STATE"
                sleep 30
                ;;
              "")
                lookup_failures=$((lookup_failures + 1))
                if (( lookup_failures >= MAX_LOOKUP_FAILURES )); then
                  echo "Job not found: $JOB_ID"
                  exit 1
                fi
                echo "$(date -u +%H:%M:%S) No state for $JOB_ID (attempt $lookup_failures/$MAX_LOOKUP_FAILURES); retrying"
                sleep 30
                ;;
              *)
                # Any other state is treated as a terminal failure. Print only
                # the record of the job being polled, not every prefix match.
                echo "Job finished with state: $STATE"
                .venv/bin/iris --config="$IRIS_CONFIG" \
                  job list --json --prefix "$JOB_ID" \
                  | jq --arg id "$JOB_ID" '.[] | select(.job_id == $id) | {job_id, state, error}' || true
                exit 1
                ;;
            esac
          done

      - name: Print benchmark results
        if: success()
        shell: bash -l {0}
        env:
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          # Best effort: a missing RESULT line (or a failing logs call) prints
          # a notice instead of failing the step.
          .venv/bin/iris --config="$IRIS_CONFIG" \
            job logs "$JOB_ID" --max-lines 200 2>/dev/null \
            | grep "RESULT:" || echo "No RESULT lines found"

      - name: Capture failure diagnostics
        if: failure()
        shell: bash -l {0}
        env:
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          # If an earlier step failed before a job id was recorded there is
          # nothing to query; avoid calling iris with an empty id.
          if [[ -z "$JOB_ID" ]]; then
            echo "No job id was recorded; skipping iris diagnostics."
            exit 0
          fi
          echo "=== Job summary ==="
          .venv/bin/iris --config="$IRIS_CONFIG" \
            job summary "$JOB_ID" 2>/dev/null || true
          echo "=== Recent logs ==="
          .venv/bin/iris --config="$IRIS_CONFIG" \
            job logs "$JOB_ID" --max-lines 100 2>/dev/null | tail -60 || true

      # Runs even after failure or cancellation so the temporary key does not
      # stay registered; removal errors are ignored on purpose.
      - name: Remove OS Login SSH key
        if: always()
        run: |
          gcloud compute os-login ssh-keys remove \
            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
            --key-file ~/.ssh/google_compute_engine.pub || true
Loading
Loading