---
# Composite action: launch a JobSet workload on a GKE cluster via XPK,
# wait for it to start, stream its logs, propagate its exit code back to
# the runner, and upload artifacts to GCS and GitHub Actions.
name: Launch workload on GKE with XPK

description: "Launch a JobSet workload on GKE with XPK. Upload artifacts from container to GCS and GitHub Actions."

inputs:
  GCP_PROJECT:
    description: 'GCP project ID'
    required: false
    default: nv-jaxtoolboxgcp-20240925
    type: string
  GKE_CLUSTER:
    description: 'GKE cluster name'
    required: false
    default: jtb-2025-06-12
    type: string
  GCP_ZONE:
    description: 'GCP zone of the cluster'
    required: false
    default: us-central1-a
    type: string
  CLUSTER_DEVICE:
    description: 'GPU device type in the cluster'
    required: false
    default: h100-mega-80gb-8
    type: string
  NUM_NODES:
    description: 'Number of nodes to use in JobSet (n.b each a3-megagpu-8g node has 8xGPU)'
    required: false
    # Quoted so the value stays a string, matching the declared type.
    default: '2'
    type: string
  MAIN_CONTAINER_NAME:
    description: 'Name of the main container in an XPK JobSet (fixed)'
    required: false
    default: gpu-image
    type: string
  CONTAINER_OUTPUT_PATH:
    description: 'Output directory for artifacts'
    required: false
    default: /opt/output
    type: string
  GCS_BUCKET:
    description: 'GCS bucket to which CI output artifacts will be uploaded'
    required: false
    default: jaxtoolbox-ci
    type: string
  IMAGE:
    description: 'URI of image to use in JobSet'
    required: false
    default: 'ghcr.io/nvidia/jax:latest'
    type: string
  COMMAND:
    description: 'Command to run in main container on JobSet start up'
    required: false
    default: 'nvidia-smi; free -h;'
    type: string
  EXIT_COMMAND:
    description: 'Command to set exit code'
    required: false
    # \$ keeps a literal $EXIT_CODE so it is expanded inside the container
    # at run time, not when the command string is assembled on the runner.
    default: 'exit \$EXIT_CODE'
    type: string
  WORKLOAD_NAME_PREFIX:
    description: 'Workload name prefix for XPK, also used to name uploaded artifact'
    required: false
    default: 'xpk'
    type: string
  XPK_VERSION:
    description: 'XPK release tag'
    required: false
    default: 'v0.8.0'
    type: string
  XPK_PYTHON:
    description: 'Python version for XPK'
    required: false
    default: '3.12.10'
    type: string

runs:
  using: 'composite'
  steps:

    - name: Set workload name
      shell: bash -x -e -u {0}
      run: |
        # Unique per run+attempt so retries do not collide on the cluster or in GCS.
        WORKLOAD_NAME="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
        DATE=$(date +'%Y-%m-%d')
        GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME}"

        echo "WORKLOAD_NAME=${WORKLOAD_NAME}" >> ${GITHUB_ENV}
        echo "DATE=${DATE}" >> ${GITHUB_ENV}
        echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV}

    - name: Setup environment
      shell: bash -x -e -u {0}
      run: |
        mkdir -p ${WORKLOAD_NAME}
        uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME}
        source ${WORKLOAD_NAME}/.venv/bin/activate

        # install xpk
        git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME}/xpk

        # Rewrite the XPK Makefile so it installs with uv instead of bare pip.
        sed 's@pip install \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME}/xpk/Makefile
        cd ${WORKLOAD_NAME}/xpk && sudo make install; cd -

    - name: Show environment
      shell: bash -x -e -u {0}
      run: |
        gcloud version

        source ${WORKLOAD_NAME}/.venv/bin/activate
        python --version
        xpk version

    - name: Apply XPK workload create patch
      shell: bash -x -e -u {0}
      run: |
        git apply --unsafe-paths .github/gke-workflow/xpk/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk
        git apply --unsafe-paths .github/gke-workflow/xpk/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk
        git apply --unsafe-paths .github/gke-workflow/xpk/workload.patch --directory ${WORKLOAD_NAME}/xpk

    - name: Set workload commands
      shell: bash -x -e -u {0}
      run: |
        # Installed inside the container before the user command runs.
        PRELUDE="
        apt install -y ripgrep > /dev/null;
        curl -LO https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz;
        tar xf google-cloud-cli-linux-x86_64.tar.gz;
        ./google-cloud-sdk/install.sh --quiet > /dev/null;
        ./google-cloud-sdk/bin/gcloud init;

        mkdir -p /usr/share/workload;
        mkdir -p ${{ inputs.CONTAINER_OUTPUT_PATH }};
        "

        # Runs after the user command: upload per-node artifacts, then emit the
        # exit code (EXIT_COMMAND's \$ is resolved inside the container).
        POSTLUDE="
        ./google-cloud-sdk/bin/gsutil cp -r ${{ inputs.CONTAINER_OUTPUT_PATH }}/ ${GCS_ARTIFACT_PATH}/node-0\$NODE_RANK;
        ${{ inputs.EXIT_COMMAND }}
        "

        CMD="${{ inputs.COMMAND }}"

        # set container commands in-line
        PRELUDE=$(echo ${PRELUDE} | sed 's/\n/\ /g')
        POSTLUDE=$(echo ${POSTLUDE} | sed 's/\n/\ /g')
        CMD=$(echo ${CMD} | sed 's/\n/\ /g')

        echo "PRELUDE=${PRELUDE}" >> ${GITHUB_ENV}
        echo "CMD=${CMD}" >> ${GITHUB_ENV}
        echo "POSTLUDE=${POSTLUDE}" >> ${GITHUB_ENV}

    - name: Create workload on cluster with XPK
      shell: bash -x -e -u {0}
      run: |
        source ${WORKLOAD_NAME}/.venv/bin/activate
        cd ${WORKLOAD_NAME}/xpk
        python xpk.py workload create \
          --project ${{ inputs.GCP_PROJECT }} \
          --cluster ${{ inputs.GKE_CLUSTER }} \
          --zone ${{ inputs.GCP_ZONE }} \
          --workload ${WORKLOAD_NAME} \
          --docker-image ${{ inputs.IMAGE }} \
          --device-type ${{ inputs.CLUSTER_DEVICE }} \
          --num-nodes ${{ inputs.NUM_NODES }} \
          --num-slices ${{ inputs.NUM_NODES }} \
          --priority=high \
          --scheduler=gke.io/topology-aware-auto \
          --command "${PRELUDE} ${CMD} ${POSTLUDE}"

    - name: Wait for JobSet to unsuspend on cluster
      shell: bash -u {0}
      env:
        POLL_TIMEOUT: "3600"
      run: |
        START=$(date +%s)
        JOBSET_ACTIVE=false
        # JOBSET_ACTIVE is "true"/"false" text from jq, executed as a command.
        while ! ${JOBSET_ACTIVE} || [ -z "${JOBSET_ACTIVE}" ]; do
          JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1')
          NOW=$(date +%s)
          ELAPSED=$(( NOW - START ))
          if (( ELAPSED > POLL_TIMEOUT )) ; then
            echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
            exit 1
          fi
          echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
          sleep 5
        done

        echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"

    - name: Set JobSet Pod name
      shell: bash -u {0}
      run: |
        # First pod (sorted) of the JobSet; used below for readiness and exit code.
        echo "POD=$(kubectl get pods -o json | jq -r '.items[] | select(.metadata.labels."'jobset.sigs.k8s.io/jobset-name'" == "'${WORKLOAD_NAME}'") | .metadata.name ' | sort | head -n1 )" >> ${GITHUB_ENV}

    - name: Wait for JobSet Pod readiness
      shell: bash -u {0}
      run: |
        POD_READY=false
        while ! ${POD_READY} || [ -z "${POD_READY}" ]; do
          echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
          sleep 10

          # Bail out early if the main container already terminated with an error.
          POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
          if ${POD_ERROR} ; then
            echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
            break
          fi

          POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
        done;

    - name: Stream logs from JobSet Pods
      shell: bash -u {0}
      run: |
        jobset_pods=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' '))

        # One background log stream per pod; wait for all of them to finish.
        for jobset_pod in ${jobset_pods[@]}; do
          kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c ${{ inputs.MAIN_CONTAINER_NAME }} ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}-${jobset_pod}-jobset.log &
        done
        wait < <(jobs -p)

    - name: Set exit code from JobSet logs
      shell: bash -u {0}
      run: |
        # The container's EXIT_COMMAND prints "EXIT_CODE=<n>" as the last log line
        # (third field after the kubectl prefix and timestamp).
        MAYBE_XPK_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}-${POD}-jobset.log | awk '{ print $3 }' )"
        echo ${MAYBE_XPK_EXIT_CODE} | grep -E 'EXIT_CODE=[0-9]+$'

        if [ $? -ne 0 ]; then
          echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
          exit 1
        fi

        # Re-export EXIT_CODE=<n> locally and make it this step's exit status.
        eval "export ${MAYBE_XPK_EXIT_CODE}"
        exit ${EXIT_CODE}

    - name: Clean up JobSet from cluster
      shell: bash -x -u {0}
      if: ${{ always() }}
      run: |
        kubectl delete jobset --wait ${WORKLOAD_NAME} || echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}"

    - name: Download artifacts from GCS to runner
      shell: bash -x -u {0}
      run: |
        mkdir -p output/${WORKLOAD_NAME}
        mv ${WORKLOAD_NAME}-*.log output/${WORKLOAD_NAME}
        gsutil cp -r ${GCS_ARTIFACT_PATH} output/${WORKLOAD_NAME}

    - name: Upload artifacts to GitHub Actions from runner
      uses: actions/upload-artifact@v4
      with:
        name: ${{ inputs.WORKLOAD_NAME_PREFIX }}
        path: output/${{ env.WORKLOAD_NAME }}/*

    - name: Clean up GCS artifacts from runner
      shell: bash -x -u {0}
      if: ${{ always() }}
      run: |
        rm -rf output/${WORKLOAD_NAME}

    - name: Clean up xpk environment from runner
      shell: bash -x -u {0}
      if: ${{ always() }}
      run: |
        sudo rm -rf ${WORKLOAD_NAME}