Skip to content

Commit efb11b7

Browse files
aybchan and olupton authored
Add GKE example (#1481)
Add `GKE` `MaxText` train ([example run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15744603099/job/44379358307)) and `NCCL` test ([example run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15744603099/job/44378422712)) workflows with reusable composite action for managing `xpk` job lifecycle (launch, logs streaming, clean up, artifact upload). Patches on `xpk` address the following identified issues: - AI-Hypercomputer/xpk#476 - AI-Hypercomputer/xpk#488 - AI-Hypercomputer/xpk#490 - AI-Hypercomputer/xpk#491 - AI-Hypercomputer/xpk#492 Cluster create with `xpk` ([example run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15591134618/job/43910254644#step:5:1)) - added as a separate [workflow](https://github.com/NVIDIA/JAX-Toolbox/pull/1481/files#diff-801fc28cafbf1e0fa0ea521355fa8a1c9e6c01dcb8b1083c47f66e2ead4d560a) for demonstration purposes (will not be operational in the CI) --------- Co-authored-by: Olli Lupton <[email protected]>
1 parent 6349f72 commit efb11b7

17 files changed

+891
-1
lines changed

.github/actions/gke-xpk/action.yml

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
name: Launch workload on GKE with XPK

description: "Launch a JobSet workload on GKE with XPK. Upload artifacts from container to GCS and GitHub Actions."

# Every input is optional and falls back to the default shown here.
# NOTE(review): `type` is not one of the documented keys for action inputs
# (description / required / default / deprecationMessage); it is kept as
# documentation of intent only. Action inputs always arrive as strings, so
# defaults that look like numbers are quoted.
inputs:
  GCP_PROJECT:
    description: 'GCP project ID'
    default: nv-jaxtoolboxgcp-20240925
    required: false
    type: string
  GKE_CLUSTER:
    description: 'GKE cluster name'
    default: jtb-2025-06-12
    required: false
    type: string
  GCP_ZONE:
    description: 'GCP zone of the cluster'
    default: us-central1-a
    required: false
    type: string
  CLUSTER_DEVICE:
    description: 'GPU device type in the cluster'
    default: h100-mega-80gb-8
    required: false
    type: string
  NUM_NODES:
    description: 'Number of nodes to use in JobSet (n.b. each a3-megagpu-8g node has 8xGPU)'
    default: '2'
    required: false
    type: string
  MAIN_CONTAINER_NAME:
    description: 'Name of the main container in an XPK JobSet (fixed)'
    default: gpu-image
    required: false
    type: string
  CONTAINER_OUTPUT_PATH:
    description: 'Output directory for artifacts'
    default: /opt/output
    required: false
    type: string
  GCS_BUCKET:
    description: 'GCS bucket to which CI output artifacts will be uploaded'
    default: jaxtoolbox-ci
    required: false
    type: string
  IMAGE:
    description: 'URI of image to use in JobSet'
    required: false
    default: ghcr.io/nvidia/jax:latest
    type: string
  COMMAND:
    description: 'Command to run in main container on JobSet start up'
    required: false
    default: 'nvidia-smi; free -h;'
    type: string
  # The backslash is literal (single-quoted YAML); it keeps $EXIT_CODE from
  # being expanded on the runner when the value is spliced into a
  # double-quoted shell string.
  EXIT_COMMAND:
    description: 'Command to set exit code'
    required: false
    default: 'exit \$EXIT_CODE'
    type: string
  WORKLOAD_NAME_PREFIX:
    description: 'Workload name prefix for XPK, also used to name uploaded artifact'
    required: false
    default: 'xpk'
    type: string
  XPK_VERSION:
    description: 'XPK release tag'
    required: false
    default: 'v0.8.0'
    type: string
  XPK_PYTHON:
    description: 'Python version for XPK'
    required: false
    default: '3.12.10'
    type: string
75+
76+
# Composite action: the steps below run in order on the calling job's runner.
# Values are handed from one step to the next by appending to ${GITHUB_ENV}.
runs:
77+
using: 'composite'
78+
steps:
79+
80+
# Derives the per-run workload name, today's date and the GCS destination for
# artifacts, and exports all three to later steps through GITHUB_ENV.
- name: Set workload name
  shell: bash -x -e -u {0}
  run: |
    prefix="${{ inputs.WORKLOAD_NAME_PREFIX }}"
    today=$(date +'%Y-%m-%d')
    # Run id + attempt keep the name unique across re-runs of the same workflow.
    workload="${prefix}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    artifact_path="gs://${{ inputs.GCS_BUCKET }}/${prefix}/${today}/${workload}"

    {
      echo "WORKLOAD_NAME=${workload}"
      echo "DATE=${today}"
      echo "GCS_ARTIFACT_PATH=${artifact_path}"
    } >> ${GITHUB_ENV}
90+
91+
# Creates a uv-managed virtual environment (Python version from XPK_PYTHON)
# inside a directory named after the workload, shallow-clones the XPK release
# tag given by XPK_VERSION into it, rewrites the Makefile so `pip install .`
# is invoked through the absolute path of `uv`, then runs `sudo make install`.
# NOTE(review): the absolute uv path is presumably needed because sudo does not
# inherit the runner's PATH — confirm.
- name: Setup environment
92+
shell: bash -x -e -u {0}
93+
run: |
94+
mkdir -p ${WORKLOAD_NAME}
95+
uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME}
96+
source ${WORKLOAD_NAME}/.venv/bin/activate
97+
98+
# install xpk
99+
git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME}/xpk
100+
101+
sed 's@pip install \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME}/xpk/Makefile
102+
cd ${WORKLOAD_NAME}/xpk && sudo make install; cd -
103+
104+
# Diagnostic step: prints the gcloud version, then activates the workload's
# virtual environment and prints the Python and xpk versions it provides.
- name: Show environment
105+
shell: bash -x -e -u {0}
106+
run: |
107+
gcloud version
108+
109+
source ${WORKLOAD_NAME}/.venv/bin/activate
110+
python --version
111+
xpk version
112+
113+
# Applies three repository-local patches (tcpxo_decorator, docker_resources,
# workload) on top of the cloned XPK checkout; `--directory` prefixes the
# patch paths with the checkout location. With `-e` set, the step stops at the
# first patch that fails to apply.
# NOTE(review): `--unsafe-paths` is presumably required because the checkout
# is a nested repository rather than part of this working tree — confirm.
- name: Apply XPK workload create patch
114+
shell: bash -x -e -u {0}
115+
run: |
116+
git apply --unsafe-paths .github/gke-workflow/xpk/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk
117+
git apply --unsafe-paths .github/gke-workflow/xpk/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk
118+
git apply --unsafe-paths .github/gke-workflow/xpk/workload.patch --directory ${WORKLOAD_NAME}/xpk
119+
120+
# Builds the three shell fragments that are later concatenated into the
# container command, and exports them through GITHUB_ENV:
#   PRELUDE  - installs ripgrep and the Google Cloud CLI inside the container
#              and creates /usr/share/workload and the output directory.
#   CMD      - the caller-supplied COMMAND input.
#   POSTLUDE - copies the output directory to
#              ${GCS_ARTIFACT_PATH}/node-0<NODE_RANK> and then runs EXIT_COMMAND.
# `\$NODE_RANK` is escaped so it is expanded inside the container, not on the
# runner; ${GCS_ARTIFACT_PATH} is expanded here on the runner.
# NOTE(review): COMMAND / EXIT_COMMAND are spliced into double-quoted shell
# strings, so a value containing an unescaped double quote would break this
# step — confirm callers never pass one.
- name: Set workload commands
121+
shell: bash -x -e -u {0}
122+
run: |
123+
PRELUDE="
124+
apt install -y ripgrep > /dev/null;
125+
curl -LO https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz;
126+
tar xf google-cloud-cli-linux-x86_64.tar.gz;
127+
./google-cloud-sdk/install.sh --quiet > /dev/null;
128+
./google-cloud-sdk/bin/gcloud init;
129+
130+
mkdir -p /usr/share/workload;
131+
mkdir -p ${{ inputs.CONTAINER_OUTPUT_PATH }};
132+
"
133+
134+
POSTLUDE="
135+
./google-cloud-sdk/bin/gsutil cp -r ${{ inputs.CONTAINER_OUTPUT_PATH }}/ ${GCS_ARTIFACT_PATH}/node-0\$NODE_RANK;
136+
${{ inputs.EXIT_COMMAND }}
137+
"
138+
139+
CMD="${{ inputs.COMMAND }}"
140+
141+
# set container commands in-line
# The unquoted `echo ${VAR}` is what joins the lines: word splitting turns each
# newline into a single space. NOTE(review): the sed expression looks like a
# no-op, since sed never sees a newline inside a single input line — confirm.
142+
PRELUDE=$(echo ${PRELUDE} | sed 's/\n/\ /g')
143+
POSTLUDE=$(echo ${POSTLUDE} | sed 's/\n/\ /g')
144+
CMD=$(echo ${CMD} | sed 's/\n/\ /g')
145+
146+
echo "PRELUDE=${PRELUDE}" >> ${GITHUB_ENV}
147+
echo "CMD=${CMD}" >> ${GITHUB_ENV}
148+
echo "POSTLUDE=${POSTLUDE}" >> ${GITHUB_ENV}
149+
150+
# Submits the workload by running the patched `xpk.py workload create` from
# inside the checkout, using the workload's virtual environment. The container
# command is PRELUDE, CMD and POSTLUDE joined by spaces.
# NOTE(review): --num-slices is set to the same NUM_NODES input as --num-nodes
# — confirm this is intended rather than a copy/paste slip.
- name: Create workload on cluster with XPK
151+
shell: bash -x -e -u {0}
152+
run: |
153+
source ${WORKLOAD_NAME}/.venv/bin/activate
154+
cd ${WORKLOAD_NAME}/xpk
155+
python xpk.py workload create \
156+
--project ${{ inputs.GCP_PROJECT }} \
157+
--cluster ${{ inputs.GKE_CLUSTER }} \
158+
--zone ${{ inputs.GCP_ZONE }} \
159+
--workload ${WORKLOAD_NAME} \
160+
--docker-image ${{ inputs.IMAGE }} \
161+
--device-type ${{ inputs.CLUSTER_DEVICE }} \
162+
--num-nodes ${{ inputs.NUM_NODES }} \
163+
--num-slices ${{ inputs.NUM_NODES }} \
164+
--priority=high \
165+
--scheduler=gke.io/topology-aware-auto \
166+
--command "${PRELUDE} ${CMD} ${POSTLUDE}"
167+
168+
# Polls the cluster every 5 seconds until the first replicated job of the
# JobSet named ${WORKLOAD_NAME} reports `active == 1`, and fails the step once
# more than POLL_TIMEOUT (3600) seconds have elapsed.
# The shell runs without `-e`, so a failing kubectl/jq call leaves
# JOBSET_ACTIVE empty and the loop simply retries.
- name: Wait for JobSet to unsuspend on cluster
169+
shell: bash -u {0}
170+
env:
171+
POLL_TIMEOUT: 3600
172+
run: |
173+
START=$(date +%s)
174+
JOBSET_ACTIVE=false
# JOBSET_ACTIVE is executed as a command: `true` ends the loop, `false` keeps
# polling, and the `-z` test keeps polling while the query returned nothing.
175+
while ! ${JOBSET_ACTIVE} || [ -z ${JOBSET_ACTIVE} ]; do
176+
JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1')
177+
NOW=$(date +%s)
178+
ELAPSED=$(( NOW - START ))
179+
if (( ELAPSED > POLL_TIMEOUT )) ; then
180+
echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
181+
exit 1
182+
fi
183+
echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
184+
sleep 5
185+
done
186+
187+
echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
188+
189+
# Lists the pods labelled with this JobSet's name, sorts their names and
# exports the first one as POD for the following steps. POD is empty if no
# pod carries the label.
- name: Set JobSet Pod name
190+
shell: bash -u {0}
191+
run: |
192+
echo "POD=$(kubectl get pods -o json | jq -r '.items[] | select(.metadata.labels."'jobset.sigs.k8s.io/jobset-name'" == "'${WORKLOAD_NAME}'") | .metadata.name ' | sort | head -n1 )" >> ${GITHUB_ENV}
193+
194+
# Polls ${POD} every 10 seconds until its main container reports ready, or
# stops waiting early if that container has terminated with reason "Error".
# Breaking out does not fail the step: the following steps still stream the
# logs and derive the final exit code from them.
- name: Wait for JobSet Pod readiness
  shell: bash -u {0}
  run: |
    POD_READY=false
    # Compare against the literal "true" instead of executing the variable:
    # the jq queries print nothing while containerStatuses is not populated
    # yet, and an empty command exits 0, which would read as "true".
    while [ "${POD_READY}" != "true" ]; do
      echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
      sleep 10

      POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
      if [ "${POD_ERROR}" = "true" ]; then
        echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
        break
      fi

      POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
    done
210+
211+
# Follows the main container's logs of every pod in the JobSet in parallel.
# Each stream is echoed to the step output and appended to
# ${WORKLOAD_NAME}-<pod>-jobset.log on the runner; the step ends when all
# streams have closed.
- name: Stream logs from JobSet Pods
  shell: bash -u {0}
  run: |
    jobset_pods=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' '))

    for jobset_pod in "${jobset_pods[@]}"; do
      # Use the MAIN_CONTAINER_NAME input (default: gpu-image) so this step
      # follows the same container the readiness check inspects.
      kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c ${{ inputs.MAIN_CONTAINER_NAME }} ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}-${jobset_pod}-jobset.log &
    done
    # `wait` without arguments blocks until every background job has finished.
    wait
220+
221+
# Reads the last line of the log streamed from ${POD}, takes its third
# whitespace-separated field and expects it to be exactly `EXIT_CODE=<digits>`.
# The step exits with that code, or with 1 when the marker is missing.
# NOTE(review): fields one and two are presumably the pod prefix and timestamp
# added by `kubectl logs --prefix --timestamps` — confirm.
- name: Set exit code from JobSet logs
  shell: bash -u {0}
  run: |
    MAYBE_XPK_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}-${POD}-jobset.log | awk '{ print $3 }' )"

    # Anchor the pattern on both ends so that only a bare EXIT_CODE=<n> token
    # is accepted; anything else in the log must not reach the shell.
    if ! [[ "${MAYBE_XPK_EXIT_CODE}" =~ ^EXIT_CODE=[0-9]+$ ]]; then
      echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
      exit 1
    fi
    echo "${MAYBE_XPK_EXIT_CODE}"

    # Strip the `EXIT_CODE=` prefix rather than eval-ing log content.
    EXIT_CODE="${MAYBE_XPK_EXIT_CODE#EXIT_CODE=}"
    exit ${EXIT_CODE}
234+
235+
# Always runs, even after a failed step: deletes the JobSet and waits for the
# deletion to finish. The `||` branch keeps the step green when the delete
# fails.
# NOTE(review): the fallback message says the JobSet "does not exist", but it
# is printed for any kubectl failure — confirm that is acceptable.
- name: Clean up JobSet from cluster
236+
shell: bash -x -u {0}
237+
if: ${{ always() }}
238+
run: |
239+
kubectl delete jobset --wait ${WORKLOAD_NAME} || echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}"
240+
241+
# Collects everything to be uploaded under output/${WORKLOAD_NAME}: the log
# files matching ${WORKLOAD_NAME}-*.log in the working directory are moved
# there, and the GCS artifact directory is copied in recursively.
# NOTE(review): this step has no `if: always()`, so it is skipped when an
# earlier step fails — confirm that losing logs on failure is intended.
- name: Download artifacts from GCS to runner
242+
shell: bash -x -u {0}
243+
run: |
244+
mkdir -p output/${WORKLOAD_NAME}
245+
mv ${WORKLOAD_NAME}-*.log output/${WORKLOAD_NAME}
246+
gsutil cp -r ${GCS_ARTIFACT_PATH} output/${WORKLOAD_NAME}
247+
248+
# Uploads the contents of output/${WORKLOAD_NAME} as a workflow artifact named
# after WORKLOAD_NAME_PREFIX.
# NOTE(review): the artifact name does not include the run-specific workload
# name, so two invocations with the same prefix in one workflow run would
# presumably collide — confirm.
- name: Upload artifacts to GitHub Actions from runner
249+
uses: actions/upload-artifact@v4
250+
with:
251+
name: ${{ inputs.WORKLOAD_NAME_PREFIX }}
252+
path: output/${{ env.WORKLOAD_NAME }}/*
253+
254+
# Always runs: removes the local output/${WORKLOAD_NAME} directory from the
# runner. Nothing is deleted from the GCS bucket here.
- name: Clean up GCS artifacts from runner
255+
shell: bash -x -u {0}
256+
if: ${{ always() }}
257+
run: |
258+
rm -rf output/${WORKLOAD_NAME}
259+
260+
# Always runs: removes the per-workload directory that holds the virtual
# environment and the XPK checkout.
# NOTE(review): sudo is presumably needed because `sudo make install` left
# root-owned files in the checkout — confirm.
- name: Clean up xpk environment from runner
261+
shell: bash -x -u {0}
262+
if: ${{ always() }}
263+
run: |
264+
sudo rm -rf ${WORKLOAD_NAME}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Test image for the NCCL workflow: the supplied base image plus the helper
# scripts and an SSH server.
# BASE_IMAGE must be passed with --build-arg; it has no default.
ARG BASE_IMAGE
FROM ${BASE_IMAGE} AS mealkit
FROM mealkit AS final
COPY .github/gke-workflow/nccl/scripts /scripts
# apt-get is used for both commands: `apt` does not offer a stable CLI for
# scripts. The package lists are deliberately left in place.
# NOTE(review): the gke-xpk action's prelude runs `apt install` in the
# container without a preceding update, so it presumably relies on these
# lists — confirm before adding a clean-up.
RUN apt-get update \
    && apt-get install -y openssh-server
# SECURITY: enables password-less root SSH and disables host-key checking.
# NOTE(review): presumably acceptable only because the image is used for
# short-lived test pods — confirm it is never deployed elsewhere.
RUN passwd -d root && \
    echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
    echo "PermitEmptyPasswords yes" >> /etc/ssh/sshd_config && \
    echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \
    chmod +x /scripts/*
12+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Print the number of arguments received.
# Uses $# directly so that an argument containing whitespace or glob
# characters still counts as exactly one.
len() {
  echo "$#"
}
5+
6+
# Writes one hostfile per slot count into <script dir>/hostfiles<NHOSTS>/.
# Every hostfile lists each host given on the command line as
# "<host> port=22 slots=<n>".

# Absolute directory containing this script.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Slot counts for which a hostfile is generated.
NRANKS_FACTORS=(1 2 4 8)

NHOSTS=$(len "$@")
echo "generating hostfiles for ${NHOSTS} hosts: "
for host in "$@"; do
  echo "$host"
done

hostfile_dir="${SCRIPT_DIR}/hostfiles${NHOSTS}"
mkdir -p "${hostfile_dir}"

for slots in "${NRANKS_FACTORS[@]}"; do
  hostfile="${hostfile_dir}/hostfile${slots}"
  # Start from a fresh, empty file on every run.
  rm -f "${hostfile}"
  touch "${hostfile}"
  for host in "$@"; do
    echo "$host port=22 slots=${slots}" >> "${hostfile}"
  done
done
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Entry point for an NCCL benchmark run inside a JobSet pod.
#
# Usage: <script> BENCHMARK HOST...
#   BENCHMARK  benchmark name, forwarded to /scripts/test.sh as NCCL_BENCHMARK
#   HOST...    hosts taking part in the run
# Environment:
#   NHOSTS     forwarded to /scripts/test.sh (default: 2)
#   NODE_RANK  rank of this node; rank 0 drives the benchmark (required)
#
# Rank 0 waits until every host answers over SSH, runs the benchmark and then
# creates a completion flag file on every host. All other ranks wait for that
# flag file to appear.
BENCHMARK=$1
NHOSTS=${NHOSTS:-2}
shift

# Fail fast with a clear message: without NODE_RANK the rank test below cannot
# match and the script would wait for the completion flag indefinitely.
: "${NODE_RANK:?NODE_RANK must be set}"

/scripts/start_ssh.sh "$@";
pushd /scripts;

/scripts/generate_hostfiles.sh "$@";
popd;

COMPLETION_FLAG=/opt/output/${BENCHMARK}_done

service ssh restart

if [ "${NODE_RANK}" = 0 ] ; then
  for host in "$@"; do
    host_ready=false
    while ! $host_ready; do
      # Any SSH failure is folded into the string "unready".
      status=$(ssh "$host" echo "ready" 2> /dev/null || echo "unready")
      if [ "$status" = "ready" ]; then
        host_ready=true
        break
      fi
      echo "$host not ready"
      sleep 5
    done
    echo "$host ready"
  done

  NCCL_BENCHMARK=$BENCHMARK NHOSTS=$NHOSTS /scripts/test.sh

  # Release every host waiting in the else-branch below.
  for host in "$@"; do
    ssh "${host}" touch "${COMPLETION_FLAG}"
  done

else
  while [ ! -f "$COMPLETION_FLAG" ]; do sleep 10; done
fi
39+
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Opens one SSH connection to each host given on the command line and has the
# remote side print a confirmation message.
# PORT selects the SSH port (default: 22).
PORT=${PORT:-22}

for host in "$@"; do
  # An empty argument ends the list, just like running out of arguments.
  [[ -n $host ]] || break
  ssh -p "${PORT}" "$host" echo "Connected to ${host}"
done

0 commit comments

Comments
 (0)