Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1328feb
Update xpk CLI command
aybchan Dec 23, 2025
5aab510
Enable GPU node private
aybchan Jan 6, 2026
1e06b07
Add nccl-installer nccl install disable patch
aybchan Jan 6, 2026
6bb47e9
Add modified nccl installer manifest
aybchan Jan 6, 2026
0eab4c8
Set tcpxo installer
aybchan Jan 6, 2026
0c32231
Add Dockerfile patch for image cluster toolkit
aybchan Jan 7, 2026
34a337d
Remove xpk patches
aybchan Jan 7, 2026
d0d44d5
Add comment
aybchan Jan 7, 2026
0f2f266
Add nccl-tcpxo-installer manifest example
aybchan Jan 7, 2026
1979da5
Remove work-around for GCP NCCL plugin compatibility
aybchan Jan 7, 2026
fdcc953
Add cluster GPU node availability check
aybchan Jan 8, 2026
d7fb0e6
Merge branch 'main' into aybchan/upgrade-xpk-v1.0.0
aybchan Jan 8, 2026
ca40c39
Add missing condition, handle empty JobSet exit
aybchan Jan 8, 2026
f4066e8
Skip clean up
aybchan Jan 8, 2026
6bd848a
Set location as region
aybchan Jan 8, 2026
62b90e4
Add cluster secret operation into GKE composite action
aybchan Jan 9, 2026
47337f9
Pass secrets to composite action
aybchan Jan 9, 2026
630807b
Set image pull secret name for JobSet
aybchan Jan 9, 2026
e0d315f
Set to check status of all jobset pods
aybchan Jan 9, 2026
67d047c
Fix output inheritance issue in nested composite action
aybchan Jan 9, 2026
88bc2a3
Test empty cluster
aybchan Jan 12, 2026
44b7700
Remove unset workload constraint, trigger on latest cluster
aybchan Jan 13, 2026
6755c84
Merge branch 'main' into aybchan/upgrade-xpk-v1.0.0
aybchan Jan 14, 2026
894c550
Merge branch 'aybchan/upgrade-xpk-v1.0.0' of github.com:NVIDIA/JAX-To…
aybchan Jan 14, 2026
fa2fbc5
Fix merge
aybchan Jan 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 160 additions & 105 deletions .github/actions/gke-xpk/action.yml

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions .github/actions/store-delete-k8s-ghcr/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@ runs:
shell: bash
id: token
run: |
echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT
- name: Delete GitHub Container Registry token
K8S_SECRET_NAME="${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
echo "token-name=${K8S_SECRET_NAME}" >> $GITHUB_OUTPUT
echo "K8S_SECRET_NAME=${K8S_SECRET_NAME}" >> ${GITHUB_ENV}

- name: Store and delete secret on Kubernetes cluster
uses: ./.github/actions/with-post-step
with:
main: |
# Store GitHub Container Registry token as Kubernetes secret
kubectl create secret generic \
${{ steps.token.outputs.token-name }} \
${K8S_SECRET_NAME} \
--from-file=.dockerconfigjson=$HOME/.docker/config.json \
--type=kubernetes.io/dockerconfigjson

post: |
kubectl delete secret ${{ steps.token.outputs.token-name }}
kubectl delete secret ${K8S_SECRET_NAME}
14 changes: 14 additions & 0 deletions .github/gke-workflow/xpk/v1.0.0/Dockerfile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
diff --git a/data/Dockerfile b/data/Dockerfile
index 95fd9a4..2820ce9 100644
--- a/data/Dockerfile
+++ b/data/Dockerfile
@@ -57,6 +57,9 @@ ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin

# Clone the Cluster Toolkit repository
RUN git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git /cluster-toolkit
+# (WIP) Point the nccl-tcpxo-installer manifest URL used by Cluster Toolkit at the modified copy in NVIDIA/JAX-Toolbox (without the --nccl-install flag).
+# Both URLs are pinned to commit SHAs; there is probably a nicer way to handle this than an in-place sed. '@' is the s-command delimiter because the URLs contain '/'.
+RUN sed -i 's@https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/39308db7574925ea3c14f9113fcf87f70a6fcc26/gpudirect-tcpxo/nccl-tcpxo-installer.yaml@https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/0eab4c8f4a445b1c71ee3d52ece6d0d3f2b5a28d/.github/gke-workflow/xpk/v1.0.0/nccl-tcpxo-installer.yaml@g' /cluster-toolkit/modules/compute/gke-node-pool/gpu_direct.tf

# Build the Cluster Toolkit
WORKDIR /cluster-toolkit
15 changes: 15 additions & 0 deletions .github/gke-workflow/xpk/v1.0.0/blueprint.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py
index 1f4c0c4..147997d 100644
--- a/src/xpk/core/blueprint/blueprint_generator.py
+++ b/src/xpk/core/blueprint/blueprint_generator.py
@@ -225,7 +225,9 @@ class BlueprintGenerator:
"type": "nvidia-h100-mega-80gb",
"count": 8,
"gpu_driver_installation_config": {
- "gpu_driver_version": "LATEST"
+ # Avoid LATEST: GPU driver forward compatibility is not supported
+ # when the installed driver is a feature-branch release.
+ "gpu_driver_version": "INSTALLATION_DISABLED"
},
}],
"auto_upgrade": (
67 changes: 67 additions & 0 deletions .github/gke-workflow/xpk/v1.0.0/nccl-tcpxo-installer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
# DaemonSet that runs the NCCL TCPX plugin installer once per matching GPU node.
#
# The init container runs `/scripts/container_entry.sh install` and then copies
# /var/lib/tcpx/lib64 into /usr/local/nvidia/lib64. Because the host directory
# /home/kubernetes/bin is mounted at /usr/local, the copied libraries end up on
# the host under /home/kubernetes/bin/nvidia/lib64.
#
# NOTE(review): this file is named nccl-tcpxo-installer.yaml, but every resource
# here is named nccl-tcpx-installer and the node affinity targets
# nvidia-h100-80gb, while the accompanying blueprint patch refers to
# nvidia-h100-mega-80gb. Confirm that this mismatch is intentional.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-tcpx-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-tcpx-installer
spec:
  selector:
    matchLabels:
      k8s-app: nccl-tcpx-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nccl-tcpx-installer
        k8s-app: nccl-tcpx-installer
    spec:
      priorityClassName: system-node-critical
      # Only schedule onto nodes whose accelerator label is nvidia-h100-80gb.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-80gb
      # A bare "Exists" toleration matches every taint.
      tolerations:
        - operator: Exists
      hostNetwork: true
      hostPID: true
      volumes:
        - name: var-lib
          hostPath:
            path: /var/lib
        # Declared but not mounted by any container in this manifest.
        - name: tcpx
          hostPath:
            path: /var/lib/tcpx
        - name: library-dir-host
          hostPath:
            path: /home/kubernetes/bin
      initContainers:
        - name: nccl-tcpx-installer
          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          volumeMounts:
            - name: var-lib
              mountPath: /var/lib
            - name: library-dir-host
              mountPath: /usr/local
          command:
            - /bin/sh
            - -c
          args:
            - |
              set -ex
              /scripts/container_entry.sh install
              mkdir -p /usr/local/nvidia/lib64
              cp -r /var/lib/tcpx/lib64/. /usr/local/nvidia/lib64
              echo "installation finishes"
      # Presumably keeps the pod alive after the init container completes, so
      # the DaemonSet does not restart the installer in a loop; confirm.
      containers:
        - name: pause
          image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
20 changes: 11 additions & 9 deletions .github/workflows/_create_gke_cluster_xpk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,17 @@ jobs:
if ! [ $CLUSTER_EXISTS = true ]; then
cd $HOME/xpk
source $HOME/.venv/bin/activate
python xpk.py cluster create \
--cluster ${{ inputs.CLUSTER_NAME }} \
--gke-version ${{ inputs.GKE_VERSION }} \
--device-type ${{ inputs.DEVICE_TYPE }} \
--num-nodes ${{ inputs.NUM_NODES }} \
--default-pool-cpu-machine-type=${{ inputs.DEFAULT_CPU_MACHINE }} \
--project=${{ inputs.GCP_PROJECT }} \
--reservation ${{ inputs.GCP_GCE_RESERVATION }} \
--zone ${{ inputs.GCP_ZONE }}
xpk cluster create --cluster ${{ inputs.CLUSTER_NAME }} \
--gke-version ${{ inputs.GKE_VERSION }} \
--device-type ${{ inputs.DEVICE_TYPE }} \
--num-nodes ${{ inputs.NUM_NODES }} \
--default-pool-cpu-machine-type=${{ inputs.DEFAULT_CPU_MACHINE }} \
--project=${{ inputs.GCP_PROJECT }} \
--reservation ${{ inputs.GCP_GCE_RESERVATION }} \
--zone ${{ inputs.GCP_ZONE }} \
--custom-cluster-arguments="--enable-private-nodes" \
--custom-nodepool-arguments="--enable-private-nodes" \
--private
else
echo "Cluster ${{ inputs.CLUSTER_NAME }} already exists, skipping creation"
fi
Expand Down
35 changes: 9 additions & 26 deletions .github/workflows/_test_maxtext_gke_xpk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: gke-a3mega

env:
WORKLOAD_NAME_PREFIX: gke-maxtext-train
WORKLOAD_PREFIX: gke-maxtext-train
MAXTEXT_MODEL: llama2-7b
MAXTEXT_ATTENTION_TYPE: cudnn_flash_te
MAXTEXT_REMAT_POLICY: minimal_flash
Expand All @@ -26,35 +26,18 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Login to nvcr.io Container Registry
uses: docker/login-action@v3
with:
registry: nvcr.io
username: $oauthtoken
password: ${{ secrets.NVCR_TOKEN }}

- name: K8s GHCR store and delete token
id: store-token
uses: ./.github/actions/store-delete-k8s-ghcr

- name: Run XPK workload on cluster
uses: ./.github/actions/gke-xpk
with:
IMAGE: ${{ env.MAXTEXT_IMAGE }}
IMAGE_PULL_SECRET_NAME: ${{ steps.store-token.outputs.token-name }}
WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }}
ENVS: |
JAX_COORDINATOR_PORT=3389;
JAX_COORDINATOR_ADDRESS=\$(JOBSET_NAME)-\$(REPLICATED_JOB_NAME)-0-0.\$(JOBSET_NAME):\$(JAX_COORDINATOR_PORT);
console=/dev/stdout;
WORKLOAD_PREFIX: ${{ env.WORKLOAD_PREFIX }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NVCR_TOKEN: ${{ secrets.NVCR_TOKEN }}
ENVS:
NCCL_NET_PLUGIN=/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so;
NCCL_TUNER_PLUGIN=none;
COMMAND: |

nsys-jax --capture-range=cudaProfilerApi
--capture-range-end=stop
-o /opt/output/profile.zip
Expand All @@ -75,5 +58,5 @@ jobs:
upload_all_profiler_results=true
skip_first_n_steps_for_profiler=3
profiler_steps=8' |&
tee /opt/output/output.log &> \${console};
tee /opt/output/output.log &> /dev/stdout;
EXIT_CODE=\$PIPESTATUS;
31 changes: 10 additions & 21 deletions .github/workflows/_test_nccl_gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
env:
BASE_IMAGE: ${{ needs.build-nccl-gke.outputs.DOCKER_TAG_FINAL }}
TEST_NAME: ${{ matrix.test }}
WORKLOAD_NAME_PREFIX: nccl-gke
WORKLOAD_PREFIX: nccl-gke
NHOSTS: 2
NCCL_MINBYTES: 8
NCCL_MAXBYTES: 16G
Expand All @@ -62,53 +62,42 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: K8s GHCR store and delete token
id: store-token
uses: ./.github/actions/store-delete-k8s-ghcr

- name: Set workload name prefix # due to 40 char limit
id: workload-name
run: |
TEST_NAME=$(echo "${{ matrix.test }}" | sed 's/_perf_mpi//g' | sed 's/_/-/g')
WORKLOAD_PREFIX="${{ env.WORKLOAD_NAME_PREFIX }}-${TEST_NAME}"
WORKLOAD_PREFIX="${WORKLOAD_PREFIX}-${TEST_NAME}"

echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT}

- name: Create NCCL test Services on cluster
run: |
SERVICE_MANIFEST=".github/gke-workflow/gke/nccl-svc-${WORKLOAD_NAME}-${{ matrix.test }}.yaml"
WORKLOAD_NAME="${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
# WORKLOAD must be assigned before SERVICE_MANIFEST: the shell expands ${WORKLOAD} at assignment time, so the previous order produced a manifest path with an empty workload component.
WORKLOAD="${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
SERVICE_MANIFEST=".github/gke-workflow/gke/nccl-svc-${WORKLOAD}-${{ matrix.test }}.yaml"
echo "SERVICE_MANIFEST=${SERVICE_MANIFEST}" >> ${GITHUB_ENV}

cat .github/gke-workflow/gke/nccl-svc.yml | yq '.spec.selector."jobset.sigs.k8s.io/jobset-name" = "'${WORKLOAD_NAME}'"' --yaml-output | tee ${SERVICE_MANIFEST}
cat .github/gke-workflow/gke/nccl-svc.yml | yq '.spec.selector."jobset.sigs.k8s.io/jobset-name" = "'${WORKLOAD}'"' --yaml-output | tee ${SERVICE_MANIFEST}
kubectl apply -f ${SERVICE_MANIFEST}

- name: Run XPK workload on cluster
uses: ./.github/actions/gke-xpk
with:
IMAGE: ${{ env.BASE_IMAGE }}
IMAGE_PULL_SECRET_NAME: ${{ steps.store-token.outputs.token-name }}
WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
WORKLOAD_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NVCR_TOKEN: ${{ secrets.NVCR_TOKEN }}
ENVS: |
JAX_COORDINATOR_PORT=3389;
JAX_COORDINATOR_ADDRESS=\$(JOBSET_NAME)-\$(REPLICATED_JOB_NAME)-0-0.\$(JOBSET_NAME):\$(JAX_COORDINATOR_PORT);
NHOSTS=${{ env.NHOSTS }};
NCCL_LIB_DIR=/opt/nvidia/nccl/lib;
SCRIPT_DIR=/scripts;
NCCL_MINBYTES=${{ env.NCCL_MINBYTES }};
NCCL_MAXBYTES=${{ env.NCCL_MAXBYTES }};
NCCL_STEPFACTOR=${{ env.NCCL_STEPFACTOR }};
NCCL_ITERS=${{ env.NCCL_ITERS }};
console=/dev/stdout;
COMMAND: |
service ssh restart;
console=/dev/stdout;

declare -a hosts=('nccl-test-host-1' 'nccl-test-host-2');

/scripts/nccl-test-launch.sh ${{ matrix.test }} \${hosts[@]} |&
Expand Down
Loading