File tree Expand file tree Collapse file tree 3 files changed +43
-3
lines changed Expand file tree Collapse file tree 3 files changed +43
-3
lines changed Original file line number Diff line number Diff line change @@ -70,7 +70,7 @@ inputs:
7070 XPK_VERSION :
7171 description : ' XPK release tag'
7272 required : false
73- default : ' v0.10.1 '
73+ default : ' v0.13.0 '
7474 type : string
7575 XPK_PYTHON :
7676 description : ' Python version for XPK'
@@ -118,7 +118,6 @@ runs:
118118 - name : Apply XPK workload create patch
119119 shell : bash -x -e -u {0}
120120 run : |
121- sed -i 's/{{ IMAGE_PULL_SECRET_NAME }}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch
122121 PATCH_PATH=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}
123122 ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD_NAME}/xpk
124123
@@ -171,7 +170,14 @@ runs:
171170 --scheduler=gke.io/topology-aware-auto
172171 )
173172
174- if [[ "${{ inputs.XPK_VERSION }}" == "v0.10.1" ]]; then
173+ version_greater() {
174+ if [[ $(echo -e "$1\n$2" | sort -V | head -n 1) != "$1" ]]; then
175+ return 0
176+ fi
177+ return 1
178+ }
179+
180+ if version_greater "${{ inputs.XPK_VERSION }}" "v0.10.0"; then
175181 args+=(
176182 --docker-image-pull-secret=${{ inputs.IMAGE_PULL_SECRET_NAME }}
177183 --env="JAX_COORDINATOR_PORT=3389"
Original file line number Diff line number Diff line change 1+ diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py
2+ index 3e432f7..c20e8e4 100644
3+ --- a/src/xpk/core/blueprint/blueprint_generator.py
4+ +++ b/src/xpk/core/blueprint/blueprint_generator.py
5+ @@ -201,9 +201,13 @@ class BlueprintGenerator:
6+ "type": "nvidia-h100-mega-80gb",
7+ "count": 8,
8+ "gpu_driver_installation_config": {
9+ - "gpu_driver_version": "LATEST"
10+ + # avoid LATEST: GPU driver forward-compatibility support is not
11+ + # available when a feature-branch driver release is in use
12+ + "gpu_driver_version": "INSTALLATION_DISABLED"
13+ },
14+ }],
15+ + # use private nodes in pool due to NVIDIA organization constraints
16+ + "enable_private_nodes": True,
17+ "auto_upgrade": (
18+ True if capacity_type != CapacityType.FLEX_START else False
19+ ),
Original file line number Diff line number Diff line change 1+ diff --git a/src/xpk/core/workload_decorators/tcpxo_decorator.py b/src/xpk/core/workload_decorators/tcpxo_decorator.py
2+ index 3734f87..dc3b24a 100644
3+ --- a/src/xpk/core/workload_decorators/tcpxo_decorator.py
4+ +++ b/src/xpk/core/workload_decorators/tcpxo_decorator.py
5+ @@ -181,7 +181,9 @@ def update_gpu_containers(job_manifest):
6+ if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
7+ container.setdefault('env', [])
8+ container['env'].append(
9+ - {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
10+ + # set the environment variable in the user's jobset container;
11+ + # unless the container NCCL path is prepended, the host NCCL is used instead of the container NCCL (e.g. in NCCL tests)
12+ + {'name': 'LD_LIBRARY_PATH', 'value': '/opt/nvidia/nccl/lib:/usr/local/nvidia/lib64'}
13+ )
14+ container['env'].append({
15+ 'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY',
You can’t perform that action at this time.
0 commit comments