Skip to content

Commit b5ad330

Browse files
authored
Upgrade xpk from v0.10.1 to v0.13.0 (#1742)
- [x] Upgrade `xpk` from `v0.10.1` to `v0.13.0`
1 parent c3de4f1 commit b5ad330

File tree

3 files changed

+43
-3
lines changed

3 files changed

+43
-3
lines changed

.github/actions/gke-xpk/action.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ inputs:
7070
XPK_VERSION:
7171
description: 'XPK release tag'
7272
required: false
73-
default: 'v0.10.1'
73+
default: 'v0.13.0'
7474
type: string
7575
XPK_PYTHON:
7676
description: 'Python version for XPK'
@@ -118,7 +118,6 @@ runs:
118118
- name: Apply XPK workload create patch
119119
shell: bash -x -e -u {0}
120120
run: |
121-
sed -i 's/{{ IMAGE_PULL_SECRET_NAME }}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch
122121
PATCH_PATH=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}
123122
ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD_NAME}/xpk
124123
@@ -171,7 +170,14 @@ runs:
171170
--scheduler=gke.io/topology-aware-auto
172171
)
173172
174-
if [[ "${{ inputs.XPK_VERSION }}" == "v0.10.1" ]]; then
173+
version_greater() {
174+
if [[ $(echo -e "$1\n$2" | sort -V | head -n 1) != "$1" ]]; then
175+
return 0
176+
fi
177+
return 1
178+
}
179+
180+
if version_greater "${{ inputs.XPK_VERSION }}" "v0.10.0"; then
175181
args+=(
176182
--docker-image-pull-secret=${{ inputs.IMAGE_PULL_SECRET_NAME }}
177183
--env="JAX_COORDINATOR_PORT=3389"
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py
2+
index 3e432f7..c20e8e4 100644
3+
--- a/src/xpk/core/blueprint/blueprint_generator.py
4+
+++ b/src/xpk/core/blueprint/blueprint_generator.py
5+
@@ -201,9 +201,13 @@ class BlueprintGenerator:
6+
"type": "nvidia-h100-mega-80gb",
7+
"count": 8,
8+
"gpu_driver_installation_config": {
9+
- "gpu_driver_version": "LATEST"
10+
+ # avoid LATEST: GPU driver forward compatibility is not supported
11+
+ # when a feature-branch driver release is in use
12+
+ "gpu_driver_version": "INSTALLATION_DISABLED"
13+
},
14+
}],
15+
+ # use private nodes in pool due to NVIDIA organization constraints
16+
+ "enable_private_nodes": True,
17+
"auto_upgrade": (
18+
True if capacity_type != CapacityType.FLEX_START else False
19+
),
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
diff --git a/src/xpk/core/workload_decorators/tcpxo_decorator.py b/src/xpk/core/workload_decorators/tcpxo_decorator.py
2+
index 3734f87..dc3b24a 100644
3+
--- a/src/xpk/core/workload_decorators/tcpxo_decorator.py
4+
+++ b/src/xpk/core/workload_decorators/tcpxo_decorator.py
5+
@@ -181,7 +181,9 @@ def update_gpu_containers(job_manifest):
6+
if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
7+
container.setdefault('env', [])
8+
container['env'].append(
9+
- {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
10+
+ # set the environment variable in the user's jobset container;
11+
+ # unless the container NCCL path is prepended, the host NCCL is used instead of the container NCCL (e.g. in NCCL tests)
12+
+ {'name': 'LD_LIBRARY_PATH', 'value': '/opt/nvidia/nccl/lib:/usr/local/nvidia/lib64'}
13+
)
14+
container['env'].append({
15+
'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY',

0 commit comments

Comments
 (0)