Skip to content

Commit 0f88fa8

Browse files
authored
Merge pull request #2208 from nebius/SCHED-411/use-native-pyxis
SCHED-411: use native pyxis
2 parents 6bf4f65 + 03a66fd commit 0f88fa8

File tree

15 files changed

+122
-27
lines changed

15 files changed

+122
-27
lines changed

api/v1/slurmcluster_types.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ type SlurmClusterSpec struct {
111111
// PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`.
112112
//
113113
// +kubebuilder:validation:Optional
114-
// +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } }
114+
// +kubebuilder:default={ pyxis: { required: true, importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } }
115115
PlugStackConfig PlugStackConfig `json:"plugStackConfig,omitempty"`
116116

117117
// SConfigController defines the desired state of controller that watches after configs
@@ -214,7 +214,7 @@ type PlugStackConfig struct {
214214
// Pyxis represents the 'Pyxis' SPANK plugin configuration.
215215
//
216216
// +kubebuilder:validation:Optional
217-
// +kubebuilder:default={ required: true, containerImageSave: "/var/cache/enroot-container-images/" }
217+
// +kubebuilder:default={ required: true, importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh" }
218218
Pyxis PluginConfigPyxis `json:"pyxis,omitempty"`
219219

220220
// NcclDebug represents the 'NCCL Debug' SPANK plugin configuration.
@@ -239,6 +239,14 @@ type PluginConfigPyxis struct {
239239
// +kubebuilder:default=true
240240
Required *bool `json:"required,omitempty"`
241241

242+
// ImporterPath is the path to the executable that implements the pyxis importer extension.
243+
// The file must be executable by every Slurm user.
244+
// More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
245+
//
246+
// +kubebuilder:validation:Optional
247+
// +kubebuilder:default="/opt/slurm_scripts/pyxis_caching_importer.sh"
248+
ImporterPath string `json:"importerPath,omitempty"`
249+
242250
// ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
243251
// If the specified file or directory already exists, it will be reused.
244252
// If the path does not exist, it will be created.
@@ -248,7 +256,7 @@ type PluginConfigPyxis struct {
248256
// If the option argument is empty (""), SquashFS files will not be stored.
249257
//
250258
// +kubebuilder:validation:Optional
251-
// +kubebuilder:default="/var/cache/enroot-container-images/"
259+
// +kubebuilder:deprecation:warning="The ContainerImageSave field is deprecated and will be removed in a future release"
252260
ContainerImageSave string `json:"containerImageSave,omitempty"`
253261
}
254262

config/crd/bases/slurm.nebius.ai_slurmclusters.yaml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,7 +1231,7 @@ spec:
12311231
outputToStdOut: false
12321232
required: false
12331233
pyxis:
1234-
containerImageSave: /var/cache/enroot-container-images/
1234+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
12351235
required: true
12361236
description: PlugStackConfig represents the Plugin stack configurations
12371237
in `plugstack.conf`.
@@ -1322,12 +1322,11 @@ spec:
13221322
type: object
13231323
pyxis:
13241324
default:
1325-
containerImageSave: /var/cache/enroot-container-images/
1325+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
13261326
required: true
13271327
description: Pyxis represents the 'Pyxis' SPANK plugin configuration.
13281328
properties:
13291329
containerImageSave:
1330-
default: /var/cache/enroot-container-images/
13311330
description: |-
13321331
ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
13331332
If the specified file or directory already exists, it will be reused.
@@ -1337,6 +1336,13 @@ spec:
13371336
If the image name contains '/', a nested directory will be created under the specified path (if it is a directory).
13381337
If the option argument is empty (""), SquashFS files will not be stored.
13391338
type: string
1339+
importerPath:
1340+
default: /opt/slurm_scripts/pyxis_caching_importer.sh
1341+
description: |-
1342+
Path to the executable for pyxis importer extension.
1343+
File should be available to execute for every user in Slurm.
1344+
More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
1345+
type: string
13401346
required:
13411347
default: true
13421348
description: |-
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
2+
# Patched version of https://github.com/NVIDIA/pyxis/blob/v0.23.0/importers/caching_importer.sh
3+
4+
set -euo pipefail
5+
6+
readonly cmd="$1"
7+
8+
readonly cache_dir="${ENROOT_CONTAINER_IMAGES_CACHE_DIR:-/var/cache/enroot-container-images}"
9+
readonly squashfs_temp_path="${cache_dir}/${SLURM_JOB_ID}.${SLURM_STEP_ID}.sqsh"
10+
11+
# Since it's not an ephemeral squashfs file, we can use compression.
12+
export ENROOT_SQUASH_OPTIONS="-comp zstd -Xcompression-level 3 -b 1M"
13+
14+
case "${cmd}" in
15+
get)
16+
if [ $# -ne 2 ]; then
17+
echo "usage: $0 get URI" >&2
18+
exit 1
19+
fi
20+
21+
readonly image_uri="$2"
22+
23+
mkdir -p -m 700 "${cache_dir}"
24+
25+
readonly digest=$(enroot digest "${image_uri}")
26+
if [ -z "${digest}" ]; then
27+
echo "error: could not retrieve digest for image: ${image_uri}" >&2
28+
exit 1
29+
fi
30+
readonly squashfs_path="${cache_dir}/${digest}.sqsh"
31+
32+
if [ ! -e "${squashfs_path}" ]; then
33+
# TODO: use `digest` approach once 406 Not Acceptable is tolerated in enroot
34+
# https://github.com/NVIDIA/enroot/pull/263
35+
# if [[ "${image_uri}" == *"@${digest}" ]]; then
36+
# # URI already has the digest in it.
37+
# enroot import --output "${squashfs_temp_path}" "${image_uri}" >&2
38+
# else
39+
# # Add the digest to the URI.
40+
# enroot import --output "${squashfs_temp_path}" "${image_uri}@${digest}" >&2
41+
# fi
42+
enroot import --output "${squashfs_temp_path}" "${image_uri}" >&2
43+
44+
# Save the URI as an extended attribute.
45+
if command -v "setfattr" >/dev/null; then
46+
setfattr -n user.image_uri -v "${image_uri}" "${squashfs_temp_path}"
47+
fi
48+
49+
chmod 777 "${squashfs_temp_path}"
50+
mv -n "${squashfs_temp_path}" "${squashfs_path}"
51+
fi
52+
53+
# Output the squashfs path on stdout for pyxis to read
54+
echo "${squashfs_path}"
55+
;;
56+
release)
57+
if [ $# -ne 1 ]; then
58+
echo "usage: $0 release" >&2
59+
exit 1
60+
fi
61+
62+
# Remove temporary file if still present (e.g. "get" was interrupted)
63+
rm -f "${squashfs_temp_path}"
64+
;;
65+
*)
66+
echo "error: unknown command: ${cmd}" >&2
67+
exit 1
68+
;;
69+
esac

helm/slurm-cluster/templates/slurm-cluster-cr.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ spec:
9494
- name: slurm-scripts
9595
configMap:
9696
name: slurm-scripts
97-
defaultMode: 500
97+
defaultMode: 0755
9898
{{- range .Values.volumeSources }}
9999
- name: {{ .name | quote }}
100100
{{- omit . "name" "createPVC" "storageClassName" "size" | toYaml | nindent 6 }}

helm/slurm-cluster/templates/slurm-scripts-cm.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
release: {{ .Release.Name }}
99
data:
1010
# base scripts
11-
{{- range $name := list "check_runner.py" "prolog.sh" "epilog.sh" "hc_program.sh" }}
11+
{{- range $name := list "check_runner.py" "prolog.sh" "epilog.sh" "hc_program.sh" "pyxis_caching_importer.sh" }}
1212
{{- $content := index $.Values.slurmScripts $name }}
1313
{{ $name }}: |-
1414
{{- if $content }}

helm/slurm-cluster/tests/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ These tests verify the following kubebuilder default values:
5252

5353
### PlugStackConfig
5454
- `pyxis.required: true`
55-
- `pyxis.containerImageSave: "/var/cache/enroot-container-images/"`
55+
- `pyxis.importerPath: "/opt/slurm_scripts/pyxis_caching_importer.sh"`
5656
- `ncclDebug.required: false`
5757
- `ncclDebug.enabled: false`
5858
- `ncclDebug.logLevel: "INFO"`

helm/slurm-cluster/tests/default-values_test.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ tests:
7979
path: spec.plugStackConfig.pyxis.required
8080
value: true
8181
- equal:
82-
path: spec.plugStackConfig.pyxis.containerImageSave
83-
value: "/var/cache/enroot-container-images/"
82+
path: spec.plugStackConfig.pyxis.importerPath
83+
value: "/opt/slurm_scripts/pyxis_caching_importer.sh"
8484
- equal:
8585
path: spec.plugStackConfig.ncclDebug.required
8686
value: false

helm/slurm-cluster/values.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ mpiConfig:
158158
plugStackConfig:
159159
pyxis:
160160
required: true
161-
containerImageSave: "/var/cache/enroot-container-images/"
161+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
162162
ncclDebug:
163163
required: false
164164
enabled: true
@@ -520,6 +520,7 @@ slurmScripts:
520520
prolog.sh: null
521521
epilog.sh: null
522522
hc_program.sh: null
523+
pyxis_caching_importer.sh: null
523524
# Built-in scripts can be overridden or disabled here
524525
# Keys must match file names in slurm_scripts/
525526
builtIn:

helm/soperator-crds/templates/slurmcluster-crd.yaml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27447,7 +27447,7 @@ spec:
2744727447
outputToStdOut: false
2744827448
required: false
2744927449
pyxis:
27450-
containerImageSave: /var/cache/enroot-container-images/
27450+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
2745127451
required: true
2745227452
description: PlugStackConfig represents the Plugin stack configurations
2745327453
in `plugstack.conf`.
@@ -27538,12 +27538,11 @@ spec:
2753827538
type: object
2753927539
pyxis:
2754027540
default:
27541-
containerImageSave: /var/cache/enroot-container-images/
27541+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
2754227542
required: true
2754327543
description: Pyxis represents the 'Pyxis' SPANK plugin configuration.
2754427544
properties:
2754527545
containerImageSave:
27546-
default: /var/cache/enroot-container-images/
2754727546
description: |-
2754827547
ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
2754927548
If the specified file or directory already exists, it will be reused.
@@ -27553,6 +27552,13 @@ spec:
2755327552
If the image name contains '/', a nested directory will be created under the specified path (if it is a directory).
2755427553
If the option argument is empty (""), SquashFS files will not be stored.
2755527554
type: string
27555+
importerPath:
27556+
default: /opt/slurm_scripts/pyxis_caching_importer.sh
27557+
description: |-
27558+
Path to the executable for pyxis importer extension.
27559+
File should be available to execute for every user in Slurm.
27560+
More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
27561+
type: string
2755627562
required:
2755727563
default: true
2755827564
description: |-

helm/soperator/crds/slurmcluster-crd.yaml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27447,7 +27447,7 @@ spec:
2744727447
outputToStdOut: false
2744827448
required: false
2744927449
pyxis:
27450-
containerImageSave: /var/cache/enroot-container-images/
27450+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
2745127451
required: true
2745227452
description: PlugStackConfig represents the Plugin stack configurations
2745327453
in `plugstack.conf`.
@@ -27538,12 +27538,11 @@ spec:
2753827538
type: object
2753927539
pyxis:
2754027540
default:
27541-
containerImageSave: /var/cache/enroot-container-images/
27541+
importerPath: /opt/slurm_scripts/pyxis_caching_importer.sh
2754227542
required: true
2754327543
description: Pyxis represents the 'Pyxis' SPANK plugin configuration.
2754427544
properties:
2754527545
containerImageSave:
27546-
default: /var/cache/enroot-container-images/
2754727546
description: |-
2754827547
ContainerImageSave represents an absolute path to the file or directory where SquashFS files will be stored.
2754927548
If the specified file or directory already exists, it will be reused.
@@ -27553,6 +27552,13 @@ spec:
2755327552
If the image name contains '/', a nested directory will be created under the specified path (if it is a directory).
2755427553
If the option argument is empty (""), SquashFS files will not be stored.
2755527554
type: string
27555+
importerPath:
27556+
default: /opt/slurm_scripts/pyxis_caching_importer.sh
27557+
description: |-
27558+
Path to the executable for pyxis importer extension.
27559+
File should be available to execute for every user in Slurm.
27560+
More docs: https://github.com/NVIDIA/pyxis/tree/v0.23.0/importers.
27561+
type: string
2755627562
required:
2755727563
default: true
2755827564
description: |-

0 commit comments

Comments
 (0)