diff --git a/docs/examples/kueue/kueue.md b/docs/examples/kueue/kueue.md index 0e991101..5cef4c1f 100644 --- a/docs/examples/kueue/kueue.md +++ b/docs/examples/kueue/kueue.md @@ -9,12 +9,17 @@ kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases kubectl apply -f charts/overrides/kueue/priority.yaml ``` -Run a kueue job: +Run a Job with kueue: ```bash ./bin/knavigator -workflow resources/workflows/kueue/test-job.yaml -cleanup ``` -Run a preemption workflow with kueue: +Run a preemption workflow with kueue: ```bash ./bin/knavigator -workflow resources/workflows/kueue/test-preemption.yaml -cleanup ``` + +Run a RayJob with kueue: +```bash +./bin/knavigator -workflow resources/workflows/kueue/test-rayjob.yaml -cleanup +``` diff --git a/resources/benchmarks/README.md b/resources/benchmarks/README.md index 25c66b9b..2e06df62 100644 --- a/resources/benchmarks/README.md +++ b/resources/benchmarks/README.md @@ -5,6 +5,7 @@ This directory contains benchmark tests for the following workload managers and - Kueue - Volcano - Yunikorn +- Kai - Run:ai The benchmark tests involve submitting workloads intended to evaluate the scheduler's performance under specific scenarios. 
diff --git a/resources/benchmarks/gang-scheduling/workflows/config-kai.yaml b/resources/benchmarks/gang-scheduling/workflows/config-kai.yaml index 0a164b8b..84b35cbc 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-kai.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/config-kai.yaml @@ -26,6 +26,13 @@ tasks: nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-[a-z0-9]+" podCount: "{{.replicas}}" +- id: register-lw + type: RegisterObj + params: + template: "resources/benchmarks/templates/kai/mpijob.yaml" + nameFormat: "job{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" + podCount: "{{.workers}} + 1" - id: default-queue type: SubmitObj params: diff --git a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml index 69f6f11b..1bcde33f 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml @@ -34,6 +34,20 @@ tasks: nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-[0-9]-.*" podCount: "{{.replicas}}" +#- id: register-lw +# type: RegisterObj +# params: +# template: "resources/benchmarks/templates/kueue/rayjob.yaml" +# nameFormat: "job{{._ENUM_}}" +# podNameFormat: "{{._NAME_}}-raycluster-.*" +# podCount: "{{.workers}} + 1" +- id: register-lw + type: RegisterObj + params: + template: "resources/benchmarks/templates/kueue/mpijob.yaml" + nameFormat: "job{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" + podCount: "{{.workers}} + 1" - id: create-resource-flavor type: SubmitObj params: diff --git a/resources/benchmarks/gang-scheduling/workflows/config-runai.yaml b/resources/benchmarks/gang-scheduling/workflows/config-runai.yaml new file mode 100644 index 00000000..1ba65c8f --- /dev/null +++ b/resources/benchmarks/gang-scheduling/workflows/config-runai.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: config-runai +tasks: +- id: register + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/trainingworkload.yaml" + nameFormat: "twl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-0-0" + podCount: 1 +- id: register-lw + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/distributedworkload.yaml" + nameFormat: "dwl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" + podCount: "{{.workers}} + 1" diff --git a/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml b/resources/benchmarks/gang-scheduling/workflows/run-test-lw.yaml similarity index 62% rename from resources/benchmarks/gang-scheduling/workflows/runai-test.yaml rename to resources/benchmarks/gang-scheduling/workflows/run-test-lw.yaml index efdb2095..c461e0c1 100644 --- a/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml +++ b/resources/benchmarks/gang-scheduling/workflows/run-test-lw.yaml @@ -12,29 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: test-gang-scheduling-runai +name: test-gang-scheduling-lw tasks: -- id: register-trainingworkload - type: RegisterObj - params: - template: "resources/benchmarks/templates/runai/trainingworkload.yaml" - nameFormat: "twl{{._ENUM_}}" - podNameFormat: "{{._NAME_}}-0-0" - podCount: 1 -- id: register-distributedworkload - type: RegisterObj - params: - template: "resources/benchmarks/templates/runai/distributedworkload.yaml" - nameFormat: "dwl{{._ENUM_}}" - podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" - podCount: "{{.workers}} + 1" -# -### Benchmark test -# - id: job1 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 1 params: workers: 31 @@ -42,7 +25,7 @@ tasks: - id: job2 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 2 params: workers: 15 @@ -50,7 +33,7 @@ tasks: - id: job3 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 3 params: workers: 9 @@ -58,7 +41,7 @@ tasks: - id: job3.1 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 1 params: workers: 1 @@ -66,7 +49,7 @@ tasks: - id: job4 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 4 params: workers: 7 @@ -74,7 +57,7 @@ tasks: - id: job5 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 5 params: workers: 5 @@ -82,14 +65,15 @@ tasks: - id: job5.1 type: SubmitObj params: - refTaskId: register-trainingworkload + refTaskId: register count: 2 params: + replicas: 1 ttl: 2m - id: job6 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 6 params: workers: 4 @@ -97,7 +81,7 @@ tasks: - id: job6.1 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 1 params: workers: 1 @@ -105,7 +89,7 @@ tasks: - id: job7 type: SubmitObj 
params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 7 params: workers: 3 @@ -113,7 +97,7 @@ tasks: - id: job7.1 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 1 params: workers: 1 @@ -121,14 +105,15 @@ tasks: - id: job7.2 type: SubmitObj params: - refTaskId: register-trainingworkload + refTaskId: register count: 2 params: + replicas: 1 ttl: 2m - id: job8 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 8 params: workers: 3 @@ -136,7 +121,7 @@ tasks: - id: job9 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 9 params: workers: 2 @@ -144,7 +129,7 @@ tasks: - id: job9.1 type: SubmitObj params: - refTaskId: register-distributedworkload + refTaskId: register-lw count: 1 params: workers: 4 diff --git a/resources/benchmarks/templates/kai/job.yaml b/resources/benchmarks/templates/kai/job.yaml index 49de7dc4..e969e410 100644 --- a/resources/benchmarks/templates/kai/job.yaml +++ b/resources/benchmarks/templates/kai/job.yaml @@ -1,17 +1,3 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- apiVersion: batch/v1 kind: Job metadata: diff --git a/resources/benchmarks/templates/kai/mpijob.yaml b/resources/benchmarks/templates/kai/mpijob.yaml new file mode 100644 index 00000000..31ebee63 --- /dev/null +++ b/resources/benchmarks/templates/kai/mpijob.yaml @@ -0,0 +1,48 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: "{{._NAME_}}" + namespace: "default" + labels: + runai/queue: "test" +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + spec: + schedulerName: kai-scheduler + containers: + - image: busybox + name: mpi-launcher + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" + Worker: + replicas: {{.workers}} + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + labels: + app: {{._NAME_}} + spec: + schedulerName: kai-scheduler + containers: + - image: busybox + name: mpi-worker + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" diff --git a/resources/benchmarks/templates/kai/queue.yaml b/resources/benchmarks/templates/kai/queue.yaml index 1ce70ceb..2c732cf0 100644 --- a/resources/benchmarks/templates/kai/queue.yaml +++ b/resources/benchmarks/templates/kai/queue.yaml @@ -1,17 +1,3 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - apiVersion: scheduling.run.ai/v2 kind: Queue metadata: diff --git a/resources/benchmarks/templates/kueue/job.yaml b/resources/benchmarks/templates/kueue/job.yaml index 011d8296..e1d6230b 100644 --- a/resources/benchmarks/templates/kueue/job.yaml +++ b/resources/benchmarks/templates/kueue/job.yaml @@ -1,17 +1,3 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- apiVersion: batch/v1 kind: Job metadata: diff --git a/resources/benchmarks/templates/kueue/mpijob.yaml b/resources/benchmarks/templates/kueue/mpijob.yaml new file mode 100644 index 00000000..13c8904c --- /dev/null +++ b/resources/benchmarks/templates/kueue/mpijob.yaml @@ -0,0 +1,48 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: "{{._NAME_}}" + namespace: default + labels: + kueue.x-k8s.io/queue-name: team-queue + annotations: + kueue.x-k8s.io/job-group: group-{{._NAME_}} +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + spec: + containers: + - image: busybox + name: mpi-launcher + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" + Worker: + replicas: {{.workers}} + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + labels: + app: {{._NAME_}} + spec: + containers: + - image: busybox + name: mpi-worker + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: "8" diff --git a/resources/benchmarks/templates/kueue/rayjob.yaml b/resources/benchmarks/templates/kueue/rayjob.yaml new file mode 100644 index 00000000..30ccd21b --- /dev/null +++ b/resources/benchmarks/templates/kueue/rayjob.yaml @@ -0,0 +1,67 @@ +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: "{{._NAME_}}" + namespace: default + labels: + kueue.x-k8s.io/queue-name: team-queue + annotations: + kueue.x-k8s.io/job-group: group-{{._NAME_}} +spec: + entrypoint: python /home/ray/samples/sample_code.py + shutdownAfterJobFinishes: true + runtimeEnvYAML: | + pip: + - requests==2.26.0 + env_vars: + EXAMPLE_VAR: "value" + rayClusterSpec: + rayVersion: '2.9.0' + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + template: + metadata: 
+ annotations: + {{- if .ttl }} + pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}" + pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}" + {{- end }} + spec: + containers: + - name: ray-head + image: busybox + resources: + limits: + cpu: 100m + memory: 256M + nvidia.com/gpu: "8" + requests: + cpu: 100m + memory: 256M + nvidia.com/gpu: "8" + workerGroupSpecs: + - replicas: {{.workers}} + rayStartParams: + dashboard-host: '0.0.0.0' + groupName: test-group + template: + metadata: + annotations: + {{- if .ttl }} + pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}" + pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}" + {{- end }} + spec: + containers: + - name: ray-worker + image: busybox + resources: + limits: + cpu: 100m + memory: 256M + nvidia.com/gpu: "8" + requests: + cpu: 100m + memory: 256M + nvidia.com/gpu: "8" diff --git a/resources/templates/kueue/rayjob.yaml b/resources/templates/kueue/rayjob.yaml new file mode 100644 index 00000000..43b58db5 --- /dev/null +++ b/resources/templates/kueue/rayjob.yaml @@ -0,0 +1,56 @@ +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: "{{._NAME_}}" + namespace: {{.namespace}} + labels: + kueue.x-k8s.io/queue-name: {{.queueName}} + {{- if .priority }} + kueue.x-k8s.io/priority-class: {{.priority}} + {{- end }} +spec: + entrypoint: python /home/ray/samples/sample_code.py + shutdownAfterJobFinishes: true + runtimeEnvYAML: | + pip: + - requests==2.26.0 + env_vars: + EXAMPLE_VAR: "value" + rayClusterSpec: + rayVersion: '2.9.0' + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + template: + spec: + containers: + - name: ray-head + image: {{.image}} + resources: + limits: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + requests: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + workerGroupSpecs: + - replicas: {{.workers}} + rayStartParams: + dashboard-host: '0.0.0.0' + groupName: test-group + template: + spec: + containers: + - name: ray-worker + image: 
{{.image}} + resources: + limits: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + requests: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" diff --git a/resources/workflows/kueue/test-rayjob.yaml b/resources/workflows/kueue/test-rayjob.yaml new file mode 100644 index 00000000..fbf3212f --- /dev/null +++ b/resources/workflows/kueue/test-rayjob.yaml @@ -0,0 +1,103 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: test-kueue-rayjob +description: submit and validate rayjob with kueue +tasks: +- id: register-cluster-queue + type: RegisterObj + params: + template: "resources/templates/kueue/cluster-queue.yaml" +- id: register-local-queue + type: RegisterObj + params: + template: "resources/templates/kueue/local-queue.yaml" +- id: register-resource-flavor + type: RegisterObj + params: + template: "resources/templates/kueue/resource-flavor.yaml" +- id: register-job + type: RegisterObj + params: + template: "resources/templates/kueue/rayjob.yaml" + nameFormat: "job{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-raycluster-.*" + podCount: "{{.workers}} + 1" +- id: configure + type: Configure + params: + nodes: + - type: dgxa100.80g + count: 4 + labels: + nvidia.com/gpu.count: "8" + timeout: 1m +- id: create-resource-flavor + type: SubmitObj + params: + refTaskId: register-resource-flavor + canExist: true + params: + name: "gpu-node" + nodeLabels: + nvidia.com/gpu.count: "8" +- id: create-cluster-queue + type: SubmitObj + params: + refTaskId: register-cluster-queue + canExist: true + params: + name: team + flavor: gpu-node + cpu: 8 + memory: 36Gi + pods: 4 + gpu: 32 +- id: create-local-queue + type: SubmitObj + params: + refTaskId: register-local-queue + canExist: true + params: + name: team-queue + namespace: default + clusterQueue: team +- id: job + type: SubmitObj + params: + refTaskId: register-job + count: 1 + params: + queueName: team-queue + namespace: default + workers: 2 + image: ubuntu + cpu: 100m + memory: 512M + gpu: 8 + ttl: "20s" +#- id: status +# type: CheckObj +# params: +# refTaskId: job +# state: +# status: +# active: 4 +# timeout: 10s +- id: status + type: CheckPod + params: + refTaskId: job + status: Running + timeout: 10s diff --git a/scripts/benchmarks/gang-scheduling/run-kai.sh b/scripts/benchmarks/gang-scheduling/run-kai.sh index 4423a420..03b4f9de 100755 --- a/scripts/benchmarks/gang-scheduling/run-kai.sh +++ 
b/scripts/benchmarks/gang-scheduling/run-kai.sh @@ -18,4 +18,4 @@ set -e REPO_HOME=$(readlink -f $(dirname $(readlink -f "$0"))/../../../) -$REPO_HOME/bin/knavigator -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kai.yaml,run-test.yaml}" +$REPO_HOME/bin/knavigator -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kai.yaml,run-test-lw.yaml}" diff --git a/scripts/benchmarks/gang-scheduling/run-kueue.sh b/scripts/benchmarks/gang-scheduling/run-kueue.sh index 17530381..174d85b2 100755 --- a/scripts/benchmarks/gang-scheduling/run-kueue.sh +++ b/scripts/benchmarks/gang-scheduling/run-kueue.sh @@ -18,4 +18,4 @@ set -e REPO_HOME=$(readlink -f $(dirname $(readlink -f "$0"))/../../../) -$REPO_HOME/bin/knavigator -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kueue.yaml,run-test.yaml}" +$REPO_HOME/bin/knavigator -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kueue.yaml,run-test-lw.yaml}" diff --git a/scripts/benchmarks/gang-scheduling/run-runai.sh b/scripts/benchmarks/gang-scheduling/run-runai.sh index 2fe387e8..381a3e28 100755 --- a/scripts/benchmarks/gang-scheduling/run-runai.sh +++ b/scripts/benchmarks/gang-scheduling/run-runai.sh @@ -18,4 +18,4 @@ set -e REPO_HOME=$(readlink -f $(dirname $(readlink -f "$0"))/../../../) -$REPO_HOME/bin/knavigator -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,runai-test.yaml}" +$REPO_HOME/bin/knavigator -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-runai.yaml,run-test-lw.yaml}" diff --git a/scripts/env.sh b/scripts/env.sh index c631f956..94eded02 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -70,9 +70,9 @@ function wait_for_pods() { # KWOK # function deploy_kwok() { - printGreen Deploying KWOK KWOK_REPO=kubernetes-sigs/kwok KWOK_RELEASE="v0.6.1" + printGreen 
Deploying KWOK $KWOK_RELEASE # Deploy KWOK controller kubectl apply -f https://github.com/${KWOK_REPO}/releases/download/${KWOK_RELEASE}/kwok.yaml @@ -87,8 +87,8 @@ function deploy_kwok() { # Prometheus # function deploy_prometheus() { - printGreen Deploying Prometheus PROMETHEUS_STACK_VERSION=61.5.0 + printGreen Deploying Prometheus stack $PROMETHEUS_STACK_VERSION helm repo add --force-update prometheus-community https://prometheus-community.github.io/helm-charts @@ -118,8 +118,8 @@ function deploy_prometheus() { # https://github.com/kubernetes-sigs/jobset function deploy_jobset() { - printGreen Deploying jobset JOBSET_VERSION=v0.8.1 + printGreen Deploying jobset $JOBSET_VERSION kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml @@ -133,8 +133,11 @@ function deploy_jobset() { # https://github.com/kubernetes-sigs/kueue function deploy_kueue() { - printGreen Deploying kueue + deploy_mpi_operator + deploy_kuberay_operator + KUEUE_VERSION=v0.11.4 + printGreen Deploying kueue $KUEUE_VERSION kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml @@ -148,8 +151,8 @@ function deploy_kueue() { # https://github.com/volcano-sh/volcano function deploy_volcano() { - printGreen Deploying volcano VOLCANO_VERSION=v1.11.2 + printGreen Deploying volcano $VOLCANO_VERSION helm repo add --force-update volcano-sh https://volcano-sh.github.io/helm-charts @@ -168,8 +171,8 @@ function deploy_volcano() { # https://github.com/apache/yunikorn-core function deploy_yunikorn() { - printGreen Deploying yunikorn YUNIKORN_VERSION=v1.6.2 + printGreen Deploying yunikorn $YUNIKORN_VERSION helm repo add --force-update yunikorn https://apache.github.io/yunikorn-release @@ -224,17 +227,15 @@ Run:ai deployment requires environment variables: # https://github.com/NVIDIA/KAI-Scheduler/ function deploy_kai() { - printGreen Deploying kai - MPI_OPERATOR_VERSION=v0.6.0 - 
KAI_VERSION=v0.4.7 + deploy_mpi_operator - kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml + KAI_VERSION=v0.4.7 + printGreen Deploying kai scheduler $KAI_VERSION helm upgrade --install kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler -n kai-scheduler \ --version="$KAI_VERSION" --create-namespace --wait } - function deploy_scheduler_plugins() { printGreen Deploying scheduler-plugins SCHEDULER_PLUGINS_VERSION=v0.29.7 @@ -244,3 +245,19 @@ function deploy_scheduler_plugins() { --set-json 'scheduler.affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' \ --set-json 'controller.affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' } + +function deploy_mpi_operator() { + MPI_OPERATOR_VERSION=v0.6.0 + printGreen Deploying mpi-operator $MPI_OPERATOR_VERSION + kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml +} + +function deploy_kuberay_operator() { + RAY_VERSION=v1.3.2 + printGreen Deploying kuberay-operator $RAY_VERSION + + helm repo add --force-update kuberay https://ray-project.github.io/kuberay-helm/ + helm repo update + helm upgrade --install kuberay-operator kuberay/kuberay-operator -n kuberay-system --create-namespace \ + --version="$RAY_VERSION" --wait +}