diff --git a/docs/examples/kai/kai.md b/docs/examples/kai/kai.md new file mode 100644 index 00000000..a2f2a277 --- /dev/null +++ b/docs/examples/kai/kai.md @@ -0,0 +1,10 @@ +## Example of running `KAI` with `knavigator` + +### Running workflows with `MPI job` + +Install [KAI scheduler](https://github.com/NVIDIA/KAI-Scheduler/blob/main/README.md) in your cluster. + +Run an MPI job: +```shell +./bin/knavigator -workflow resources/workflows/kai/test-mpijob.yaml +``` diff --git a/docs/getting_started.md b/docs/getting_started.md index 348e8e27..3810e027 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -27,3 +27,4 @@ We have tested several of these and offer templates and workflows to support the * [Kueue](./examples/kueue/kueue.md) * [YuniKorn](./examples/yunikorn/yunikorn.md) * [Run:ai](./examples/runai/runai.md) +* [KAI](./examples/kai/kai.md) diff --git a/resources/templates/kai/mpijob.yaml b/resources/templates/kai/mpijob.yaml new file mode 100644 index 00000000..21fa2451 --- /dev/null +++ b/resources/templates/kai/mpijob.yaml @@ -0,0 +1,62 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "{{.namespace}}"
+  labels:
+    runai/queue: "{{.queue}}"
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        metadata:
+          annotations:
+            pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}" # quoted: annotation values must be strings
+            pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
+        spec:
+          schedulerName: kai-scheduler
+          containers:
+          - image: {{.image}}
+            name: mpi-launcher
+            resources:
+              limits:
+                cpu: "{{.cpu}}"
+                memory: {{.memory}}
+                nvidia.com/gpu: "{{.gpu}}"
+    Worker:
+      replicas: {{.workers}}
+      template:
+        metadata:
+          annotations:
+            pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}" # quoted: annotation values must be strings
+            pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
+          labels:
+            app: "{{._NAME_}}" # quoted like metadata.name above
+        spec:
+          schedulerName: kai-scheduler
+          containers:
+          - image: {{.image}}
+            name: mpi-worker
+            resources:
+              limits:
+                cpu: "{{.cpu}}"
+                memory: {{.memory}}
+                nvidia.com/gpu: "{{.gpu}}"
diff --git a/resources/templates/kai/queue.yaml b/resources/templates/kai/queue.yaml
new file mode 100644
index 00000000..1ce70ceb
--- /dev/null
+++ b/resources/templates/kai/queue.yaml
@@ -0,0 +1,35 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: scheduling.run.ai/v2
+kind: Queue
+metadata:
+  name: "{{.name}}"
+spec:
+  {{- if .parentQueue }}
+  parentQueue: "{{.parentQueue}}" # rendered only when the parentQueue parameter is set
+  {{- end }}
+  resources:
+    cpu:
+      quota: -1 # NOTE(review): -1 presumably means "unlimited"; the same value is used for every quota/limit below — confirm against the KAI queue docs
+      limit: -1
+      overQuotaWeight: 1
+    gpu:
+      quota: -1
+      limit: -1
+      overQuotaWeight: 1
+    memory:
+      quota: -1
+      limit: -1
+      overQuotaWeight: 1
diff --git a/resources/workflows/kai/test-mpijob.yaml b/resources/workflows/kai/test-mpijob.yaml
new file mode 100644
index 00000000..80739384
--- /dev/null
+++ b/resources/workflows/kai/test-mpijob.yaml
@@ -0,0 +1,70 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: test-kai-mpijob
+description: register, deploy and configure KAI scheduler custom resources
+tasks:
+- id: register-queue
+  type: RegisterObj
+  params:
+    template: "resources/templates/kai/queue.yaml"
+- id: register-mpijob
+  type: RegisterObj
+  params:
+    template: "resources/templates/kai/mpijob.yaml"
+    nameFormat: "mpijob{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
+    podCount: "{{.workers}} + 1"
+- id: configure
+  type: Configure
+  params:
+    nodes:
+    - type: dgxa100.80g
+      count: 3
+      labels:
+        nvidia.com/gpu.count: "8"
+    timeout: 1m
+- id: default-queue
+  type: SubmitObj
+  params:
+    refTaskId: register-queue
+    params:
+      name: default
+- id: test-queue
+  type: SubmitObj
+  params:
+    refTaskId: register-queue
+    params:
+      name: test
+      parentQueue: default
+- id: mpijob
+  type: SubmitObj
+  params:
+    refTaskId: register-mpijob
+    count: 1
+    params:
+      namespace: default
+      queue: test
+      workers: 2
+      image: ubuntu
+      cpu: 100m
+      memory: 250M
+      gpu: 8
+      ttl: "20s"
+- id: status
+  type: CheckPod
+  params:
+    refTaskId: mpijob
+    status: Running
+    timeout: 10s
diff --git a/scripts/create-test-cluster.sh b/scripts/create-test-cluster.sh
index b1d3ff01..d0f37131 100755
--- a/scripts/create-test-cluster.sh
+++ b/scripts/create-test-cluster.sh
@@ -33,10 +33,12 @@ if kind get clusters > /dev/null 2>&1; then
   read -p "> " choice
   if [[ "$choice" == "y" ]]; then
     kind delete cluster
-    kind create cluster --image=kindest/node:v1.29.7
+    kind create cluster
+    # NOTE(review): the node image was previously pinned with --image=kindest/node:v1.29.7; confirm that dropping the pin is intended
   fi
 else
-  kind create cluster --image=kindest/node:v1.29.7
+  kind create cluster
+  # NOTE(review): the node image was previously pinned with --image=kindest/node:v1.29.7; confirm that dropping the pin is intended
 fi
 
 deploy_prometheus
@@ -52,7 +54,8 @@ cat << EOF
   3: volcano (https://github.com/volcano-sh/volcano)
   4: yunikorn (https://github.com/apache/yunikorn-core)
   5: run:ai (https://www.run.ai)
-  6: combined: coscheduler plugin + jobset + kueue
+  6: kai (https://github.com/NVIDIA/KAI-Scheduler)
+  7: combined: coscheduler plugin + jobset + kueue
 EOF
 read -p "> " choice
 
@@ -73,6 +76,9 @@ case "$choice" in
     deploy_runai
     ;;
   6)
+    deploy_kai
+    ;;
+  7)
     deploy_scheduler_plugins
     deploy_jobset
     deploy_kueue
diff --git a/scripts/env.sh b/scripts/env.sh
index c3bb0042..3fec10ce 100644
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -231,6 +231,34 @@ Run:ai deployment requires environment variables:
     --set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}'
 }
 
+# https://github.com/NVIDIA/KAI-Scheduler/
+TRAINING_OPERATOR_VERSION=v1.8.0
+MPI_OPERATOR_VERSION=v0.4.0
+# deploy_kai installs the Kubeflow training operator, deletes the
+# mpijobs.kubeflow.org CRD, applies the mpi-operator v2beta1 manifest, and then
+# installs the kai-scheduler Helm chart into the kai-scheduler namespace.
+function deploy_kai() {
+  printGreen Deploying kai
+
+  kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION"
+
+  # Run the training operator with only the tfjob, pytorchjob and xgboostjob
+  # schemes enabled (mpijob is not in the list).
+  kubectl patch deployment training-operator -n kubeflow --type='json' \
+    -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]'
+
+  # Remove the existing MPIJob CRD before applying the mpi-operator manifest.
+  # --ignore-not-found keeps this step from failing if the CRD is not present.
+  kubectl delete crd mpijobs.kubeflow.org --ignore-not-found
+
+  kubectl apply -f "https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml"
+
+  helm repo add --force-update nvidia-k8s https://helm.ngc.nvidia.com/nvidia/k8s
+  helm repo update
+  helm upgrade --install kai-scheduler nvidia-k8s/kai-scheduler -n kai-scheduler \
+    --create-namespace --wait --set "global.registry=nvcr.io/nvidia/k8s"
+}
+
 SCHEDULER_PLUGINS_VERSION=v0.29.7
 function deploy_scheduler_plugins() {
   printGreen Deploying scheduler-plugins