File tree Expand file tree Collapse file tree 11 files changed +289
-39
lines changed
gang-scheduling/workflows
benchmarks/gang-scheduling Expand file tree Collapse file tree 11 files changed +289
-39
lines changed Original file line number Diff line number Diff line change 1010
1111Install [ JobSet API] ( https://github.com/kubernetes-sigs/jobset ) in your cluster:
1212``` shell
13- kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml
13+ JOBSET_VERSION=v0.8.1
14+ kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
1415```
1516
1617Run a jobset with workers:
Original file line number Diff line number Diff line change 11## Example of running ` KAI ` with ` knavigator `
22
3- ### Running workflows with ` MPI job `
3+ ### Running workflows with ` MPI job ` and ` Job `
44
55Install [ KAI scheduler] ( https://github.com/NVIDIA/KAI-Scheduler/blob/main/README.md ) in your cluster.
66
7- Run an MPI job:
7+ Run an MPI job:
88``` shell
99./bin/knavigator -workflow resources/workflows/kai/test-mpijob.yaml
1010```
11+
12+ Run a multi-replica Job:
13+ ``` shell
14+ ./bin/knavigator -workflow resources/workflows/kai/test-job.yaml
15+ ```
Original file line number Diff line number Diff line change 33Install ` kueue ` by following these [ instructions] ( https://kueue.sigs.k8s.io/docs/installation/ ) :
44
55``` bash
6- KUEUE_VERSION=v0.9.0
6+ KUEUE_VERSION=v0.11.4
77kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
88
99kubectl apply -f charts/overrides/kueue/priority.yaml
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# Registers the KAI queue and job templates, then creates the "default" queue
# and a "test" queue parented to it.
# The workflow name is distinct from the test-kai-job workflow so the two can be
# told apart.
name: config-kai
description: register, deploy and configure kai custom resources
tasks:
# Register the Queue template; the two SubmitObj tasks below instantiate it.
- id: register-queue
  type: RegisterObj
  params:
    template: "resources/templates/kai/queue.yaml"
# Register the Job template. No task in this file references `register`;
# presumably a later workflow submits it - confirm against the caller.
- id: register
  type: RegisterObj
  params:
    template: "resources/benchmarks/templates/kai/job.yaml"
    nameFormat: "job{{._ENUM_}}"
    podNameFormat: "{{._NAME_}}-[a-z0-9]+"
    podCount: "{{.replicas}}"
# Top-level queue; canExist tolerates a queue left over from an earlier run.
- id: default-queue
  type: SubmitObj
  params:
    refTaskId: register-queue
    canExist: true
    params:
      name: default
# Child queue; its name matches the runai/queue label used by the job template.
- id: test-queue
  type: SubmitObj
  params:
    refTaskId: register-queue
    canExist: true
    params:
      name: test
      parentQueue: default
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# Job template for the KAI benchmark.
# Template parameters: _NAME_ (object name), replicas (pod count), ttl (value of
# the kwok pod-complete annotations).
apiVersion: batch/v1
kind: Job
metadata:
  name: "{{._NAME_}}"
  namespace: "default"
spec:
  # completions == parallelism, so all replicas are expected to run concurrently.
  completions: {{.replicas}}
  parallelism: {{.replicas}}
  template:
    metadata:
      labels:
        runai/queue: "test"
      annotations:
        # Annotation values must be strings; quoting keeps a numeric-looking
        # ttl from being parsed as a number.
        # NOTE(review): presumably these drive kwok's simulated pod completion
        # delay - confirm against the kwok stage configuration.
        pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"
        pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
    spec:
      schedulerName: kai-scheduler
      containers:
      - name: test
        image: busybox
        imagePullPolicy: IfNotPresent
        resources:
          # Requests equal limits for every resource.
          limits:
            cpu: 100m
            memory: 250M
            nvidia.com/gpu: "8"
          requests:
            cpu: 100m
            memory: 250M
            nvidia.com/gpu: "8"
      restartPolicy: Never
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# Queue template.
# Template parameters: name (required), parentQueue (optional; the field is
# emitted only when the parameter is set).
apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
  name: "{{.name}}"
spec:
  {{- if .parentQueue }}
  parentQueue: "{{.parentQueue}}"
  {{- end }}
  # The same quota/limit/weight triple is applied to cpu, gpu and memory.
  # NOTE(review): -1 presumably means "unlimited" - confirm against the KAI
  # Queue CRD documentation.
  resources:
    cpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    gpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    memory:
      quota: -1
      limit: -1
      overQuotaWeight: 1
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# Parameterized Job template scheduled by kai-scheduler.
# Template parameters: _NAME_, namespace, replicas, queue, ttl, image, cpu,
# memory, gpu.
apiVersion: batch/v1
kind: Job
metadata:
  name: "{{._NAME_}}"
  namespace: "{{.namespace}}"
spec:
  # Integer fields stay unquoted so they render as numbers.
  completions: {{.replicas}}
  parallelism: {{.replicas}}
  template:
    metadata:
      labels:
        runai/queue: "{{.queue}}"
      annotations:
        # Annotation values must be strings; quoting keeps a numeric-looking
        # ttl from being parsed as a number.
        pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"
        pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
    spec:
      schedulerName: kai-scheduler
      containers:
      - name: test
        image: "{{.image}}"
        imagePullPolicy: IfNotPresent
        resources:
          # All templated quantities are quoted, matching cpu and gpu.
          limits:
            cpu: "{{.cpu}}"
            memory: "{{.memory}}"
            nvidia.com/gpu: "{{.gpu}}"
          requests:
            cpu: "{{.cpu}}"
            memory: "{{.memory}}"
            nvidia.com/gpu: "{{.gpu}}"
      restartPolicy: Never
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
# Configures three 8-GPU nodes, creates the default/test queues, submits one
# 3-replica Job to the "test" queue and checks that its pods reach Running.
name: test-kai-job
description: submit and validate a job with kai scheduler
tasks:
# Register the Queue template instantiated by default-queue and test-queue.
- id: register-queue
  type: RegisterObj
  params:
    template: "resources/templates/kai/queue.yaml"
# Register the Job template instantiated by the `job` task.
- id: register-job
  type: RegisterObj
  params:
    template: "resources/templates/kai/job.yaml"
    nameFormat: "job{{._ENUM_}}"
    podNameFormat: "{{._NAME_}}-[a-z0-9]+"
    podCount: "{{.replicas}}"
# Three nodes, each labelled with 8 GPUs.
- id: configure
  type: Configure
  params:
    nodes:
    - type: dgxa100.80g
      count: 3
      labels:
        nvidia.com/gpu.count: "8"
    timeout: 1m
# canExist tolerates queues left over from an earlier run.
- id: default-queue
  type: SubmitObj
  params:
    refTaskId: register-queue
    canExist: true
    params:
      name: default
- id: test-queue
  type: SubmitObj
  params:
    refTaskId: register-queue
    canExist: true
    params:
      name: test
      parentQueue: default
# One Job with 3 replicas, each requesting 8 GPUs, matching the 3 x 8-GPU
# nodes configured above.
- id: job
  type: SubmitObj
  params:
    refTaskId: register-job
    count: 1
    params:
      namespace: default
      queue: test
      replicas: 3
      image: ubuntu
      cpu: 100m
      memory: 250M
      gpu: 8
      ttl: "20s"
# Expect the pods of `job` to report Running within 10 seconds.
- id: status
  type: CheckPod
  params:
    refTaskId: job
    status: Running
    timeout: 10s
Original file line number Diff line number Diff line change 1313# limitations under the License.
1414
1515name : test-kai-mpijob
16- description : register, deploy and configure run:ai custom resources
16+ description : submit and validate an mpijob with kai scheduler
1717tasks :
1818- id : register-queue
1919 type : RegisterObj
Original file line number Diff line number Diff line change 1+ #! /bin/bash
2+
3+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
4+ #
5+ # Licensed under the Apache License, Version 2.0 (the "License");
6+ # you may not use this file except in compliance with the License.
7+ # You may obtain a copy of the License at
8+ #
9+ # http://www.apache.org/licenses/LICENSE-2.0
10+ #
11+ # Unless required by applicable law or agreed to in writing, software
12+ # distributed under the License is distributed on an "AS IS" BASIS,
13+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ # See the License for the specific language governing permissions and
15+ # limitations under the License.
16+
# Abort on the first failing command.
set -e

# REPO_HOME is the directory three levels above this script's resolved
# location. Every substitution is quoted so paths containing spaces survive.
REPO_HOME="$(readlink -f "$(dirname "$(readlink -f "$0")")/../../../")"

# The brace list is inside double quotes, so bash does not expand it: the
# whole pattern reaches knavigator as a single -workflow argument.
"$REPO_HOME/bin/knavigator" -workflow "$REPO_HOME/resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kai.yaml,run-test.yaml}"
You can’t perform that action at this time.
0 commit comments