Skip to content

Commit d348c29

Browse files
authored
Merge branch 'main' into fix/sharing-volume-dots
2 parents 536313d + d226f61 commit d348c29

25 files changed

Lines changed: 1310 additions & 45 deletions

deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,43 @@ spec:
142142
resourcetype=1500, podaffinity=1400, elastic=1300, kubeflow=1200,
143143
ray=1100, subgrouporder=1000, taskorder=900, nominatednode=800,
144144
dynamicresources=700, minruntime=600, topology=500, snapshot=400,
145-
gpupack/gpuspread=300, nodeplacement=200, gpusharingorder=100.
145+
sg-nodelocalgreedy=360, sg-multinodegang=350, gpupack/gpuspread=300,
146+
nodeplacement=200, gpusharingorder=100.
146147
type: object
147148
queueDepthPerAction:
148149
additionalProperties:
149150
type: integer
150151
description: QueueDepthPerAction max number of jobs to try for action
151152
per queue
152153
type: object
154+
scenarioSearchBudgets:
155+
description: ScenarioSearchBudgets configures alpha/experimental time
156+
budgets for scenario search.
157+
properties:
158+
maxActionSearchDuration:
159+
additionalProperties:
160+
type: string
161+
description: |-
162+
MaxActionSearchDuration limits total scenario search time per scheduler action.
163+
Keys are action names, with "default" used as the fallback budget.
164+
type: object
165+
maxGeneratorSearchDuration:
166+
additionalProperties:
167+
type: string
168+
description: |-
169+
MaxGeneratorSearchDuration limits scenario search time per generator attempt.
170+
Keys are generator names, with "default" used as the fallback budget.
171+
type: object
172+
maxJobSearchDuration:
173+
description: MaxJobSearchDuration limits total scenario search
174+
time per pending job.
175+
type: string
176+
minJobSearchDuration:
177+
description: |-
178+
MinJobSearchDuration guarantees each pending job this much scenario search time
179+
before action and generator budgets can stop the job's search.
180+
type: string
181+
type: object
153182
usageDBConfig:
154183
description: UsageDBConfig defines configuration for the usage db
155184
client

deployments/kai-scheduler/templates/default-shard.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,8 @@ spec:
2727
actions:
2828
{{- toYaml .Values.scheduler.actions | nindent 4 }}
2929
{{- end }}
30+
{{- if .Values.scheduler.scenarioSearchBudgets }}
31+
scenarioSearchBudgets:
32+
{{- toYaml .Values.scheduler.scenarioSearchBudgets | nindent 4 }}
33+
{{- end }}
3034
{{- end }}

deployments/kai-scheduler/tests/default_shard_test.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,27 @@ tests:
3131
asserts:
3232
- hasDocuments:
3333
count: 0
34+
35+
- it: should render scenario search budgets
36+
set:
37+
scheduler:
38+
scenarioSearchBudgets:
39+
maxActionSearchDuration:
40+
reclaim: "2s"
41+
maxJobSearchDuration: "250ms"
42+
minJobSearchDuration: "0s"
43+
maxGeneratorSearchDuration:
44+
NodeLocalGreedy: "50ms"
45+
asserts:
46+
- equal:
47+
path: spec.scenarioSearchBudgets.maxActionSearchDuration.reclaim
48+
value: 2s
49+
- equal:
50+
path: spec.scenarioSearchBudgets.maxJobSearchDuration
51+
value: 250ms
52+
- equal:
53+
path: spec.scenarioSearchBudgets.minJobSearchDuration
54+
value: 0s
55+
- equal:
56+
path: spec.scenarioSearchBudgets.maxGeneratorSearchDuration.NodeLocalGreedy
57+
value: 50ms

deployments/kai-scheduler/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ scheduler:
149149
# mycustomaction:
150150
# priority: 150
151151
actions: {}
152+
# Alpha/experimental scenario search time budgets. Values are Go duration strings.
153+
scenarioSearchBudgets: {}
152154

153155
# defaultShard controls the chart-managed "default" SchedulingShard CR,
154156
# whose spec is populated from the scheduler.* values above.

docs/developer/designs/reclaim-generator-portfolio-design.md

Lines changed: 387 additions & 0 deletions
Large diffs are not rendered by default.

pkg/apis/kai/v1/schedulingshard_types.go

Lines changed: 87 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ package v1
1818

1919
import (
2020
"strconv"
21+
"time"
2122

2223
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324
"k8s.io/utils/ptr"
2425

2526
"github.com/kai-scheduler/KAI-scheduler/pkg/apis/kai/v1/common"
27+
"github.com/kai-scheduler/KAI-scheduler/pkg/common/constants"
2628
usagedbapi "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
2729
)
2830

@@ -60,6 +62,23 @@ type ActionConfig struct {
6062
Priority *int `json:"priority,omitempty"`
6163
}
6264

65+
// ScenarioSearchBudgets holds the time budgets applied to scenario search.
// Every field is optional (all carry omitempty); DefaultScenarioSearchBudgets
// fills in the ones left unset.
type ScenarioSearchBudgets struct {
66+
// MaxActionSearchDuration limits total scenario search time per scheduler action.
67+
// Keys are action names, with "default" used as the fallback budget.
68+
MaxActionSearchDuration map[string]metav1.Duration `json:"maxActionSearchDuration,omitempty"`
69+
70+
// MaxJobSearchDuration limits total scenario search time per pending job.
71+
MaxJobSearchDuration *metav1.Duration `json:"maxJobSearchDuration,omitempty"`
72+
73+
// MinJobSearchDuration guarantees each pending job this much scenario search time
74+
// before action and generator budgets can stop the job's search.
75+
MinJobSearchDuration *metav1.Duration `json:"minJobSearchDuration,omitempty"`
76+
77+
// MaxGeneratorSearchDuration limits scenario search time per generator attempt.
78+
// Keys are generator names, with "default" used as the fallback budget.
79+
MaxGeneratorSearchDuration map[string]metav1.Duration `json:"maxGeneratorSearchDuration,omitempty"`
80+
}
81+
6382
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
6483

6584
// SchedulingShardSpec defines the desired state of SchedulingShard
@@ -100,6 +119,10 @@ type SchedulingShardSpec struct {
100119
// +kubebuilder:validation:Optional
101120
UsageDBConfig *usagedbapi.UsageDBConfig `yaml:"usageDBConfig,omitempty" json:"usageDBConfig,omitempty"`
102121

122+
// ScenarioSearchBudgets configures alpha/experimental time budgets for scenario search.
123+
// +kubebuilder:validation:Optional
124+
ScenarioSearchBudgets *ScenarioSearchBudgets `json:"scenarioSearchBudgets,omitempty"`
125+
103126
// Plugins allows overriding plugin configuration. Keys are plugin names.
104127
// Built-in plugins can be disabled, reordered, or have their arguments changed.
105128
// New plugins can be added by specifying a name not in the default set.
@@ -108,7 +131,8 @@ type SchedulingShardSpec struct {
108131
// resourcetype=1500, podaffinity=1400, elastic=1300, kubeflow=1200,
109132
// ray=1100, subgrouporder=1000, taskorder=900, nominatednode=800,
110133
// dynamicresources=700, minruntime=600, topology=500, snapshot=400,
111-
// gpupack/gpuspread=300, nodeplacement=200, gpusharingorder=100.
134+
// sg-nodelocalgreedy=360, sg-multinodegang=350, gpupack/gpuspread=300,
135+
// nodeplacement=200, gpusharingorder=100.
112136
// +kubebuilder:validation:Optional
113137
Plugins map[string]PluginConfig `json:"plugins,omitempty"`
114138

@@ -127,31 +151,73 @@ func (s *SchedulingShardSpec) SetDefaultsWhereNeeded() {
127151

128152
s.setDefaultPlugins()
129153
s.setDefaultActions()
154+
s.ScenarioSearchBudgets = DefaultScenarioSearchBudgets(s.ScenarioSearchBudgets)
155+
}
156+
157+
// DefaultScenarioSearchBudgets returns config with every unset budget filled in
// from the defaults declared in the constants package.
//
// A nil config is replaced by a newly allocated one. A non-nil config is
// modified in place and the same pointer is returned, so the caller's value is
// mutated. Map entries that already exist and pointer fields that are already
// non-nil are left untouched, including explicit zero durations.
func DefaultScenarioSearchBudgets(config *ScenarioSearchBudgets) *ScenarioSearchBudgets {
158+
if config == nil {
159+
config = &ScenarioSearchBudgets{}
160+
}
161+
// Allocate the map before inserting; writing to a nil map panics.
if config.MaxActionSearchDuration == nil {
162+
config.MaxActionSearchDuration = map[string]metav1.Duration{}
163+
}
164+
// Only the fallback ("default") action entry is defaulted; no per-action
// entries are added here.
if _, found := config.MaxActionSearchDuration[constants.ActionDefault]; !found {
165+
config.MaxActionSearchDuration[constants.ActionDefault] = mustParseScenarioSearchDuration(constants.DefaultActionBudget)
166+
}
167+
if config.MaxJobSearchDuration == nil {
168+
config.MaxJobSearchDuration = ptr.To(mustParseScenarioSearchDuration(constants.DefaultJobBudget))
169+
}
170+
if config.MinJobSearchDuration == nil {
171+
config.MinJobSearchDuration = ptr.To(mustParseScenarioSearchDuration(constants.DefaultMinJobBudget))
172+
}
173+
// Allocate the map before inserting; writing to a nil map panics.
if config.MaxGeneratorSearchDuration == nil {
174+
config.MaxGeneratorSearchDuration = map[string]metav1.Duration{}
175+
}
176+
// The generator fallback entry reuses the constants.ActionDefault key.
if _, found := config.MaxGeneratorSearchDuration[constants.ActionDefault]; !found {
177+
config.MaxGeneratorSearchDuration[constants.ActionDefault] = mustParseScenarioSearchDuration(constants.DefaultGeneratorBudget)
178+
}
179+
// Each named generator below also gets its own default entry when absent.
if _, found := config.MaxGeneratorSearchDuration[constants.GeneratorNodeLocalGreedy]; !found {
180+
config.MaxGeneratorSearchDuration[constants.GeneratorNodeLocalGreedy] = mustParseScenarioSearchDuration(constants.DefaultNodeLocalGreedy)
181+
}
182+
if _, found := config.MaxGeneratorSearchDuration[constants.GeneratorMultiNodeGang]; !found {
183+
config.MaxGeneratorSearchDuration[constants.GeneratorMultiNodeGang] = mustParseScenarioSearchDuration(constants.DefaultMultiNodeGang)
184+
}
185+
return config
186+
}
187+
188+
// mustParseScenarioSearchDuration parses value with time.ParseDuration and
// wraps the result in a metav1.Duration. It panics if value does not parse.
// In the code visible here it is only called with string constants from the
// constants package.
func mustParseScenarioSearchDuration(value string) metav1.Duration {
189+
duration, err := time.ParseDuration(value)
190+
if err != nil {
191+
panic(err)
192+
}
193+
return metav1.Duration{Duration: duration}
130194
}
131195

132196
// Default priorities preserve the current hardcoded ordering.
133197
// Higher priority = runs first. Spaced by 100.
134198
var defaultPluginPriorities = map[string]int{
135-
"predicates": 1900,
136-
"proportion": 1800,
137-
"priority": 1700,
138-
"nodeavailability": 1600,
139-
"resourcetype": 1500,
140-
"podaffinity": 1400,
141-
"elastic": 1300,
142-
"kubeflow": 1200,
143-
"ray": 1100,
144-
"subgrouporder": 1000,
145-
"taskorder": 900,
146-
"nominatednode": 800,
147-
"dynamicresources": 700,
148-
"minruntime": 600,
149-
"topology": 500,
150-
"snapshot": 400,
151-
"gpupack": 300,
152-
"gpuspread": 300,
153-
"nodeplacement": 200,
154-
"gpusharingorder": 100,
199+
"predicates": 1900,
200+
"proportion": 1800,
201+
"priority": 1700,
202+
"nodeavailability": 1600,
203+
"resourcetype": 1500,
204+
"podaffinity": 1400,
205+
"elastic": 1300,
206+
"kubeflow": 1200,
207+
"ray": 1100,
208+
"subgrouporder": 1000,
209+
"taskorder": 900,
210+
"nominatednode": 800,
211+
"dynamicresources": 700,
212+
"minruntime": 600,
213+
"topology": 500,
214+
"snapshot": 400,
215+
"sg-nodelocalgreedy": 360,
216+
"sg-multinodegang": 350,
217+
"gpupack": 300,
218+
"gpuspread": 300,
219+
"nodeplacement": 200,
220+
"gpusharingorder": 100,
155221
}
156222

157223
var defaultActionPriorities = map[string]int{

pkg/apis/kai/v1/zz_generated.deepcopy.go

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/common/constants/constants.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@ const (
2727
DefaultRuntimeClassName = "nvidia"
2828
DefaultStuckInReleasingThreshold = 2 * time.Minute
2929

30+
ActionDefault = "default"
31+
ActionReclaim = "reclaim"
32+
ActionPreempt = "preempt"
33+
ActionConsolidation = "consolidation"
34+
35+
GeneratorNodeLocalGreedy = "NodeLocalGreedy"
36+
GeneratorMultiNodeGang = "MultiNodeGang"
37+
38+
DefaultActionBudget = "5m"
39+
DefaultJobBudget = "4m"
40+
DefaultMinJobBudget = "0s"
41+
DefaultGeneratorBudget = "2m"
42+
DefaultNodeLocalGreedy = "30s"
43+
DefaultMultiNodeGang = "2m"
44+
3045
DefaultCPUWorkerNodeLabelKey = "node-role.kubernetes.io/cpu-worker"
3146
DefaultGPUWorkerNodeLabelKey = "node-role.kubernetes.io/gpu-worker"
3247
DefaultMIGWorkerNodeLabelKey = "node-role.kubernetes.io/mig-enabled"

0 commit comments

Comments
 (0)