Skip to content

Commit 38a4ded

Browse files
committed
feat(scheduler): explain unresolved scenario searches
Signed-off-by: Erez Freiberger <enoodle@gmail.com>
1 parent cdeb81d commit 38a4ded

11 files changed

Lines changed: 416 additions & 9 deletions

File tree

pkg/apis/scheduling/v2alpha2/podgroup_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,9 @@ type SchedulingConditionType string
283283
const (
284284
// UnschedulableOnNodePool means the pod group is Unschedulable on the current node pool
285285
UnschedulableOnNodePool SchedulingConditionType = "UnschedulableOnNodePool"
286+
287+
// ScenarioSearchUnresolved means bounded scenario search did not resolve a pending job in the current scheduling attempt
288+
ScenarioSearchUnresolved SchedulingConditionType = "ScenarioSearchUnresolved"
286289
)
287290

288291
// These are reasons for a pod group's transition to a condition.

pkg/scheduler/actions/common/solvers/search_result.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
package solvers
55

6+
import "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/podgroup_info"
7+
68
// SearchResultReason describes why a scenario search stopped.
79
type SearchResultReason string
810

@@ -54,6 +56,39 @@ func (r *SearchResult) scenarioSearchMetricResult() string {
5456
return string(r.reason)
5557
}
5658

59+
func (r *SearchResult) ScenarioSearchUnresolved() *podgroup_info.ScenarioSearchUnresolved {
60+
if r == nil {
61+
return nil
62+
}
63+
64+
var reason podgroup_info.ScenarioSearchResultReason
65+
switch r.reason {
66+
case SearchResultDeadlineExhausted:
67+
reason = podgroup_info.ScenarioSearchResultDeadlineExhausted
68+
case SearchResultGeneratorsExhausted:
69+
reason = podgroup_info.ScenarioSearchResultGeneratorsExhausted
70+
case SearchResultNoGenerator:
71+
reason = podgroup_info.ScenarioSearchResultNoGenerator
72+
case SearchResultNotAttempted:
73+
reason = podgroup_info.ScenarioSearchResultNotAttempted
74+
default:
75+
return nil
76+
}
77+
78+
return &podgroup_info.ScenarioSearchUnresolved{
79+
Reason: reason,
80+
ReducedBudget: r.reducedBudget,
81+
}
82+
}
83+
84+
func RecordScenarioSearchUnresolved(job *podgroup_info.PodGroupInfo, result *SearchResult) {
85+
unresolved := result.ScenarioSearchUnresolved()
86+
if unresolved == nil {
87+
return
88+
}
89+
job.SetScenarioSearchUnresolved(unresolved.Reason, unresolved.ReducedBudget)
90+
}
91+
5792
// NewNotAttemptedSearchResult returns a terminal result for callers that skip solver entry.
5893
func NewNotAttemptedSearchResult() *SearchResult {
5994
return terminalSearchResult(SearchResultNotAttempted, false, false)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// Copyright 2026 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package solvers
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
11+
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/podgroup_info"
12+
)
13+
14+
func TestSearchResultScenarioSearchUnresolved(t *testing.T) {
15+
tests := []struct {
16+
name string
17+
result *SearchResult
18+
expectedReason podgroup_info.ScenarioSearchResultReason
19+
reducedBudget bool
20+
}{
21+
{
22+
name: "deadline exhausted",
23+
result: terminalSearchResult(SearchResultDeadlineExhausted, false, true),
24+
expectedReason: podgroup_info.ScenarioSearchResultDeadlineExhausted,
25+
},
26+
{
27+
name: "generators exhausted",
28+
result: terminalSearchResult(SearchResultGeneratorsExhausted, false, true),
29+
expectedReason: podgroup_info.ScenarioSearchResultGeneratorsExhausted,
30+
},
31+
{
32+
name: "no generator",
33+
result: terminalSearchResult(SearchResultNoGenerator, false, false),
34+
expectedReason: podgroup_info.ScenarioSearchResultNoGenerator,
35+
},
36+
{
37+
name: "not attempted",
38+
result: terminalSearchResult(SearchResultNotAttempted, false, false),
39+
expectedReason: podgroup_info.ScenarioSearchResultNotAttempted,
40+
},
41+
{
42+
name: "reduced budget",
43+
result: terminalSearchResult(SearchResultDeadlineExhausted, true, true),
44+
expectedReason: podgroup_info.ScenarioSearchResultDeadlineExhausted,
45+
reducedBudget: true,
46+
},
47+
}
48+
49+
for _, tt := range tests {
50+
t.Run(tt.name, func(t *testing.T) {
51+
unresolved := tt.result.ScenarioSearchUnresolved()
52+
53+
require.NotNil(t, unresolved)
54+
require.Equal(t, tt.expectedReason, unresolved.Reason)
55+
require.Equal(t, tt.reducedBudget, unresolved.ReducedBudget)
56+
})
57+
}
58+
}
59+
60+
func TestSearchResultScenarioSearchUnresolvedIgnoresSolvedAndNilResults(t *testing.T) {
61+
require.Nil(t, solvedSearchResult(&solutionResult{solved: true}, false).ScenarioSearchUnresolved())
62+
63+
var result *SearchResult
64+
require.Nil(t, result.ScenarioSearchUnresolved())
65+
}

pkg/scheduler/actions/consolidation/consolidation.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,11 @@ func (alloc *consolidationAction) Execute(ssn *framework.Session) {
8282
if err != nil {
8383
log.InfraLogger.Errorf("Failed to commit consolidation statement: %v", err)
8484
}
85-
} else if shouldStopActionForSearchResult(searchResult) {
86-
return
8785
} else {
86+
solvers.RecordScenarioSearchUnresolved(job, searchResult)
87+
if shouldStopActionForSearchResult(searchResult) {
88+
return
89+
}
8890
smallestFailedJobs.UpdateRepresentative(job)
8991
}
9092
}

pkg/scheduler/actions/preempt/preempt.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,11 @@ func (alloc *preemptAction) Execute(ssn *framework.Session) {
9999
if err := statement.Commit(); err != nil {
100100
log.InfraLogger.Errorf("Failed to commit preemption statement: %v", err)
101101
}
102-
} else if shouldStopActionForSearchResult(searchResult) {
103-
return
104102
} else {
103+
solvers.RecordScenarioSearchUnresolved(job, searchResult)
104+
if shouldStopActionForSearchResult(searchResult) {
105+
return
106+
}
105107
log.InfraLogger.V(3).Infof("Didn't find a preemption strategy for job <%s/%s>",
106108
job.Namespace, job.Name)
107109
smallestFailedJobs.UpdateRepresentative(job)

pkg/scheduler/actions/reclaim/reclaim.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,11 @@ func (ra *reclaimAction) Execute(ssn *framework.Session) {
102102
if err := statement.Commit(); err != nil {
103103
log.InfraLogger.Errorf("Failed to commit reclaim statement: %v", err)
104104
}
105-
} else if shouldStopActionForSearchResult(searchResult) {
106-
return
107105
} else {
106+
solvers.RecordScenarioSearchUnresolved(job, searchResult)
107+
if shouldStopActionForSearchResult(searchResult) {
108+
return
109+
}
108110
log.InfraLogger.V(3).Infof("Didn't find a reclaim strategy for job <%s/%s>",
109111
job.Namespace, job.Name)
110112
smallestFailedJobs.UpdateRepresentative(job)

pkg/scheduler/actions/reclaim/reclaim_benchmark_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,20 @@ package reclaim_test
66
import (
77
"fmt"
88
"testing"
9+
"time"
910

11+
"github.com/stretchr/testify/require"
1012
. "go.uber.org/mock/gomock"
1113
"gopkg.in/h2non/gock.v1"
1214
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1315

16+
kaiv1 "github.com/kai-scheduler/KAI-scheduler/pkg/apis/kai/v1"
1417
kaiv1alpha1 "github.com/kai-scheduler/KAI-scheduler/pkg/apis/kai/v1alpha1"
18+
commonconstants "github.com/kai-scheduler/KAI-scheduler/pkg/common/constants"
1519
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/actions/reclaim"
20+
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/common_info"
1621
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/pod_status"
22+
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/podgroup_info"
1723
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/podgroup_info/subgroup_info"
1824
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/topology_info"
1925
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/constants"
@@ -46,6 +52,35 @@ const (
4652
unschedulableDistributedRackKey = "benchmark.kai.scheduler/rack"
4753
)
4854

55+
func TestReducedBudgetFailedReclaimRecordsScenarioSearchUnresolved(t *testing.T) {
56+
defer gock.Off()
57+
58+
test_utils.InitTestingInfrastructure()
59+
controller := NewController(t)
60+
defer controller.Finish()
61+
62+
topology := buildUnschedulableDistributedReclaimBenchmarkTopology(
63+
defaultUnschedulableDistributedReclaimBenchmarkParams(10),
64+
)
65+
ssn := test_utils.BuildSession(topology, controller)
66+
ssn.Config.ScenarioSearchBudgets = &kaiv1.ScenarioSearchBudgets{
67+
MaxActionSearchDuration: map[string]metav1.Duration{
68+
commonconstants.ActionReclaim: {Duration: 250 * time.Millisecond},
69+
},
70+
MaxJobSearchDuration: &metav1.Duration{Duration: time.Second},
71+
MinJobSearchDuration: &metav1.Duration{Duration: 500 * time.Millisecond},
72+
}
73+
74+
reclaim.New().Execute(ssn)
75+
76+
job := ssn.ClusterInfo.PodGroupInfos[common_info.PodGroupID("unschedulable-distributed-job")]
77+
require.NotNil(t, job)
78+
require.Empty(t, job.JobFitErrors)
79+
require.NotNil(t, job.ScenarioSearchUnresolved)
80+
require.Equal(t, podgroup_info.ScenarioSearchResultGeneratorsExhausted, job.ScenarioSearchUnresolved.Reason)
81+
require.True(t, job.ScenarioSearchUnresolved.ReducedBudget)
82+
}
83+
4984
func BenchmarkReclaimUnschedulableDistributedJob_10Node(b *testing.B) {
5085
benchmarkReclaimUnschedulableDistributedJob(b, 10)
5186
}

pkg/scheduler/api/podgroup_info/job_info.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,20 @@ type StalenessInfo struct {
5151
Stale bool
5252
}
5353

54+
// ScenarioSearchResultReason identifies why a scenario search ended without
// resolving a job.
type ScenarioSearchResultReason string

// Known ScenarioSearchResultReason values.
const (
	// ScenarioSearchResultDeadlineExhausted is the "deadline_exhausted" reason.
	ScenarioSearchResultDeadlineExhausted ScenarioSearchResultReason = "deadline_exhausted"
	// ScenarioSearchResultGeneratorsExhausted is the "generators_exhausted" reason.
	ScenarioSearchResultGeneratorsExhausted ScenarioSearchResultReason = "generators_exhausted"
	// ScenarioSearchResultNoGenerator is the "no_generator" reason.
	ScenarioSearchResultNoGenerator ScenarioSearchResultReason = "no_generator"
	// ScenarioSearchResultNotAttempted is the "not_attempted" reason.
	ScenarioSearchResultNotAttempted ScenarioSearchResultReason = "not_attempted"
)

// ScenarioSearchUnresolved describes a scenario search that did not resolve a job.
type ScenarioSearchUnresolved struct {
	// Reason is the reason the search ended unresolved.
	Reason ScenarioSearchResultReason
	// ReducedBudget is the reduced-budget flag reported alongside the reason.
	ReducedBudget bool
}
67+
5468
type PodGroupInfos struct {
5569
PodGroupInfos []*PodGroupInfo
5670
}
@@ -70,6 +84,8 @@ type PodGroupInfo struct {
7084
JobFitErrors []common_info.JobFitError
7185
TasksFitErrors map[common_info.PodID]*common_info.TasksFitErrors
7286

87+
ScenarioSearchUnresolved *ScenarioSearchUnresolved
88+
7389
AllocatedVector resource_info.ResourceVector
7490
VectorMap *resource_info.ResourceVectorMap
7591

@@ -574,6 +590,13 @@ func (pgi *PodGroupInfo) AddJobFitError(err common_info.JobFitError) {
574590
pgi.JobFitErrors = append(pgi.JobFitErrors, err)
575591
}
576592

593+
func (pgi *PodGroupInfo) SetScenarioSearchUnresolved(reason ScenarioSearchResultReason, reducedBudget bool) {
594+
pgi.ScenarioSearchUnresolved = &ScenarioSearchUnresolved{
595+
Reason: reason,
596+
ReducedBudget: reducedBudget,
597+
}
598+
}
599+
577600
func (pgi *PodGroupInfo) GetSchedulingConstraintsSignature() common_info.SchedulingConstraintsSignature {
578601
if pgi.schedulingConstraintsSignature == "" {
579602
pgi.schedulingConstraintsSignature = pgi.generateSchedulingConstraintsSignature()

0 commit comments

Comments
 (0)