Skip to content

Commit c9de803

Browse files
committed
perf(scheduler): avoid duplicate full job solver probe
Signed-off-by: Erez Freiberger <enoodle@gmail.com>
1 parent 5e6c486 commit c9de803

4 files changed

Lines changed: 55 additions & 18 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2222
- Removed redundant `PodDisruptionBudgetImplemented` guard from operator PDB creation helper [#1613](https://github.com/kai-scheduler/KAI-Scheduler/pull/1613) [dttung2905](https://github.com/dttung2905)
2323
- Updated Go toolchain and base build images to v1.26.3.
2424
- **Breaking:** The podgroup produced for JobSet is now produced as a single PodGroup per JobSet with a two-level SubGroup hierarchy (one parent SubGroup per `replicatedJob`, one leaf SubGroup per replica) regardless of `startupPolicyOrder`. The `kai.scheduler/batch-min-member` annotation on the JobSet now overrides the root `minSubGroup`; the same annotation on `replicatedJobs[].template.metadata.annotations` overrides the leaf `minMember` (defaulting to `template.spec.parallelism`). [#1617](https://github.com/kai-scheduler/KAI-Scheduler/pull/1617) [davidLif](https://github.com/davidLif)
25+
- Optimized the job solver to run the full allocation probe only once after partial search finds at least one solvable pending task.
2526

2627
### Fixed
2728
- Reduced scheduler heap retention after scheduling cycles by clearing completed session snapshots and callback references, and by releasing the node scoring pool without waiting for finalizers.

pkg/scheduler/actions/common/solvers/job_solver.go

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -165,21 +165,23 @@ func (s *JobSolver) solvePendingJobWithGenerator(
165165
generatorBudget *generatorSearchBudget,
166166
) *SearchResult {
167167
n := len(tasksToAllocate)
168-
maxSolvedK, searchResult := s.searchMaxSolvableK(
169-
ssn, state, pendingJob, tasksToAllocate, jobBudget, availableGenerator, generatorBudget,
170-
)
171-
if maxSolvedK == 0 {
172-
if searchResult == nil {
173-
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget())
168+
if n > 1 {
169+
maxSolvedK, searchResult := s.searchMaxSolvableK(
170+
ssn, state, pendingJob, tasksToAllocate, jobBudget, availableGenerator, generatorBudget,
171+
)
172+
if maxSolvedK == 0 {
173+
if searchResult == nil {
174+
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget())
175+
}
176+
return searchResult
174177
}
175-
return searchResult
176178
}
177179

178180
result := s.probeAtK(ssn, state, pendingJob, tasksToAllocate, n, jobBudget, availableGenerator, generatorBudget)
179181
return result
180182
}
181183

182-
// searchMaxSolvableK returns the largest k in [0, n] for which a probe at k succeeds.
184+
// searchMaxSolvableK returns the largest k in [0, n) for which a probe at k succeeds.
183185
// Each probe is discarded before returning, so session state is clean on return.
184186
// Successful probes update hints in state for use by subsequent probes.
185187
// Complexity: O(log n) probes — exponential doubling to locate a failing k (or reach n),
@@ -194,7 +196,7 @@ func (s *JobSolver) searchMaxSolvableK(
194196
generatorBudget *generatorSearchBudget,
195197
) (int, *SearchResult) {
196198
n := len(tasksToAllocate)
197-
if n == 0 {
199+
if n <= 1 {
198200
return 0, nil
199201
}
200202

@@ -206,15 +208,15 @@ func (s *JobSolver) searchMaxSolvableK(
206208
}
207209

208210
func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchResult) {
209-
if n == 0 {
211+
if n <= 1 {
210212
return 0, nil
211213
}
212214

213215
lo := 0
214-
var hi int
216+
hi := n
215217
var lastUnsolvedResult *SearchResult
216218
k := 1
217-
for {
219+
for k < n {
218220
result := probe(k)
219221
if shouldStopSearch(result) {
220222
return 0, result
@@ -225,12 +227,10 @@ func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchRes
225227
break
226228
}
227229
lo = k
228-
if k == n {
229-
return n, lastUnsolvedResult
230-
}
231230
k *= 2
232-
if k > n {
233-
k = n
231+
if k >= n {
232+
hi = n
233+
break
234234
}
235235
}
236236

pkg/scheduler/actions/common/solvers/job_solver_result_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ func TestSolveWithResultRunsCompletePartialSearchForOneGeneratorBeforeNext(t *te
281281

282282
require.True(t, solved)
283283
require.Equal(t, SearchResultSolved, result.Reason())
284-
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3", "second:3"}, factoryCalls)
284+
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3"}, factoryCalls)
285285
}
286286

287287
func TestSearchMaxSolvableKStopsAfterTerminalPartialProbe(t *testing.T) {
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package solvers
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestSearchMaxSolvableKSkipsFullProbe(t *testing.T) {
13+
probes := []int{}
14+
15+
maxSolvedK, result := searchMaxSolvableK(4, func(k int) *SearchResult {
16+
probes = append(probes, k)
17+
return solvedSearchResult(&solutionResult{solved: true}, false)
18+
})
19+
20+
require.Equal(t, 3, maxSolvedK)
21+
require.Nil(t, result)
22+
require.Equal(t, []int{1, 2, 3}, probes)
23+
}
24+
25+
func TestSearchMaxSolvableKSkipsSingleTaskFullProbe(t *testing.T) {
26+
probes := []int{}
27+
28+
maxSolvedK, result := searchMaxSolvableK(1, func(k int) *SearchResult {
29+
probes = append(probes, k)
30+
return solvedSearchResult(&solutionResult{solved: true}, false)
31+
})
32+
33+
require.Equal(t, 0, maxSolvedK)
34+
require.Nil(t, result)
35+
require.Empty(t, probes)
36+
}

0 commit comments

Comments
 (0)