Skip to content

Commit 607c08b

Browse files
committed
perf(scheduler): avoid duplicate full job solver probe
Signed-off-by: Erez Freiberger <enoodle@gmail.com>
1 parent 79e14eb commit 607c08b

4 files changed

Lines changed: 55 additions & 18 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2222
- Removed redundant `PodDisruptionBudgetImplemented` guard from operator PDB creation helper [#1613](https://github.com/kai-scheduler/KAI-Scheduler/pull/1613) [dttung2905](https://github.com/dttung2905)
2323
- Updated Go toolchain and base build images to v1.26.3.
2424
- **Breaking:** The podgroup produced for JobSet is now produced as a single PodGroup per JobSet with a two-level SubGroup hierarchy (one parent SubGroup per `replicatedJob`, one leaf SubGroup per replica) regardless of `startupPolicyOrder`. The `kai.scheduler/batch-min-member` annotation on the JobSet now overrides the root `minSubGroup`; the same annotation on `replicatedJobs[].template.metadata.annotations` overrides the leaf `minMember` (defaulting to `template.spec.parallelism`). [#1617](https://github.com/kai-scheduler/KAI-Scheduler/pull/1617) [davidLif](https://github.com/davidLif)
25+
- Optimized the job solver to run the full allocation probe only once after partial search finds at least one solvable pending task.
2526

2627
### Fixed
2728
- Reduced scheduler heap retention after scheduling cycles by clearing completed session snapshots and callback references, and by releasing the node scoring pool without waiting for finalizers.

pkg/scheduler/actions/common/solvers/job_solver.go

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -162,21 +162,23 @@ func (s *JobSolver) solvePendingJobWithGenerator(
162162
generatorBudget *generatorSearchBudget,
163163
) *SearchResult {
164164
n := len(tasksToAllocate)
165-
maxSolvedK, searchResult := s.searchMaxSolvableK(
166-
ssn, state, pendingJob, tasksToAllocate, jobBudget, availableGenerator, generatorBudget,
167-
)
168-
if maxSolvedK == 0 {
169-
if searchResult == nil {
170-
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget())
165+
if n > 1 {
166+
maxSolvedK, searchResult := s.searchMaxSolvableK(
167+
ssn, state, pendingJob, tasksToAllocate, jobBudget, availableGenerator, generatorBudget,
168+
)
169+
if maxSolvedK == 0 {
170+
if searchResult == nil {
171+
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget())
172+
}
173+
return searchResult
171174
}
172-
return searchResult
173175
}
174176

175177
result := s.probeAtK(ssn, state, pendingJob, tasksToAllocate, n, jobBudget, availableGenerator, generatorBudget)
176178
return result
177179
}
178180

179-
// searchMaxSolvableK returns the largest k in [0, n] for which a probe at k succeeds.
181+
// searchMaxSolvableK returns the largest k in [0, n) for which a probe at k succeeds.
180182
// Each probe is discarded before returning, so session state is clean on return.
181183
// Successful probes update hints in state for use by subsequent probes.
182184
// Complexity: O(log n) probes — exponential doubling to locate a failing k (or reach n),
@@ -191,7 +193,7 @@ func (s *JobSolver) searchMaxSolvableK(
191193
generatorBudget *generatorSearchBudget,
192194
) (int, *SearchResult) {
193195
n := len(tasksToAllocate)
194-
if n == 0 {
196+
if n <= 1 {
195197
return 0, nil
196198
}
197199

@@ -203,15 +205,15 @@ func (s *JobSolver) searchMaxSolvableK(
203205
}
204206

205207
func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchResult) {
206-
if n == 0 {
208+
if n <= 1 {
207209
return 0, nil
208210
}
209211

210212
lo := 0
211-
var hi int
213+
hi := n
212214
var lastUnsolvedResult *SearchResult
213215
k := 1
214-
for {
216+
for k < n {
215217
result := probe(k)
216218
if shouldStopSearch(result) {
217219
return 0, result
@@ -222,12 +224,10 @@ func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchRes
222224
break
223225
}
224226
lo = k
225-
if k == n {
226-
return n, lastUnsolvedResult
227-
}
228227
k *= 2
229-
if k > n {
230-
k = n
228+
if k >= n {
229+
hi = n
230+
break
231231
}
232232
}
233233

pkg/scheduler/actions/common/solvers/job_solver_result_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ func TestSolveWithResultRunsCompletePartialSearchForOneGeneratorBeforeNext(t *te
273273

274274
require.True(t, solved)
275275
require.Equal(t, SearchResultSolved, result.Reason())
276-
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3", "second:3"}, factoryCalls)
276+
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3"}, factoryCalls)
277277
}
278278

279279
func TestSearchMaxSolvableKStopsAfterTerminalPartialProbe(t *testing.T) {
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package solvers
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestSearchMaxSolvableKSkipsFullProbe(t *testing.T) {
13+
probes := []int{}
14+
15+
maxSolvedK, result := searchMaxSolvableK(4, func(k int) *SearchResult {
16+
probes = append(probes, k)
17+
return solvedSearchResult(&solutionResult{solved: true}, false)
18+
})
19+
20+
require.Equal(t, 3, maxSolvedK)
21+
require.Nil(t, result)
22+
require.Equal(t, []int{1, 2, 3}, probes)
23+
}
24+
25+
func TestSearchMaxSolvableKSkipsSingleTaskFullProbe(t *testing.T) {
26+
probes := []int{}
27+
28+
maxSolvedK, result := searchMaxSolvableK(1, func(k int) *SearchResult {
29+
probes = append(probes, k)
30+
return solvedSearchResult(&solutionResult{solved: true}, false)
31+
})
32+
33+
require.Equal(t, 0, maxSolvedK)
34+
require.Nil(t, result)
35+
require.Empty(t, probes)
36+
}

0 commit comments

Comments
 (0)