Skip to content

Commit 62da29a

Browse files
committed
perf(scheduler): avoid duplicate full job solver probe
Signed-off-by: Erez Freiberger <enoodle@gmail.com>
1 parent b197399 commit 62da29a

4 files changed

Lines changed: 57 additions & 20 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2222
- Removed redundant `PodDisruptionBudgetImplemented` guard from operator PDB creation helper [#1613](https://github.com/kai-scheduler/KAI-Scheduler/pull/1613) [dttung2905](https://github.com/dttung2905)
2323
- Updated Go toolchain and base build images to v1.26.3.
2424
- **Breaking:** The podgroup produced for JobSet is now produced as a single PodGroup per JobSet with a two-level SubGroup hierarchy (one parent SubGroup per `replicatedJob`, one leaf SubGroup per replica) regardless of `startupPolicyOrder`. The `kai.scheduler/batch-min-member` annotation on the JobSet now overrides the root `minSubGroup`; the same annotation on `replicatedJobs[].template.metadata.annotations` overrides the leaf `minMember` (defaulting to `template.spec.parallelism`). [#1617](https://github.com/kai-scheduler/KAI-Scheduler/pull/1617) [davidLif](https://github.com/davidLif)
25+
- Optimized the job solver to run the full allocation probe only once after partial search finds at least one solvable pending task.
2526

2627
### Fixed
2728
- Reduced scheduler heap retention after scheduling cycles by clearing completed session snapshots and callback references, and by releasing the node scoring pool without waiting for finalizers.

pkg/scheduler/actions/common/solvers/job_solver.go

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -178,16 +178,18 @@ func (s *JobSolver) solvePendingJobWithGenerator(
178178
) *SearchResult {
179179
n := len(tasksToAllocate)
180180
enteredSearch := false
181-
maxSolvedK, searchResult := s.searchMaxSolvableK(
182-
ssn, state, pendingJob, tasksToAllocate, jobBudget, registration, generatorBudget,
183-
)
184-
enteredSearch = searchResultEntered(searchResult) || maxSolvedK > 0
185-
if maxSolvedK == 0 {
186-
if searchResult == nil {
187-
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget(), false)
181+
if n > 1 {
182+
maxSolvedK, searchResult := s.searchMaxSolvableK(
183+
ssn, state, pendingJob, tasksToAllocate, jobBudget, registration, generatorBudget,
184+
)
185+
enteredSearch = searchResultEntered(searchResult) || maxSolvedK > 0
186+
if maxSolvedK == 0 {
187+
if searchResult == nil {
188+
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget(), false)
189+
}
190+
preserveEnteredSearch(searchResult, enteredSearch)
191+
return searchResult
188192
}
189-
preserveEnteredSearch(searchResult, enteredSearch)
190-
return searchResult
191193
}
192194

193195
result := s.probeAtK(ssn, state, pendingJob, tasksToAllocate, n, jobBudget, registration, generatorBudget)
@@ -197,7 +199,7 @@ func (s *JobSolver) solvePendingJobWithGenerator(
197199
return result
198200
}
199201

200-
// searchMaxSolvableK returns the largest k in [0, n] for which a probe at k succeeds.
202+
// searchMaxSolvableK returns the largest k in [0, n) for which a probe at k succeeds.
201203
// Each probe is discarded before returning, so session state is clean on return.
202204
// Successful probes update hints in state for use by subsequent probes.
203205
// Complexity: O(log n) probes — exponential doubling to locate a failing k (or reach n),
@@ -212,7 +214,7 @@ func (s *JobSolver) searchMaxSolvableK(
212214
generatorBudget *generatorSearchBudget,
213215
) (int, *SearchResult) {
214216
n := len(tasksToAllocate)
215-
if n == 0 {
217+
if n <= 1 {
216218
return 0, nil
217219
}
218220

@@ -224,16 +226,16 @@ func (s *JobSolver) searchMaxSolvableK(
224226
}
225227

226228
func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchResult) {
227-
if n == 0 {
229+
if n <= 1 {
228230
return 0, nil
229231
}
230232

231233
lo := 0
232-
var hi int
234+
hi := n
233235
var lastUnsolvedResult *SearchResult
234236
enteredSearch := false
235237
k := 1
236-
for {
238+
for k < n {
237239
result := probe(k)
238240
enteredSearch = enteredSearch || searchResultEntered(result) || resultSolved(result)
239241
if shouldStopSearch(result) {
@@ -246,12 +248,10 @@ func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchRes
246248
break
247249
}
248250
lo = k
249-
if k == n {
250-
return n, lastUnsolvedResult
251-
}
252251
k *= 2
253-
if k > n {
254-
k = n
252+
if k >= n {
253+
hi = n
254+
break
255255
}
256256
}
257257

pkg/scheduler/actions/common/solvers/job_solver_result_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ func TestSolveWithResultRunsCompletePartialSearchForOneGeneratorBeforeNext(t *te
243243

244244
require.True(t, solved)
245245
require.Equal(t, SearchResultSolved, result.Reason())
246-
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3", "second:3"}, factoryCalls)
246+
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3"}, factoryCalls)
247247
}
248248

249249
func TestSolveWithResultReportsDeadlineBeforeScenarioSimulation(t *testing.T) {
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package solvers
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestSearchMaxSolvableKSkipsFullProbe(t *testing.T) {
13+
probes := []int{}
14+
15+
maxSolvedK, result := searchMaxSolvableK(4, func(k int) *SearchResult {
16+
probes = append(probes, k)
17+
return solvedSearchResult(&solutionResult{solved: true}, false)
18+
})
19+
20+
require.Equal(t, 3, maxSolvedK)
21+
require.Nil(t, result)
22+
require.Equal(t, []int{1, 2, 3}, probes)
23+
}
24+
25+
func TestSearchMaxSolvableKSkipsSingleTaskFullProbe(t *testing.T) {
26+
probes := []int{}
27+
28+
maxSolvedK, result := searchMaxSolvableK(1, func(k int) *SearchResult {
29+
probes = append(probes, k)
30+
return solvedSearchResult(&solutionResult{solved: true}, false)
31+
})
32+
33+
require.Equal(t, 0, maxSolvedK)
34+
require.Nil(t, result)
35+
require.Empty(t, probes)
36+
}

0 commit comments

Comments
 (0)