Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Removed redundant `PodDisruptionBudgetImplemented` guard from operator PDB creation helper [#1613](https://github.com/kai-scheduler/KAI-Scheduler/pull/1613) [dttung2905](https://github.com/dttung2905)
- Updated Go toolchain and base build images to v1.26.3.
- **Breaking:** The PodGroup for a JobSet is now produced as a single PodGroup per JobSet with a two-level SubGroup hierarchy (one parent SubGroup per `replicatedJob`, one leaf SubGroup per replica) regardless of `startupPolicyOrder`. The `kai.scheduler/batch-min-member` annotation on the JobSet now overrides the root `minSubGroup`; the same annotation on `replicatedJobs[].template.metadata.annotations` overrides the leaf `minMember` (defaulting to `template.spec.parallelism`). [#1617](https://github.com/kai-scheduler/KAI-Scheduler/pull/1617) [davidLif](https://github.com/davidLif)
- Optimized the job solver to run the full allocation probe only once after partial search finds at least one solvable pending task.

### Fixed
- Reduced scheduler heap retention after scheduling cycles by clearing completed session snapshots and callback references, and by releasing the node scoring pool without waiting for finalizers.
Expand Down
31 changes: 15 additions & 16 deletions pkg/scheduler/actions/common/solvers/job_solver.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,23 @@ func (s *JobSolver) solvePendingJobWithGenerator(
generatorBudget *generatorSearchBudget,
) *SearchResult {
n := len(tasksToAllocate)
maxSolvedK, searchResult := s.searchMaxSolvableK(
ssn, state, pendingJob, tasksToAllocate, jobBudget, availableGenerator, generatorBudget,
)
if maxSolvedK == 0 {
if searchResult == nil {
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget())
if n > 1 {
maxSolvedK, searchResult := s.searchMaxSolvableK(
ssn, state, pendingJob, tasksToAllocate, jobBudget, availableGenerator, generatorBudget,
)
if maxSolvedK == 0 {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does it have to be equal to 0 instead of smaller than n-1? Why should we attempt to allocate all 3 pods if we can't find a solution for 2?

if searchResult == nil {
searchResult = terminalSearchResult(SearchResultGeneratorsExhausted, jobBudget.ReducedBudget())
}
return searchResult
}
return searchResult
}

result := s.probeAtK(ssn, state, pendingJob, tasksToAllocate, n, jobBudget, availableGenerator, generatorBudget)
return result
}

// searchMaxSolvableK returns the largest k in [0, n] for which a probe at k succeeds.
// searchMaxSolvableK returns the largest k in [0, n) for which a probe at k succeeds.
// Each probe is discarded before returning, so session state is clean on return.
// Successful probes update hints in state for use by subsequent probes.
// Complexity: O(log n) probes — exponential doubling to locate a failing k (or reach n),
Expand All @@ -191,7 +193,7 @@ func (s *JobSolver) searchMaxSolvableK(
generatorBudget *generatorSearchBudget,
) (int, *SearchResult) {
n := len(tasksToAllocate)
if n == 0 {
if n <= 1 {
return 0, nil
}

Expand All @@ -203,15 +205,15 @@ func (s *JobSolver) searchMaxSolvableK(
}

func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchResult) {
if n == 0 {
if n <= 1 {
return 0, nil
}

lo := 0
var hi int
var lastUnsolvedResult *SearchResult
k := 1
for {
for k < n {
result := probe(k)
if shouldStopSearch(result) {
return 0, result
Expand All @@ -222,12 +224,9 @@ func searchMaxSolvableK(n int, probe func(k int) *SearchResult) (int, *SearchRes
break
}
lo = k
if k == n {
return n, lastUnsolvedResult
}
k *= 2
if k > n {
k = n
if k >= n {
return lo, lastUnsolvedResult
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ func TestSolveWithResultRunsCompletePartialSearchForOneGeneratorBeforeNext(t *te

require.True(t, solved)
require.Equal(t, SearchResultSolved, result.Reason())
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3", "second:3"}, factoryCalls)
require.Equal(t, []string{"first:1", "second:1", "second:2", "second:3"}, factoryCalls)
}

func TestSearchMaxSolvableKStopsAfterTerminalPartialProbe(t *testing.T) {
Expand Down
36 changes: 36 additions & 0 deletions pkg/scheduler/actions/common/solvers/job_solver_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright 2025 NVIDIA CORPORATION
// SPDX-License-Identifier: Apache-2.0

package solvers

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestSearchMaxSolvableKSkipsFullProbe(t *testing.T) {
probes := []int{}

maxSolvedK, result := searchMaxSolvableK(4, func(k int) *SearchResult {
probes = append(probes, k)
return solvedSearchResult(&solutionResult{solved: true}, false)
})

require.Equal(t, 2, maxSolvedK)
require.Nil(t, result)
require.Equal(t, []int{1, 2}, probes)
}

func TestSearchMaxSolvableKSkipsSingleTaskFullProbe(t *testing.T) {
probes := []int{}

maxSolvedK, result := searchMaxSolvableK(1, func(k int) *SearchResult {
probes = append(probes, k)
return solvedSearchResult(&solutionResult{solved: true}, false)
})

require.Equal(t, 0, maxSolvedK)
require.Nil(t, result)
require.Empty(t, probes)
}
Loading