kai-scheduler
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/metrics/METRICS.md b/docs/metrics/METRICS.md
Lines changed: 11 additions & 0 deletions
diff --git a/go.mod b/go.mod
Lines changed: 1 addition & 1 deletion
diff --git a/pkg/scheduler/actions/common/solvers/job_solver.go b/pkg/scheduler/actions/common/solvers/job_solver.go
Lines changed: 46 additions & 2 deletions
diff --git a/pkg/scheduler/actions/common/solvers/job_solver_result_test.go b/pkg/scheduler/actions/common/solvers/job_solver_result_test.go
Lines changed: 128 additions & 0 deletions
@@ -7,7 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ## [Unreleased]
 
 ### Added
-- Added built-in `NodeLocalGreedy` and `MultiNodeGang` scenario generator implementations for bounded reclaim, preempt, and consolidation search.
+- Added a bounded scenario generator portfolio for reclaim, preempt, and consolidation search, with `SchedulingShard.spec.scenarioSearchBudgets` time-budget configuration and production scenario-search metrics.
 - Added an opt-in `deviceaccess` admission plugin (`--block-nvidia-visible-devices`, config field `admission.blockNvidiaVisibleDevices`, default disabled) that (1) rejects pods overriding the `NVIDIA_VISIBLE_DEVICES` environment variable with values other than `void`/`none` (or via a `valueFrom` reference), and (2) injects `NVIDIA_VISIBLE_DEVICES=void` into containers that do not request a GPU, blocking their access to GPUs on the node.
 - Added support for configuring admission Pod Disruption Budget via Helm values (`admission.podDisruptionBudget`) [#1490](https://github.com/kai-scheduler/KAI-Scheduler/pull/1490) [dttung2905](https://github.com/dttung2905)
 - Added an opt-in `hamicore` binder plugin (depends on `gpusharing`) to write the HAMI-core GPU memory limit (`CUDA_DEVICE_MEMORY_LIMIT`) for fractional GPU pods.
 
@@ -59,6 +59,13 @@ Metrics related to the core scheduling algorithm performance, task lifecycle, an
 | `scenarios_filtered_by_action` | Counter | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `action` | Cumulative count of simulation scenarios filtered/rejected by each action. |
 | `total_preemption_attempts` | Counter | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service` | Cumulative total of preemption attempts across the entire cluster lifetime. |
 | `pod_group_evicted_pods_total` | Counter | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `podgroup`, `uid`, `nodepool`, `action` | Cumulative count of pods evicted per pod group, tracked by nodepool and action. |
+| `scenario_search_jobs_total` | Counter | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `action`, `result`, `reduced_budget` | Cumulative count of jobs considered by bounded scenario search, grouped by scheduling action, terminal search result, and whether the job ran after the action budget was reduced. |
+| `scenario_search_action_budget_configured_seconds` | Gauge | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `action` | Configured scenario-search budget for each scheduling action in seconds. A value of 0 means unlimited. |
+| `scenario_search_job_budget_configured_seconds` | Gauge | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service` | Configured per-job scenario-search budget in seconds. A value of 0 means unlimited. |
+| `scenario_search_generator_budget_configured_seconds` | Gauge | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `generator` | Configured per-generator scenario-search budget in seconds. A value of 0 means unlimited. |
+| `scenario_search_action_budget_exhausted_total` | Counter | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `action` | Cumulative count of action-level scenario-search budget exhaustion events. |
+| `scenario_search_duration_seconds` | Histogram | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `action`, `generator`, `result` | Duration in seconds of generator scenario-search attempts. Buckets: [1ms, 2ms, 4ms, ..., 32.768s] (exponential). |
+| `scenario_search_scenarios_total` | Counter | `endpoint`, `instance`, `job`, `namespace`, `pod`, `service`, `action`, `generator`, `state` | Cumulative count of bounded-search scenarios emitted by generators, simulated by the solver, or rejected by validation. |
 
 ### Queue Fair-Share & Usage Metrics
 
@@ -88,6 +95,10 @@ Business/Resource Labels:
 - **`queue_metadata_name`**: The Queue resource's `metadata.name`. Always populated.
 - **`queue_display_name`**: The Queue's `spec.displayName`. Empty string when unset.
 - **`action`**: Scheduling action name
+- **`generator`**: Scenario generator name
+- **`result`**: Scenario search result (`solved`, `deadline_exhausted`, `generators_exhausted`, `no_generator`, `not_attempted`, `unsolved`, or `validator_rejected`, depending on the metric)
+- **`reduced_budget`**: Whether the scenario search ran after the action budget was reduced (`true` or `false`)
+- **`state`**: Scenario lifecycle state (`emitted`, `simulated`, or `validator_rejected`)
 - **`plugin`**: Plugin name
 - **`OnSession`**: Session lifecycle phase (`OnSessionOpen` or `OnSessionClose`)
 - **`podgroup`**: PodGroup resource identifier
 
@@ -24,6 +24,7 @@ require (
 	github.com/pkg/errors v0.9.1
 	github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.88.0
 	github.com/prometheus/client_golang v1.23.2
+	github.com/prometheus/client_model v0.6.2
 	github.com/prometheus/common v0.67.5
 	github.com/ray-project/kuberay/ray-operator v1.5.1
 	github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7
@@ -149,7 +150,6 @@ require (
 	github.com/opencontainers/selinux v1.13.0 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.4 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
-	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/prometheus/procfs v0.20.1 // indirect
 	github.com/quic-go/qpack v0.6.0 // indirect
 	github.com/quic-go/quic-go v0.59.1 // indirect
 
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/actions/utils"
+	"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api"
 	"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/node_info"
 	"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/pod_info"
 	"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/podgroup_info"
@@ -84,12 +85,22 @@ func (s *JobSolver) Solve(
 func (s *JobSolver) SolveWithResult(
 	ssn *framework.Session, pendingJob *podgroup_info.PodGroupInfo,
 ) (solved bool, statement *framework.Statement, victimTaskNames []string, searchResult *SearchResult) {
+	defer func() {
+		if searchResult != nil {
+			metrics.IncScenarioSearchJobs(
+				s.actionType, searchResult.scenarioSearchMetricResult(), searchResult.ReducedBudget(),
+			)
+		}
+	}()
+
 	originalNumActiveTasks := pendingJob.GetNumActiveUsedTasks()
 
 	tasksToAllocate := podgroup_info.GetTasksToAllocate(pendingJob, ssn.SubGroupOrderFn, ssn.TaskOrderFn, false)
 	n := len(tasksToAllocate)
 	if n == 0 {
-		return false, nil, nil, terminalSearchResult(SearchResultGeneratorsExhausted, false)
+		searchResult := terminalSearchResult(SearchResultGeneratorsExhausted, false)
+		searchResult.metricResult = string(SearchResultNotAttempted)
+		return false, nil, nil, searchResult
 	}
 
 	jobBudget := s.actionBudget.BeginJob()
@@ -313,27 +324,60 @@ func (s *JobSolver) solvePartialJob(
 
 	for {
 		if jobBudget.Exhausted() {
+			s.observeActionBudgetExhausted()
 			return terminalSearchResult(SearchResultDeadlineExhausted, jobBudget.ReducedBudget())
 		}
 		scenarioToSolve := portfolio.Next()
 		if scenarioToSolve == nil {
 			break
 		}
-		scenarioSolver := newByPodSolver(feasibleNodeMap, s.solutionValidator, ssn.AllowConsolidatingReclaim(),
+		generatorName := portfolio.CurrentGeneratorName()
+		validatorRejected := false
+		scenarioSolver := newByPodSolver(feasibleNodeMap, s.solutionValidatorWithMetrics(generatorName, &validatorRejected),
+			ssn.AllowConsolidatingReclaim(),
 			s.actionType)
 
 		log.InfraLogger.V(5).Infof("Trying to solve scenario: %s", scenarioToSolve)
 		metrics.IncScenarioSimulatedByAction()
+		metrics.IncScenarioSearchScenario(s.actionType, generatorName, "simulated")
 
 		result := scenarioSolver.solve(ssn, scenarioToSolve)
+		attemptResult := scenarioSearchResultUnsolved
+		if validatorRejected {
+			attemptResult = scenarioSearchResultValidatorRejected
+		}
 		if result.solved {
+			portfolio.ObserveCurrentAttempt(string(SearchResultSolved))
 			return solvedSearchResult(result, jobBudget.ReducedBudget())
 		}
+		portfolio.ObserveCurrentAttempt(attemptResult)
 	}
 
 	return terminalSearchResult(portfolio.StopReason(), jobBudget.ReducedBudget())
 }
 
+// observeActionBudgetExhausted increments the action-budget-exhausted metric
+// for this solver's action type, but only when an action budget is set and it
+// reports itself as exhausted. In every other case it does nothing.
+func (s *JobSolver) observeActionBudgetExhausted() {
+	if s.actionBudget == nil || !s.actionBudget.Exhausted() {
+		return
+	}
+	metrics.IncScenarioSearchActionBudgetExhausted(s.actionType)
+}
+
+// solutionValidatorWithMetrics wraps the solver's solution validator so that
+// each rejection is counted under the "validator_rejected" scenario state for
+// the given generator and the solver's action type.
+//
+// When rejected is non-nil it is set to true on every rejection; this function
+// never resets it. If the solver has no validator, nil is returned instead of
+// a wrapper.
+func (s *JobSolver) solutionValidatorWithMetrics(generator string, rejected *bool) SolutionValidator {
+	if s.solutionValidator == nil {
+		return nil
+	}
+
+	// Bookkeeping performed each time the underlying validator says no.
+	recordRejection := func() {
+		if rejected != nil {
+			*rejected = true
+		}
+		metrics.IncScenarioSearchScenario(s.actionType, generator, "validator_rejected")
+	}
+
+	return func(candidate api.ScenarioInfo) bool {
+		if s.solutionValidator(candidate) {
+			return true
+		}
+		recordRejection()
+		return false
+	}
+}
+
 func shouldStopSearch(result *SearchResult) bool {
 	switch result.Reason() {
 	case SearchResultDeadlineExhausted, SearchResultNotAttempted, SearchResultNoGenerator:
 
@@ -8,6 +8,8 @@ import (
 	"testing"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/require"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -47,6 +49,22 @@ func TestSolveWithResultReturnsTerminalResultWhenNoTasksToAllocate(t *testing.T)
 	require.False(t, result.ReducedBudget())
 }
 
+func TestSolveWithResultRecordsNoSearchMetricAsNotAttempted(t *testing.T) {
+	labels := map[string]string{
+		"action":         "reclaim",
+		"result":         string(SearchResultNotAttempted),
+		"reduced_budget": "false",
+	}
+	before := scenarioSearchCounterValue(t, "scenario_search_jobs_total", labels)
+	solver := NewJobsSolver(nil, nil, nil, framework.Reclaim, nil)
+	pendingJob := podgroup_info.NewPodGroupInfo("pending-job")
+
+	_, _, _, result := solver.SolveWithResult(&framework.Session{}, pendingJob)
+
+	require.Equal(t, SearchResultGeneratorsExhausted, result.Reason())
+	require.Equal(t, before+1, scenarioSearchCounterValue(t, "scenario_search_jobs_total", labels))
+}
+
 func TestSolveWithResultReturnsNoGeneratorWhenGeneratorFuncIsNil(t *testing.T) {
 	ssn, pendingJob := newJobSolverResultTestSession(t, 1)
 	solver := NewJobsSolver(nil, nil, nil, framework.Reclaim, nil)
@@ -151,6 +169,65 @@ func TestSolveWithResultReportsDeadlineWhenBudgetExhaustsDuringScenarioSearch(t
 	require.Equal(t, SearchResultDeadlineExhausted, result.Reason())
 }
 
+func TestSolveWithResultRecordsGeneratorExhaustedMetricAfterGeneratorAttempt(t *testing.T) {
+	labels := map[string]string{
+		"action":         "reclaim",
+		"result":         string(SearchResultGeneratorsExhausted),
+		"reduced_budget": "false",
+	}
+	before := scenarioSearchCounterValue(t, "scenario_search_jobs_total", labels)
+	ssn, pendingJob := newJobSolverResultTestSession(t, 1)
+	ssn.AddScenarioGenerator("empty", portfolioTestFactory(&portfolioTestGenerator{name: "empty"}))
+	solver := NewJobsSolver(
+		nil,
+		nil,
+		func() *utils.JobsOrderByQueues {
+			return utils.GetVictimsQueue(ssn, nil)
+		},
+		framework.Reclaim,
+		nil,
+	)
+
+	_, _, _, result := solver.SolveWithResult(ssn, pendingJob)
+
+	require.Equal(t, SearchResultGeneratorsExhausted, result.Reason())
+	require.Equal(t, before+1, scenarioSearchCounterValue(t, "scenario_search_jobs_total", labels))
+}
+
+func TestSolveWithResultRecordsUnsolvedScenarioDurationAfterSimulation(t *testing.T) {
+	generatorName := "test-unsolved-duration"
+	labels := map[string]string{
+		"action":    "reclaim",
+		"generator": generatorName,
+		"result":    scenarioSearchResultUnsolved,
+	}
+	before := scenarioSearchHistogramCount(t, "scenario_search_duration_seconds", labels)
+	ssn, pendingJob := newJobSolverResultTestSession(t, 1)
+	ssn.ClusterInfo.Nodes = map[string]*node_info.NodeInfo{"node-1": {}}
+	scenarioToSolve := scenario.NewByNodeScenario(
+		ssn, pendingJob,
+		podgroup_info.GetTasksToAllocate(pendingJob, ssn.SubGroupOrderFn, ssn.TaskOrderFn, false),
+		nil, nil,
+	)
+	ssn.AddScenarioGenerator(generatorName, portfolioTestFactory(&portfolioTestGenerator{
+		name:      generatorName,
+		scenarios: []api.ScenarioInfo{scenarioToSolve},
+	}))
+	solver := NewJobsSolver(
+		nil,
+		nil,
+		func() *utils.JobsOrderByQueues {
+			return utils.GetVictimsQueue(ssn, nil)
+		},
+		framework.Reclaim,
+		nil,
+	)
+
+	solver.SolveWithResult(ssn, pendingJob)
+
+	require.Equal(t, before+1, scenarioSearchHistogramCount(t, "scenario_search_duration_seconds", labels))
+}
+
 func TestSolveWithResultRunsCompletePartialSearchForOneGeneratorBeforeNext(t *testing.T) {
 	ssn := newGeneratorTestSession(t, map[string]int{
 		"node-1": 1,
@@ -268,3 +345,54 @@ func newJobSolverResultTestSession(t *testing.T, tasksCount int) (*framework.Ses
 		},
 	}, pendingJob
 }
+
+// scenarioSearchCounterValue returns the current value of the counter sample
+// named metricName whose labels match labels exactly. It returns 0 when no
+// such sample is found or the sample carries no counter.
+func scenarioSearchCounterValue(t *testing.T, metricName string, labels map[string]string) float64 {
+	t.Helper()
+
+	if sample := scenarioSearchMetric(t, metricName, labels); sample != nil {
+		if counter := sample.GetCounter(); counter != nil {
+			return counter.GetValue()
+		}
+	}
+	return 0
+}
+
+// scenarioSearchHistogramCount returns the sample count of the histogram
+// named metricName whose labels match labels exactly. It returns 0 when no
+// such sample is found or the sample carries no histogram.
+func scenarioSearchHistogramCount(t *testing.T, metricName string, labels map[string]string) uint64 {
+	t.Helper()
+
+	if sample := scenarioSearchMetric(t, metricName, labels); sample != nil {
+		if histogram := sample.GetHistogram(); histogram != nil {
+			return histogram.GetSampleCount()
+		}
+	}
+	return 0
+}
+
+// scenarioSearchMetric gathers from the default Prometheus gatherer and
+// returns the first sample in the family named metricName whose label set
+// equals labels exactly, or nil when there is none. A gather error fails the
+// test immediately.
+func scenarioSearchMetric(t *testing.T, metricName string, labels map[string]string) *dto.Metric {
+	t.Helper()
+
+	gathered, err := prometheus.DefaultGatherer.Gather()
+	require.NoError(t, err)
+
+	for _, family := range gathered {
+		if family.GetName() == metricName {
+			for _, sample := range family.GetMetric() {
+				if scenarioSearchMetricHasLabels(sample, labels) {
+					return sample
+				}
+			}
+		}
+	}
+	return nil
+}
+
+// scenarioSearchMetricHasLabels reports whether metric carries exactly the
+// label pairs in labels: the counts must be equal and every label on the
+// metric must appear in labels with the same value.
+func scenarioSearchMetricHasLabels(metric *dto.Metric, labels map[string]string) bool {
+	pairs := metric.GetLabel()
+	if len(pairs) != len(labels) {
+		return false
+	}
+	for _, pair := range pairs {
+		if want, ok := labels[pair.GetName()]; !ok || want != pair.GetValue() {
+			return false
+		}
+	}
+	return true
+}