Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions internal/constants/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ const (
// WVADesiredRatio is a gauge that tracks the ratio of desired to current replicas.
// Labels: variant_name, namespace, accelerator_type
WVADesiredRatio = "wva_desired_ratio"

// WVAOptimizationDurationSeconds is a histogram that tracks the duration of each optimization cycle.
// Labels: status (success, error, partial)
WVAOptimizationDurationSeconds = "wva_optimization_duration_seconds"

// WVAModelsProcessedTotal is a counter that tracks the total number of models processed across optimization cycles.
WVAModelsProcessedTotal = "wva_models_processed_total"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A monotonic counter of "total models processed" requires rate() to be useful, which gives models/sec — not a very meaningful signal for this use case. A gauge ("models in last cycle") would be more directly dashboardable and useful for alerting (e.g., "model count dropped to 0"). Worth considering whether this counter will actually drive alerts or dashboards as-is.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense — changed to a gauge. SetModelsProcessed(n) now reflects the last cycle directly, no rate() needed.

)

// Metric Label Names
Expand All @@ -120,4 +127,5 @@ const (
LabelReason = "reason"
LabelAcceleratorType = "accelerator_type"
LabelControllerInstance = "controller_instance"
LabelStatus = "status"
)
25 changes: 25 additions & 0 deletions internal/engines/saturation/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/engines/pipeline"
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/interfaces"
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/logging"
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/metrics"
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/saturation"
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/utils"
"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/utils/scaletarget"
Expand Down Expand Up @@ -102,6 +103,9 @@ type Engine struct {
// AnalyzerResults. Selected per-cycle based on enableLimiter config:
// CostAwareOptimizer (unlimited) or GreedyByScoreOptimizer (limited).
optimizer pipeline.ScalingOptimizer

// metricsEmitter emits optimization loop performance metrics (duration, models processed).
metricsEmitter *metrics.MetricsEmitter
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Engine struct didn't previously have a metricsEmitter — the existing MetricsEmitter is used downstream in applySaturationDecisions. Adding a second instance creates two separate entry points for metric emission. Since MetricsEmitter is a stateless empty struct backed by package-level vars, this works but is a bit confusing. Consider either:

  • Package-level helper functions (no emitter instance needed), or
  • Receiving the emitter as a NewEngine parameter so there's a single instance

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switched to the package-level functions metrics.ObserveOptimizationDuration() and metrics.SetModelsProcessed().

}

// NewEngine creates a new instance of the saturation engine.
Expand Down Expand Up @@ -144,6 +148,7 @@ func NewEngine(client client.Client, scheme *runtime.Scheme, recorder record.Eve
queueingModelAnalyzer: queueingmodel.NewQueueingModelAnalyzer(),
capacityStore: capacityStore,
optimizer: scalingOptimizer,
metricsEmitter: metrics.NewMetricsEmitter(),
}

engine.executor = executor.NewPollingExecutor(executor.PollingConfig{
Expand Down Expand Up @@ -181,6 +186,21 @@ func (e *Engine) StartOptimizeLoop(ctx context.Context) {

// optimize performs the optimization logic.
func (e *Engine) optimize(ctx context.Context) error {
start := time.Now()
var optimizeErr error
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The manual optimizeErr = err pattern before each return err (lines 226, 241, 329) is fragile — any new return path added later will silently miss the metric. The idiomatic Go approach uses named return values which automatically captures all paths:

func (e *Engine) optimize(ctx context.Context) (retErr error) {
    start := time.Now()
    var modelsProcessed int
    defer func() {
        status := "success"
        if retErr != nil {
            status = "error"
        }
        e.metricsEmitter.ObserveOptimizationDuration(time.Since(start).Seconds(), status)
        if modelsProcessed > 0 {
            e.metricsEmitter.IncrModelsProcessed(modelsProcessed)
        }
    }()
    // ... existing code unchanged, every "return err" is captured via retErr
}

This eliminates all three `optimizeErr = err` assignments.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call — switched to named return (retErr error) and removed all the manual optimizeErr = err assignments

var modelsProcessed int
defer func() {
duration := time.Since(start).Seconds()
status := "success"
if optimizeErr != nil {
status = "error"
}
e.metricsEmitter.ObserveOptimizationDuration(duration, status)
if modelsProcessed > 0 {
e.metricsEmitter.IncrModelsProcessed(modelsProcessed)
}
}()

logger := ctrl.LoggerFrom(ctx)

// Get optimization interval from Config (already a time.Duration)
Expand All @@ -203,6 +223,7 @@ func (e *Engine) optimize(ctx context.Context) error {
activeVAs, _, err := utils.ActiveVariantAutoscaling(ctx, e.client)
if err != nil {
logger.Error(err, "Unable to get active variant autoscalings")
optimizeErr = err
return err
}

Expand All @@ -217,6 +238,7 @@ func (e *Engine) optimize(ctx context.Context) error {
if err != nil {
logger.Error(err, "Failed to collect cluster inventory")
// do not proceed to optimization if inventory collection fails in limited mode
optimizeErr = err
return err
}
// always print inventory until optimizer consumes it
Expand Down Expand Up @@ -304,9 +326,12 @@ func (e *Engine) optimize(ctx context.Context) error {
}
if err := e.applySaturationDecisions(ctx, allDecisions, vaMap, currentAllocations); err != nil {
logger.Error(err, "Failed to apply saturation decisions")
optimizeErr = err
return err
}

modelsProcessed = len(modelGroups)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

modelsProcessed is set after applySaturationDecisions succeeds, but modelGroups is computed much earlier (line 228). This means a failure in applySaturationDecisions reports modelsProcessed=0 even though models were analyzed — only the apply step failed. Consider moving this assignment to right after modelGroups is computed, so the metric accurately reflects how many models entered the optimization pipeline regardless of apply outcome.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved the assignment to right after modelGroups is computed


logger.Info("Optimization completed successfully",
"mode", "saturation-only",
"modelsProcessed", len(modelGroups),
Expand Down
41 changes: 41 additions & 0 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ var (
currentReplicas *prometheus.GaugeVec
desiredRatio *prometheus.GaugeVec

optimizationDuration *prometheus.HistogramVec
modelsProcessedTotal prometheus.Counter

// controllerInstance stores the optional controller instance identifier.
// When set, it's added as a label to all emitted metrics.
controllerInstance string
Expand Down Expand Up @@ -76,6 +79,21 @@ func InitMetrics(registry prometheus.Registerer) error {
baseLabels,
)

optimizationDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: constants.WVAOptimizationDurationSeconds,
Help: "Duration of optimization loop cycles in seconds",
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
},
[]string{constants.LabelStatus},
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All existing WVA metrics conditionally add a controller_instance label when the CONTROLLER_INSTANCE env var is set (see InitMetrics lines 45-48 where baseLabels and scalingLabels get the extra label appended). The new optimizationDuration histogram only has LabelStatus and modelsProcessedTotal has no labels at all. In HA deployments with multiple controller instances, these metrics will collide.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — I missed this pattern. Added controller_instance conditionally to the histogram labels, same as the existing metrics. The gauge (modelsProcessedGauge) is a singleton with no labels, so it won't collide in HA — it reflects the local controller's last cycle.

)
modelsProcessedTotal = prometheus.NewCounter(
prometheus.CounterOpts{
Name: constants.WVAModelsProcessedTotal,
Help: "Total number of models processed across optimization cycles",
},
)

// Register metrics with the registry
if err := registry.Register(replicaScalingTotal); err != nil {
return fmt.Errorf("failed to register replicaScalingTotal metric: %w", err)
Expand All @@ -89,6 +107,12 @@ func InitMetrics(registry prometheus.Registerer) error {
if err := registry.Register(desiredRatio); err != nil {
return fmt.Errorf("failed to register desiredRatio metric: %w", err)
}
if err := registry.Register(optimizationDuration); err != nil {
return fmt.Errorf("failed to register optimizationDuration metric: %w", err)
}
if err := registry.Register(modelsProcessedTotal); err != nil {
return fmt.Errorf("failed to register modelsProcessedTotal metric: %w", err)
}

return nil
}
Expand Down Expand Up @@ -133,6 +157,23 @@ func (m *MetricsEmitter) EmitReplicaScalingMetrics(ctx context.Context, va *llmd
return nil
}

// ObserveOptimizationDuration records the duration of an optimization cycle with the given status.
// Status should be one of: "success", "error", "partial".
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment documents partial as a valid status, but the engine code only ever emits success or error. Either remove partial from the comment until it's implemented, or add a note that it's reserved for future use — otherwise someone might grep for "partial" and be confused when it's never emitted.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed partial from the comment. Can be added later when a partial completion path actually exists

// ObserveOptimizationDuration records the duration of an optimization cycle
// with the given status. It is a no-op if InitMetrics has not been called yet.
func (m *MetricsEmitter) ObserveOptimizationDuration(durationSeconds float64, status string) {
	hist := optimizationDuration
	if hist == nil {
		// Metrics were never initialized; skip silently so callers stay safe.
		return
	}
	labels := prometheus.Labels{constants.LabelStatus: status}
	hist.With(labels).Observe(durationSeconds)
}

// IncrModelsProcessed increments the models-processed counter by the given count.
// It is a no-op if InitMetrics has not been called yet.
func (m *MetricsEmitter) IncrModelsProcessed(count int) {
	counter := modelsProcessedTotal
	if counter == nil {
		// Metrics were never initialized; skip silently so callers stay safe.
		return
	}
	counter.Add(float64(count))
}

// EmitReplicaMetrics emits current and desired replica metrics
func (m *MetricsEmitter) EmitReplicaMetrics(ctx context.Context, va *llmdOptv1alpha1.VariantAutoscaling, current, desired int32, acceleratorType string) error {
baseLabels := prometheus.Labels{
Expand Down
148 changes: 148 additions & 0 deletions internal/metrics/optimization_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
Copyright 2025 The llm-d Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"testing"

"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/constants"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
)

// TestObserveOptimizationDuration verifies that the optimization-duration
// histogram records one sample per status label with the observed value.
func TestObserveOptimizationDuration(t *testing.T) {
	registry := prometheus.NewRegistry()
	if err := InitMetrics(registry); err != nil {
		t.Fatalf("InitMetrics failed: %v", err)
	}
	emitter := NewMetricsEmitter()

	// Record one fast successful cycle and one slow failed cycle.
	emitter.ObserveOptimizationDuration(0.15, "success")
	emitter.ObserveOptimizationDuration(2.5, "error")

	families, err := registry.Gather()
	if err != nil {
		t.Fatalf("Failed to gather metrics: %v", err)
	}

	found := false
	for _, family := range families {
		if family.GetName() != constants.WVAOptimizationDurationSeconds {
			continue
		}
		found = true
		// One series is expected for each status label observed above.
		if n := len(family.GetMetric()); n != 2 {
			t.Errorf("Expected 2 metric series, got %d", n)
		}
		for _, series := range family.GetMetric() {
			hist := series.GetHistogram()
			if hist == nil {
				t.Error("Expected histogram metric")
				continue
			}
			if hist.GetSampleCount() != 1 {
				t.Errorf("Expected 1 sample per status, got %d", hist.GetSampleCount())
			}
			// The sample sum identifies which observation landed in this series.
			sum := hist.GetSampleSum()
			status := getLabelValue(series, constants.LabelStatus)
			switch status {
			case "success":
				if sum < 0.1 || sum > 0.2 {
					t.Errorf("Expected success duration ~0.15, got %f", sum)
				}
			case "error":
				if sum < 2.0 || sum > 3.0 {
					t.Errorf("Expected error duration ~2.5, got %f", sum)
				}
			default:
				t.Errorf("Unexpected status label: %s", status)
			}
		}
	}
	if !found {
		t.Errorf("Metric %s not found in gathered metrics", constants.WVAOptimizationDurationSeconds)
	}
}

// TestIncrModelsProcessed verifies that repeated increments accumulate into a
// single counter series with the summed value.
func TestIncrModelsProcessed(t *testing.T) {
	registry := prometheus.NewRegistry()
	if err := InitMetrics(registry); err != nil {
		t.Fatalf("InitMetrics failed: %v", err)
	}
	emitter := NewMetricsEmitter()

	// Two increments from separate cycles should add up in the counter.
	emitter.IncrModelsProcessed(3)
	emitter.IncrModelsProcessed(5)

	families, err := registry.Gather()
	if err != nil {
		t.Fatalf("Failed to gather metrics: %v", err)
	}

	found := false
	for _, family := range families {
		if family.GetName() != constants.WVAModelsProcessedTotal {
			continue
		}
		found = true
		if n := len(family.GetMetric()); n != 1 {
			t.Errorf("Expected 1 metric series, got %d", n)
		}
		counter := family.GetMetric()[0].GetCounter()
		switch {
		case counter == nil:
			t.Error("Expected counter metric")
		case counter.GetValue() != 8:
			t.Errorf("Expected counter value 8 (3+5), got %f", counter.GetValue())
		}
	}
	if !found {
		t.Errorf("Metric %s not found in gathered metrics", constants.WVAModelsProcessedTotal)
	}
}

// TestObserveOptimizationDuration_NilSafety ensures the emitter methods are
// safe to call before InitMetrics has run.
func TestObserveOptimizationDuration_NilSafety(t *testing.T) {
	// Stash the package-level collectors and blank them out to mimic an
	// uninitialized metrics package; restore them when the test finishes.
	origHistogram, origCounter := optimizationDuration, modelsProcessedTotal
	optimizationDuration, modelsProcessedTotal = nil, nil
	defer func() {
		optimizationDuration, modelsProcessedTotal = origHistogram, origCounter
	}()

	emitter := NewMetricsEmitter()

	// Neither call should panic while the collectors are nil.
	emitter.ObserveOptimizationDuration(1.0, "success")
	emitter.IncrModelsProcessed(5)
}

// getLabelValue returns the value of a label by name from a metric.
// It returns the empty string when no label with that name is present.
func getLabelValue(m *dto.Metric, name string) string {
	for _, labelPair := range m.GetLabel() {
		if labelPair.GetName() != name {
			continue
		}
		return labelPair.GetValue()
	}
	return ""
}
Loading