diff --git a/internal/constants/metrics.go b/internal/constants/metrics.go index 60a32e703..d7b1ca88c 100644 --- a/internal/constants/metrics.go +++ b/internal/constants/metrics.go @@ -108,6 +108,13 @@ const ( // WVADesiredRatio is a gauge that tracks the ratio of desired to current replicas. // Labels: variant_name, namespace, accelerator_type WVADesiredRatio = "wva_desired_ratio" + + // WVAOptimizationDurationSeconds is a histogram that tracks the duration of each optimization cycle. + // Labels: status (success, error) + WVAOptimizationDurationSeconds = "wva_optimization_duration_seconds" + + // WVAModelsProcessedTotal is a gauge that tracks the number of models processed in the last optimization cycle. + WVAModelsProcessedTotal = "wva_models_processed_total" ) // Metric Label Names @@ -120,4 +127,5 @@ const ( LabelReason = "reason" LabelAcceleratorType = "accelerator_type" LabelControllerInstance = "controller_instance" + LabelStatus = "status" ) diff --git a/internal/engines/saturation/engine.go b/internal/engines/saturation/engine.go index 7e60a9d6d..042d35c23 100644 --- a/internal/engines/saturation/engine.go +++ b/internal/engines/saturation/engine.go @@ -43,6 +43,7 @@ import ( "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/engines/pipeline" "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/interfaces" "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/logging" + "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/metrics" "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/saturation" "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/utils" "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/utils/scaletarget" @@ -180,7 +181,20 @@ func (e *Engine) StartOptimizeLoop(ctx context.Context) { } // optimize performs the optimization logic. 
-func (e *Engine) optimize(ctx context.Context) error { +func (e *Engine) optimize(ctx context.Context) (retErr error) { + start := time.Now() + var modelsProcessed int + defer func() { + status := "success" + if retErr != nil { + status = "error" + } + metrics.ObserveOptimizationDuration(time.Since(start).Seconds(), status) + if retErr == nil { + metrics.SetModelsProcessed(modelsProcessed) + } + }() + logger := ctrl.LoggerFrom(ctx) // Get optimization interval from Config (already a time.Duration) @@ -225,6 +239,7 @@ func (e *Engine) optimize(ctx context.Context) error { // Group VAs by model for per-model capacity analysis modelGroups := utils.GroupVariantAutoscalingByModel(activeVAs) + modelsProcessed = len(modelGroups) logger.Info("Grouped VAs by model", "modelCount", len(modelGroups), "totalVAs", len(activeVAs)) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 93cb2f694..db46c7f7b 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -19,6 +19,9 @@ var ( currentReplicas *prometheus.GaugeVec desiredRatio *prometheus.GaugeVec + optimizationDuration *prometheus.HistogramVec + modelsProcessedGauge prometheus.Gauge + // controllerInstance stores the optional controller instance identifier. // When set, it's added as a label to all emitted metrics. 
controllerInstance string @@ -76,6 +79,25 @@ func InitMetrics(registry prometheus.Registerer) error { baseLabels, ) + optimizationDurationLabels := []string{constants.LabelStatus} + if controllerInstance != "" { + optimizationDurationLabels = append(optimizationDurationLabels, constants.LabelControllerInstance) + } + optimizationDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: constants.WVAOptimizationDurationSeconds, + Help: "Duration of optimization loop cycles in seconds", + Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, + optimizationDurationLabels, + ) + modelsProcessedGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: constants.WVAModelsProcessedTotal, + Help: "Number of models processed in the last optimization cycle", + }, + ) + // Register metrics with the registry if err := registry.Register(replicaScalingTotal); err != nil { return fmt.Errorf("failed to register replicaScalingTotal metric: %w", err) @@ -89,6 +111,12 @@ func InitMetrics(registry prometheus.Registerer) error { if err := registry.Register(desiredRatio); err != nil { return fmt.Errorf("failed to register desiredRatio metric: %w", err) } + if err := registry.Register(optimizationDuration); err != nil { + return fmt.Errorf("failed to register optimizationDuration metric: %w", err) + } + if err := registry.Register(modelsProcessedGauge); err != nil { + return fmt.Errorf("failed to register modelsProcessedGauge metric: %w", err) + } return nil } @@ -133,6 +161,27 @@ func (m *MetricsEmitter) EmitReplicaScalingMetrics(ctx context.Context, va *llmd return nil } +// ObserveOptimizationDuration records the duration of an optimization cycle with the given status. +// Status should be one of: "success", "error". 
+func ObserveOptimizationDuration(durationSeconds float64, status string) { + if optimizationDuration == nil { + return + } + labels := prometheus.Labels{constants.LabelStatus: status} + if controllerInstance != "" { + labels[constants.LabelControllerInstance] = controllerInstance + } + optimizationDuration.With(labels).Observe(durationSeconds) +} + +// SetModelsProcessed sets the gauge to the number of models processed in the last optimization cycle. +func SetModelsProcessed(count int) { + if modelsProcessedGauge == nil { + return + } + modelsProcessedGauge.Set(float64(count)) +} + // EmitReplicaMetrics emits current and desired replica metrics func (m *MetricsEmitter) EmitReplicaMetrics(ctx context.Context, va *llmdOptv1alpha1.VariantAutoscaling, current, desired int32, acceleratorType string) error { baseLabels := prometheus.Labels{ diff --git a/internal/metrics/optimization_metrics_test.go b/internal/metrics/optimization_metrics_test.go new file mode 100644 index 000000000..efd4908c6 --- /dev/null +++ b/internal/metrics/optimization_metrics_test.go @@ -0,0 +1,144 @@ +/* +Copyright 2025 The llm-d Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "testing" + + "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/constants" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" +) + +func TestObserveOptimizationDuration(t *testing.T) { + registry := prometheus.NewRegistry() + if err := InitMetrics(registry); err != nil { + t.Fatalf("InitMetrics failed: %v", err) + } + + // Observe a successful optimization + ObserveOptimizationDuration(0.15, "success") + + // Observe a failed optimization + ObserveOptimizationDuration(2.5, "error") + + // Verify the histogram was recorded + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + var found bool + for _, mf := range metrics { + if mf.GetName() == constants.WVAOptimizationDurationSeconds { + found = true + // Should have 2 metrics (one per status label) + if len(mf.GetMetric()) != 2 { + t.Errorf("Expected 2 metric series, got %d", len(mf.GetMetric())) + } + for _, m := range mf.GetMetric() { + h := m.GetHistogram() + if h == nil { + t.Error("Expected histogram metric") + continue + } + if h.GetSampleCount() != 1 { + t.Errorf("Expected 1 sample per status, got %d", h.GetSampleCount()) + } + // Check status label + status := getLabelValue(m, constants.LabelStatus) + switch status { + case "success": + if h.GetSampleSum() < 0.1 || h.GetSampleSum() > 0.2 { + t.Errorf("Expected success duration ~0.15, got %f", h.GetSampleSum()) + } + case "error": + if h.GetSampleSum() < 2.0 || h.GetSampleSum() > 3.0 { + t.Errorf("Expected error duration ~2.5, got %f", h.GetSampleSum()) + } + default: + t.Errorf("Unexpected status label: %s", status) + } + } + } + } + if !found { + t.Errorf("Metric %s not found in gathered metrics", constants.WVAOptimizationDurationSeconds) + } +} + +func TestSetModelsProcessed(t *testing.T) { + registry := prometheus.NewRegistry() + if err := InitMetrics(registry); err != nil { + t.Fatalf("InitMetrics failed: 
%v", err) + } + + // Set models processed (gauge should reflect the last value, not a sum) + SetModelsProcessed(3) + SetModelsProcessed(5) + + // Verify the gauge + metrics, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + var found bool + for _, mf := range metrics { + if mf.GetName() == constants.WVAModelsProcessedTotal { + found = true + if len(mf.GetMetric()) != 1 { + t.Errorf("Expected 1 metric series, got %d", len(mf.GetMetric())) + } + g := mf.GetMetric()[0].GetGauge() + if g == nil { + t.Error("Expected gauge metric") + } else if g.GetValue() != 5 { + t.Errorf("Expected gauge value 5 (last set), got %f", g.GetValue()) + } + } + } + if !found { + t.Errorf("Metric %s not found in gathered metrics", constants.WVAModelsProcessedTotal) + } +} + +func TestOptimizationMetrics_NilSafety(t *testing.T) { + // Reset the package-level vars to nil to simulate uninitialized state + savedDuration := optimizationDuration + savedGauge := modelsProcessedGauge + optimizationDuration = nil + modelsProcessedGauge = nil + defer func() { + optimizationDuration = savedDuration + modelsProcessedGauge = savedGauge + }() + + // Should not panic when metrics are not initialized + ObserveOptimizationDuration(1.0, "success") + SetModelsProcessed(5) +} + +// getLabelValue returns the value of a label by name from a metric. +func getLabelValue(m *dto.Metric, name string) string { + for _, l := range m.GetLabel() { + if l.GetName() == name { + return l.GetValue() + } + } + return "" +}