Commit c9f1d75

agent: Add per-VM metric for desired CU(s)
This commit adds a new per-VM metric: autoscaling_vm_desired_cu. It's based on the same "desired CU" information exposed by the scaling event reporting, but updated continuously, instead of being rate-limited the way event reporting is to avoid spamming our reporting.

The metric has the same base labels as the other per-VM metrics, with the addition of the "reason" label, which is one of:

* "total" - the goal CU, after taking the maximum of the individual parts and rounding up to the next unit.
* "cpu" - goal CU size in order to fit the current CPU usage.
* "mem" - goal CU size in order to fit the current memory usage, which includes some assessment.
* "lfc" - goal CU size in order to fit the estimated working set size.

All of these values are also multiplied by the same Compute Unit factor as with the normal scaling event reporting, so that Neon's fractional compute units are exposed as such in the metrics, even as we use integer compute units in the autoscaler-agent. Also note that all values except "total" are NOT rounded; they show the fractional amounts to allow better comparison (see the worked example below).

KNOWN LIMITATION: If ReportDesiredScaling is disabled at runtime for a particular VM, the metrics will not be cleared, and will instead just cease to be updated. I figured this is a reasonable trade-off for simplicity.
1 parent: 693b601 · commit: c9f1d75
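A worked example of the Compute Unit factor described above. The concrete numbers here are invented; the snippet assumes the cuMultiplier of 0.25 set in the config change below and mirrors the multiplication the agent performs before exposing values:

package main

import "fmt"

func main() {
	const cuMultiplier = 0.25 // Neon CUs per internal integer CU

	totalCU := uint32(3) // "total": max of the parts, rounded up to a whole CU
	cpuGoal := 2.4       // "cpu": fractional, deliberately NOT rounded

	fmt.Println(float64(totalCU) * cuMultiplier) // 0.75 -> value of reason="total"
	fmt.Println(cpuGoal * cuMultiplier)          // 0.6  -> value of reason="cpu"
}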

File tree: 7 files changed (+134, −13 lines)

autoscaler-agent/config_map.yaml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ data:
       "scalingEvents": {
         "cuMultiplier": 0.25,
         "clients": {}
-      }
+      },
       "monitor": {
         "serverPort": 10301,
         "responseTimeoutSeconds": 5,

pkg/agent/core/state.go

Lines changed: 10 additions & 3 deletions
@@ -735,7 +735,14 @@ func (s *state) desiredResourcesFromMetricsOrRequestedUpscaling(now time.Time) (
     // 3. that's it!

     reportGoals := func(goalCU uint32, parts scalingevents.GoalCUComponents) {
-        panic("todo")
+        currentCU, ok := s.VM.Using().DivResources(s.Config.ComputeUnit)
+        if !ok {
+            return // skip reporting if the current CU is not right.
+        }
+
+        if report := s.Config.ObservabilityCallbacks.DesiredScaling; report != nil {
+            report(now, uint32(currentCU), goalCU, parts)
+        }
     }

     sg, goalCULogFields := calculateGoalCU(
@@ -1232,12 +1239,12 @@ func (s *State) NeonVM() NeonVMHandle {
 }

 func (h NeonVMHandle) StartingRequest(now time.Time, resources api.Resources) {
-    if f := h.s.Config.ObservabilityCallbacks.ScalingEvent; f != nil {
+    if report := h.s.Config.ObservabilityCallbacks.ScalingEvent; report != nil {
         currentCU, currentOk := h.s.VM.Using().DivResources(h.s.Config.ComputeUnit)
         targetCU, targetOk := resources.DivResources(h.s.Config.ComputeUnit)

         if currentOk && targetOk {
-            f(now, uint32(currentCU), uint32(targetCU))
+            report(now, uint32(currentCU), uint32(targetCU))
         }
     }
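For context, a minimal sketch of the callback shape the new reportGoals closure assumes, inferred only from how the hunk above invokes it. The type name ReportDesiredScalingFunc and the package header are invented for illustration; the real field is Config.ObservabilityCallbacks.DesiredScaling:

package core

import (
	"time"

	"github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
)

// Assumed shape only, matching the call report(now, uint32(currentCU), goalCU, parts).
type ReportDesiredScalingFunc func(
	now time.Time, // timestamp of this scaling evaluation
	currentCU uint32, // what the VM is using now, in whole internal CUs
	goalCU uint32, // desired total, rounded up to the next whole CU
	parts scalingevents.GoalCUComponents, // fractional cpu/mem/lfc goals
)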
pkg/agent/entrypoint.go

Lines changed: 7 additions & 1 deletion
@@ -58,7 +58,13 @@ func (r MainRunner) Run(logger *zap.Logger, ctx context.Context) error {
         return fmt.Errorf("Error creating scaling events reporter: %w", err)
     }

-    globalState, globalPromReg := r.newAgentState(logger, r.EnvArgs.K8sPodIP, schedTracker, scalingReporter)
+    globalState, globalPromReg := r.newAgentState(
+        logger,
+        r.EnvArgs.K8sPodIP,
+        schedTracker,
+        perVMMetrics,
+        scalingReporter,
+    )
     watchMetrics.MustRegister(globalPromReg)

     logger.Info("Starting billing metrics collector")

pkg/agent/globalstate.go

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,7 @@ type agentState struct {
     vmClient     *vmclient.Clientset
     schedTracker *schedwatch.SchedulerTracker
     metrics      GlobalMetrics
+    vmMetrics    *PerVMMetrics

     scalingReporter *scalingevents.Reporter
 }
@@ -49,6 +50,7 @@ func (r MainRunner) newAgentState(
     baseLogger *zap.Logger,
     podIP string,
     schedTracker *schedwatch.SchedulerTracker,
+    perVMMetrics *PerVMMetrics,
     scalingReporter *scalingevents.Reporter,
 ) (*agentState, *prometheus.Registry) {
     metrics, promReg := makeGlobalMetrics()
@@ -63,6 +65,7 @@ func (r MainRunner) newAgentState(
         podIP:        podIP,
         schedTracker: schedTracker,
         metrics:      metrics,
+        vmMetrics:    perVMMetrics,

         scalingReporter: scalingReporter,
     }

pkg/agent/prommetrics.go

Lines changed: 77 additions & 2 deletions
@@ -1,13 +1,16 @@
 package agent

 import (
+    "sync"
     "time"

     "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/client_golang/prometheus/collectors"
+    "github.com/samber/lo"

     vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
     "github.com/neondatabase/autoscaling/pkg/agent/core/revsource"
+    "github.com/neondatabase/autoscaling/pkg/agent/scalingevents"
     "github.com/neondatabase/autoscaling/pkg/util"
 )

@@ -342,9 +345,22 @@ func WrapHistogramVec(hist *prometheus.HistogramVec) revsource.ObserveCallback {
 }

 type PerVMMetrics struct {
+    // activeMu and activeVMs exist to track the set of VMs currently represented in the metrics, so
+    // that when we set the desired CU from internal information, we can check whether the VM still
+    // exists.
+    // Otherwise it's not possible to prevent data races that would result in leaking metric labels.
+    activeMu  sync.Mutex
+    activeVMs map[util.NamespacedName]vmMetadata
+
     cpu          *prometheus.GaugeVec
     memory       *prometheus.GaugeVec
     restartCount *prometheus.GaugeVec
+    desiredCU    *prometheus.GaugeVec
+}
+
+type vmMetadata struct {
+    endpointID string
+    projectID  string
 }

 type vmResourceValueType string
@@ -358,10 +374,13 @@ const (
     vmResourceValueAutoscalingMax vmResourceValueType = "autoscaling_max"
 )

-func makePerVMMetrics() (PerVMMetrics, *prometheus.Registry) {
+func makePerVMMetrics() (*PerVMMetrics, *prometheus.Registry) {
     reg := prometheus.NewRegistry()

-    metrics := PerVMMetrics{
+    metrics := &PerVMMetrics{
+        activeMu:  sync.Mutex{},
+        activeVMs: make(map[util.NamespacedName]vmMetadata),
+
         cpu: util.RegisterMetric(reg, prometheus.NewGaugeVec(
             prometheus.GaugeOpts{
                 Name: "autoscaling_vm_cpu_cores",
@@ -400,6 +419,19 @@ func makePerVMMetrics() (PerVMMetrics, *prometheus.Registry) {
                 "project_id", // .metadata.labels["neon/project-id"]
             },
         )),
+        desiredCU: util.RegisterMetric(reg, prometheus.NewGaugeVec(
+            prometheus.GaugeOpts{
+                Name: "autoscaling_vm_desired_cu",
+                Help: "Amount of Compute Units desired for a VM: the total, and the components for cpu, memory, and LFC",
+            },
+            []string{
+                "vm_namespace", // .metadata.namespace
+                "vm_name",      // .metadata.name
+                "endpoint_id",  // .metadata.labels["neon/endpoint-id"]
+                "project_id",   // .metadata.labels["neon/project-id"]
+                "reason",       // desiredCUReason: total, cpu, mem, lfc
+            },
+        )),
     }

     return metrics, reg
@@ -424,3 +456,46 @@ type vmMetric struct {
     labels prometheus.Labels
     value  float64
 }
+
+func (m *PerVMMetrics) updateDesiredCU(
+    vm util.NamespacedName,
+    conversionFactor float64,
+    total uint32,
+    parts scalingevents.GoalCUComponents,
+) {
+    m.activeMu.Lock()
+    defer m.activeMu.Unlock()
+
+    // Don't do anything if this VM is not known. Either the relevant watch event hasn't been
+    // processed yet (unlikely, maybe impossible?) or it has since been deleted (in which case we
+    // don't want to leak metrics that won't get cleaned up)
+    info, ok := m.activeVMs[vm]
+    if !ok {
+        return
+    }
+
+    pairs := []struct {
+        key   string
+        value *float64
+    }{
+        {"total", lo.ToPtr(float64(total))},
+        {"cpu", parts.CPU},
+        {"mem", parts.Mem},
+        {"lfc", parts.LFC},
+    }
+
+    for _, p := range pairs {
+        labels := prometheus.Labels{
+            "vm_namespace": vm.Namespace,
+            "vm_name":      vm.Name,
+            "endpoint_id":  info.endpointID,
+            "project_id":   info.projectID,
+            "reason":       p.key,
+        }
+        if p.value == nil {
+            m.desiredCU.Delete(labels)
+        } else {
+            m.desiredCU.With(labels).Set(*p.value * conversionFactor /* multiply to allow fractional CU in metrics */)
+        }
+    }
+}
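Note that the GoalCUComponents fields are *float64: a nil component means that part produced no goal, and updateDesiredCU deletes the corresponding series instead of leaving a stale value behind. A hypothetical caller, assuming the VM was already registered via setVMMetrics (the VM name and all concrete values are invented for illustration):

m.updateDesiredCU(
	util.NamespacedName{Namespace: "default", Name: "example-vm"},
	0.25, // conversionFactor, i.e. the scalingEvents.cuMultiplier from the config
	3,    // total: exposed as 3 * 0.25 = 0.75
	scalingevents.GoalCUComponents{
		CPU: lo.ToPtr(2.4), // exposed as 0.6
		Mem: lo.ToPtr(1.8), // exposed as 0.45
		LFC: nil,           // the reason="lfc" series is deleted, not set to 0
	},
)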

pkg/agent/runner.go

Lines changed: 6 additions & 2 deletions
@@ -376,8 +376,12 @@ func (r *Runner) reportDesiredScaling(
         return
     }

-    // TODO: Use this opportunity to report the desired scaling in the per-VM
-    // metrics.
+    r.global.vmMetrics.updateDesiredCU(
+        r.vmName,
+        r.global.config.ScalingEvents.CUMultiplier, // have to multiply before exposing as metrics here.
+        targetCU,
+        parts,
+    )

     rl.report(r.global.scalingReporter, timestamp, endpointID, currentCU, targetCU, parts)
 }
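Note the split this hunk creates: updateDesiredCU runs on every call, keeping the gauge continuously up to date, while rl.report still goes through the rate limiting that guards scaling event reporting. The agent's actual rate limiter isn't shown in this diff; the following is only a minimal stand-in for the pattern:

package main

import (
	"fmt"
	"time"
)

// eventRateLimiter is an illustrative stand-in, not the agent's real "rl" type.
type eventRateLimiter struct {
	last time.Time
	min  time.Duration
}

func (r *eventRateLimiter) report(now time.Time, send func()) {
	if now.Sub(r.last) < r.min {
		return // too soon: drop this event to avoid spamming event reporting
	}
	r.last = now
	send()
}

func main() {
	rl := eventRateLimiter{min: time.Minute}
	start := time.Now()

	rl.report(start, func() { fmt.Println("event sent") })               // sent
	rl.report(start.Add(time.Second), func() { fmt.Println("dropped") }) // rate-limited
	// A metric update, by contrast, would run unconditionally in both cases.
}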

pkg/agent/watch.go

Lines changed: 30 additions & 4 deletions
@@ -64,7 +64,7 @@ func startVMWatcher(
     config *Config,
     vmClient *vmclient.Clientset,
     metrics watch.Metrics,
-    perVMMetrics PerVMMetrics,
+    perVMMetrics *PerVMMetrics,
     nodeName string,
     submitEvent func(vmEvent),
 ) (*watch.Store[vmapi.VirtualMachine], error) {
@@ -91,7 +91,7 @@ func startVMWatcher(
         metav1.ListOptions{},
         watch.HandlerFuncs[*vmapi.VirtualMachine]{
             AddFunc: func(vm *vmapi.VirtualMachine, preexisting bool) {
-                setVMMetrics(&perVMMetrics, vm, nodeName)
+                setVMMetrics(perVMMetrics, vm, nodeName)

                 if vmIsOurResponsibility(vm, config, nodeName) {
                     event, err := makeVMEvent(logger, vm, vmEventAdded)
@@ -106,7 +106,7 @@ func startVMWatcher(
                 }
             },
             UpdateFunc: func(oldVM, newVM *vmapi.VirtualMachine) {
-                updateVMMetrics(&perVMMetrics, oldVM, newVM, nodeName)
+                updateVMMetrics(perVMMetrics, oldVM, newVM, nodeName)

                 oldIsOurs := vmIsOurResponsibility(oldVM, config, nodeName)
                 newIsOurs := vmIsOurResponsibility(newVM, config, nodeName)
@@ -140,7 +140,7 @@ func startVMWatcher(
                 submitEvent(event)
             },
             DeleteFunc: func(vm *vmapi.VirtualMachine, maybeStale bool) {
-                deleteVMMetrics(&perVMMetrics, vm, nodeName)
+                deleteVMMetrics(perVMMetrics, vm, nodeName)

                 if vmIsOurResponsibility(vm, config, nodeName) {
                     event, err := makeVMEvent(logger, vm, vmEventDeleted)
@@ -319,6 +319,14 @@ func setVMMetrics(perVMMetrics *PerVMMetrics, vm *vmapi.VirtualMachine, nodeName
     for _, m := range restartCountMetrics {
         perVMMetrics.restartCount.With(m.labels).Set(m.value)
     }
+
+    // Add the VM to the internal tracker:
+    perVMMetrics.activeMu.Lock()
+    defer perVMMetrics.activeMu.Unlock()
+    perVMMetrics.activeVMs[util.GetNamespacedName(vm)] = vmMetadata{
+        endpointID: vm.Labels[endpointLabel],
+        projectID:  vm.Labels[projectLabel],
+    }
 }

 func updateVMMetrics(perVMMetrics *PerVMMetrics, oldVM, newVM *vmapi.VirtualMachine, nodeName string) {
@@ -357,6 +365,14 @@ func updateVMMetrics(perVMMetrics *PerVMMetrics, oldVM, newVM *vmapi.VirtualMach
     oldRestartCountMetrics := makeVMRestartMetrics(oldVM)
     newRestartCountMetrics := makeVMRestartMetrics(newVM)
     updateMetrics(perVMMetrics.restartCount, oldRestartCountMetrics, newRestartCountMetrics)
+
+    // Update the VM in the internal tracker:
+    perVMMetrics.activeMu.Lock()
+    defer perVMMetrics.activeMu.Unlock()
+    perVMMetrics.activeVMs[util.GetNamespacedName(newVM /* name can't change */)] = vmMetadata{
+        endpointID: newVM.Labels[endpointLabel],
+        projectID:  newVM.Labels[projectLabel],
+    }
 }

 func deleteVMMetrics(perVMMetrics *PerVMMetrics, vm *vmapi.VirtualMachine, nodeName string) {
@@ -378,4 +394,14 @@ func deleteVMMetrics(perVMMetrics *PerVMMetrics, vm *vmapi.VirtualMachine, nodeN
     for _, m := range restartCountMetrics {
         perVMMetrics.restartCount.Delete(m.labels)
     }
+
+    // Remove the VM from the internal tracker:
+    perVMMetrics.activeMu.Lock()
+    defer perVMMetrics.activeMu.Unlock()
+    delete(perVMMetrics.activeVMs, util.GetNamespacedName(vm))
+    // ... and any metrics that were associated with it:
+    perVMMetrics.desiredCU.DeletePartialMatch(prometheus.Labels{
+        "vm_namespace": vm.Namespace,
+        "vm_name":      vm.Name,
+    })
 }
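DeletePartialMatch (available on prometheus/client_golang metric vectors since v1.12) removes every series whose labels contain the given subset — here, all four "reason" series for the deleted VM in one call. A self-contained demonstration, with a stand-in gauge name:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// demo_desired_cu stands in for autoscaling_vm_desired_cu, with a reduced label set.
	g := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "demo_desired_cu"},
		[]string{"vm_name", "reason"},
	)
	for _, reason := range []string{"total", "cpu", "mem", "lfc"} {
		g.With(prometheus.Labels{"vm_name": "vm-1", "reason": reason}).Set(1)
	}

	// Deletes all series matching the partial label set; returns how many were removed.
	n := g.DeletePartialMatch(prometheus.Labels{"vm_name": "vm-1"})
	fmt.Println(n) // 4
}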
