From 2b553bf8fdac43770cf2bb64dc7cc51bcef911f3 Mon Sep 17 00:00:00 2001 From: "Jason W. Ehrlich" Date: Fri, 17 Apr 2026 13:49:25 -0400 Subject: [PATCH 1/4] limatype,limayaml: add MemoryBalloon config and validation Add the MemoryBalloon configuration struct to LimaYAML with four user-facing fields: Enabled, Min, IdleTarget, and Cooldown. Add validation ensuring min < idleTarget <= memory, cooldown is a valid duration, and all size fields parse correctly. Include comprehensive test coverage for valid and invalid configurations. Add memory ballooning documentation to the VZ page. Signed-off-by: Jason W. Ehrlich --- pkg/limatype/lima_yaml.go | 20 +- pkg/limayaml/defaults_test.go | 20 ++ pkg/limayaml/validate.go | 76 +++++- pkg/limayaml/validate_test.go | 247 ++++++++++++++++++++ website/content/en/docs/config/vmtype/vz.md | 33 +++ 5 files changed, 392 insertions(+), 4 deletions(-) diff --git a/pkg/limatype/lima_yaml.go b/pkg/limatype/lima_yaml.go index afb012e8c10..61b947ab55d 100644 --- a/pkg/limatype/lima_yaml.go +++ b/pkg/limatype/lima_yaml.go @@ -121,8 +121,24 @@ type QEMUOpts struct { } type VZOpts struct { - Rosetta Rosetta `yaml:"rosetta,omitempty" json:"rosetta,omitempty"` - DiskImageFormat *image.Type `yaml:"diskImageFormat,omitempty" json:"diskImageFormat,omitempty" jsonschema:"nullable"` + Rosetta Rosetta `yaml:"rosetta,omitempty" json:"rosetta,omitempty"` + DiskImageFormat *image.Type `yaml:"diskImageFormat,omitempty" json:"diskImageFormat,omitempty" jsonschema:"nullable"` + MemoryBalloon MemoryBalloon `yaml:"memoryBalloon,omitempty" json:"memoryBalloon,omitempty"` +} + +// MemoryBalloon configures dynamic memory ballooning for the VZ backend. +// When enabled, the balloon controller automatically shrinks guest memory +// when idle and grows it under pressure, returning unused RAM to the host. +// All fields are pointers to distinguish "not specified" (nil) from explicit values. +type MemoryBalloon struct { + // Enabled enables/disables memory ballooning. + Enabled *bool `yaml:"enabled,omitempty" json:"enabled,omitempty" jsonschema:"nullable"` + // Min is the minimum guest memory size (e.g., "3GiB"). The balloon will never shrink below this. + Min *string `yaml:"min,omitempty" json:"min,omitempty" jsonschema:"nullable"` + // IdleTarget is the target memory when the VM is idle (e.g., "4GiB"). Must be > Min and <= Memory. + IdleTarget *string `yaml:"idleTarget,omitempty" json:"idleTarget,omitempty" jsonschema:"nullable"` + // Cooldown is the minimum time between balloon actions (e.g., "30s"). + Cooldown *string `yaml:"cooldown,omitempty" json:"cooldown,omitempty" jsonschema:"nullable"` } type Rosetta struct { diff --git a/pkg/limayaml/defaults_test.go b/pkg/limayaml/defaults_test.go index 65f0e68e6f8..9ef943e908d 100644 --- a/pkg/limayaml/defaults_test.go +++ b/pkg/limayaml/defaults_test.go @@ -721,6 +721,26 @@ func TestFillDefault(t *testing.T) { assert.DeepEqual(t, &y, &expect, opts...) } +func TestMemoryBalloonStruct(t *testing.T) { + // Verify that MemoryBalloon struct exists on VZOpts and has all expected fields. + balloon := limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("3GiB"), + IdleTarget: ptr.Of("4GiB"), + Cooldown: ptr.Of("30s"), + } + assert.Equal(t, *balloon.Enabled, true) + assert.Equal(t, *balloon.Min, "3GiB") + assert.Equal(t, *balloon.IdleTarget, "4GiB") + assert.Equal(t, *balloon.Cooldown, "30s") + + // Verify MemoryBalloon is a field on VZOpts. + vzOpts := limatype.VZOpts{ + MemoryBalloon: balloon, + } + assert.Equal(t, *vzOpts.MemoryBalloon.Enabled, true) +} + func TestContainerdDefault(t *testing.T) { archives := defaultContainerdArchives() assert.Assert(t, len(archives) > 0) diff --git a/pkg/limayaml/validate.go b/pkg/limayaml/validate.go index 1fb1c2495ae..d3a8c12d7e9 100644 --- a/pkg/limayaml/validate.go +++ b/pkg/limayaml/validate.go @@ -16,6 +16,7 @@ import ( "slices" "strconv" "strings" + "time" "unicode" "github.com/docker/go-units" @@ -90,11 +91,15 @@ func Validate(y *limatype.LimaYAML, warn bool) error { errs = errors.Join(errs, errors.New("field `cpus` must be set")) } - if _, err := units.RAMInBytes(*y.Memory); err != nil { + if y.Memory == nil { + errs = errors.Join(errs, errors.New("field `memory` must be set")) + } else if _, err := units.RAMInBytes(*y.Memory); err != nil { errs = errors.Join(errs, fmt.Errorf("field `memory` has an invalid value: %w", err)) } - if _, err := units.RAMInBytes(*y.Disk); err != nil { + if y.Disk == nil { + errs = errors.Join(errs, errors.New("field `disk` must be set")) + } else if _, err := units.RAMInBytes(*y.Disk); err != nil { errs = errors.Join(errs, fmt.Errorf("field `disk` has an invalid value: %w", err)) } @@ -417,6 +422,73 @@ func Validate(y *limatype.LimaYAML, warn bool) error { } } + errs = errors.Join(errs, validateMemoryBalloon(y)) + + return errs +} + +func validateMemoryBalloon(y *limatype.LimaYAML) error { + if y.VMOpts == nil { + return nil + } + var vzOpts limatype.VZOpts + if err := Convert(y.VMOpts[limatype.VZ], &vzOpts, "vmOpts.vz"); err != nil { + return nil // No VZ opts to validate. + } + balloon := vzOpts.MemoryBalloon + + // If balloon is not enabled, skip all validation. + if balloon.Enabled == nil || !*balloon.Enabled { + return nil + } + + var errs error + const field = "vmOpts.vz.memoryBalloon" + + // Rule 1: balloon requires vmType "vz". + if y.VMType != nil && *y.VMType != limatype.VZ { + errs = errors.Join(errs, fmt.Errorf("field `%s` requires vmType %q, got %q", field, limatype.VZ, *y.VMType)) + } + + // Parse min and idleTarget for comparison. + var minBytes, idleTargetBytes int64 + if balloon.Min != nil { + var err error + minBytes, err = units.RAMInBytes(*balloon.Min) + if err != nil { + errs = errors.Join(errs, fmt.Errorf("field `%s.min` must be a valid byte size: %w", field, err)) + } else if minBytes <= 0 { + errs = errors.Join(errs, fmt.Errorf("field `%s.min` must be greater than 0", field)) + } + } + if balloon.IdleTarget != nil { + var err error + idleTargetBytes, err = units.RAMInBytes(*balloon.IdleTarget) + if err != nil { + errs = errors.Join(errs, fmt.Errorf("field `%s.idleTarget` must be a valid byte size: %w", field, err)) + } + } + + // Rule 2: min < idleTarget. + if minBytes > 0 && idleTargetBytes > 0 && minBytes >= idleTargetBytes { + errs = errors.Join(errs, fmt.Errorf("field `%s.min` must be less than `idleTarget`", field)) + } + + // Rule 2b: idleTarget must not exceed VM memory. + if y.Memory != nil && idleTargetBytes > 0 { + memoryBytes, memErr := units.RAMInBytes(*y.Memory) + if memErr == nil && idleTargetBytes > memoryBytes { + errs = errors.Join(errs, fmt.Errorf("field `%s.idleTarget` must not exceed `memory` (%s)", field, *y.Memory)) + } + } + + // Rule 3: durations must be parseable. + if balloon.Cooldown != nil { + if _, err := time.ParseDuration(*balloon.Cooldown); err != nil { + errs = errors.Join(errs, fmt.Errorf("field `%s.cooldown` must be a valid duration: %w", field, err)) + } + } + return errs } diff --git a/pkg/limayaml/validate_test.go b/pkg/limayaml/validate_test.go index a755b27f065..f74771324ce 100644 --- a/pkg/limayaml/validate_test.go +++ b/pkg/limayaml/validate_test.go @@ -10,6 +10,7 @@ import ( "gotest.tools/v3/assert" "github.com/lima-vm/lima/v2/pkg/limatype" + "github.com/lima-vm/lima/v2/pkg/ptr" "github.com/lima-vm/lima/v2/pkg/version" ) @@ -429,3 +430,249 @@ func TestValidateAgainstLatestConfig(t *testing.T) { }) } } + +func TestValidate_BalloonVZOnly(t *testing.T) { + // MemoryBalloon should be rejected when vmType is not "vz". + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.QEMU) + y.Memory = ptr.Of("12GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon.Enabled = ptr.Of(true) + vzOpts.MemoryBalloon.Min = ptr.Of("3GiB") + vzOpts.MemoryBalloon.IdleTarget = ptr.Of("4GiB") + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + assert.ErrorContains(t, err, "field `vmOpts.vz.memoryBalloon` requires vmType \"vz\"") +} + +func TestValidate_BalloonThresholds(t *testing.T) { + images := `images: [{"location": "/"}]` + + tests := []struct { + name string + balloon limatype.MemoryBalloon + wantErr string + }{ + { + name: "valid balloon config", + balloon: limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("3GiB"), + IdleTarget: ptr.Of("4GiB"), + Cooldown: ptr.Of("30s"), + }, + wantErr: "", + }, + { + name: "invalid cooldown duration", + balloon: limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("3GiB"), + IdleTarget: ptr.Of("4GiB"), + Cooldown: ptr.Of("not-a-duration"), + }, + wantErr: "field `vmOpts.vz.memoryBalloon.cooldown` must be a valid duration", + }, + { + name: "min greater than idleTarget", + balloon: limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("8GiB"), + IdleTarget: ptr.Of("4GiB"), + }, + wantErr: "field `vmOpts.vz.memoryBalloon.min` must be less than `idleTarget`", + }, + { + name: "disabled balloon skips validation", + balloon: limatype.MemoryBalloon{ + Enabled: ptr.Of(false), + }, + wantErr: "", + }, + { + name: "invalid min byte size", + balloon: limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("not-a-size"), + IdleTarget: ptr.Of("4GiB"), + }, + wantErr: "field `vmOpts.vz.memoryBalloon.min` must be a valid byte size", + }, + { + name: "invalid idleTarget byte size", + balloon: limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("2GiB"), + IdleTarget: ptr.Of("xyz"), + }, + wantErr: "field `vmOpts.vz.memoryBalloon.idleTarget` must be a valid byte size", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + y, err := Load(t.Context(), []byte(images), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = ptr.Of("12GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = tt.balloon + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + if tt.wantErr == "" { + assert.NilError(t, err) + } else { + assert.ErrorContains(t, err, tt.wantErr) + } + }) + } +} + +// --- Validation edge case tests --- + +func TestValidate_BalloonWithNilMemory(t *testing.T) { + // Balloon enabled but Memory is nil — should not panic. + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = nil // No memory set. + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("2GiB"), + IdleTarget: ptr.Of("4GiB"), + } + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + // Should not panic; validation may succeed or fail but not crash. + _ = Validate(y, false) +} + +func TestValidate_BalloonMinEqualsIdleTarget(t *testing.T) { + // min == idleTarget should fail (must be less than). + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = ptr.Of("12GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("4GiB"), + IdleTarget: ptr.Of("4GiB"), + } + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + assert.ErrorContains(t, err, "must be less than") +} + +func TestValidate_BalloonEmptyDuration(t *testing.T) { + // Empty string for cooldown should fail. + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = ptr.Of("12GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("2GiB"), + IdleTarget: ptr.Of("4GiB"), + Cooldown: ptr.Of(""), + } + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + assert.ErrorContains(t, err, "cooldown") +} + +func TestValidate_NilVMOptsDoesNotPanic(t *testing.T) { + // VMOpts is nil — validation should not panic. + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.VMOpts = nil + + // Should not panic. + _ = Validate(y, false) +} + +func TestValidate_BalloonIdleTargetExceedsMemory(t *testing.T) { + // idleTarget > Memory should fail. + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = ptr.Of("8GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("2GiB"), + IdleTarget: ptr.Of("16GiB"), + } + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + assert.ErrorContains(t, err, "must not exceed") +} + +func TestValidate_BalloonIdleTargetEqualsMemory(t *testing.T) { + // idleTarget == Memory should pass. + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = ptr.Of("8GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("2GiB"), + IdleTarget: ptr.Of("8GiB"), + } + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + assert.NilError(t, err) +} + +func TestValidate_BalloonMinZero(t *testing.T) { + // min = "0" should fail (must be > 0). + y, err := Load(t.Context(), []byte(`images: [{"location": "/"}]`), "lima.yaml") + assert.NilError(t, err) + y.VMType = ptr.Of(limatype.VZ) + y.Memory = ptr.Of("12GiB") + + var vzOpts limatype.VZOpts + vzOpts.MemoryBalloon = limatype.MemoryBalloon{ + Enabled: ptr.Of(true), + Min: ptr.Of("0"), + IdleTarget: ptr.Of("4GiB"), + } + var opts any + _ = Convert(vzOpts, &opts, "") + y.VMOpts = limatype.VMOpts{limatype.VZ: opts} + + err = Validate(y, false) + assert.ErrorContains(t, err, "greater than 0") +} diff --git a/website/content/en/docs/config/vmtype/vz.md b/website/content/en/docs/config/vmtype/vz.md index 49915c9eb8c..7f91d7d6c56 100644 --- a/website/content/en/docs/config/vmtype/vz.md +++ b/website/content/en/docs/config/vmtype/vz.md @@ -27,6 +27,39 @@ base: ``` {{% /tab %}} {{< /tabpane >}} + +### Memory Ballooning + +| ⚡ Requirement | Lima >= 2.1.0, macOS >= 13.0, VZ backend only | +|-------------------|--------------------------------------------| + +Memory ballooning dynamically adjusts the guest VM's memory allocation based on actual +usage. When the guest is idle, unused memory is returned to the host. When the guest +needs more memory (detected via PSI — Pressure Stall Information), the balloon grows +automatically. + +This is configured under `vmOpts.vz.memoryBalloon`: + +```yaml +vmType: "vz" +memory: "8GiB" + +vmOpts: + vz: + memoryBalloon: + enabled: true + min: "2GiB" # Floor — balloon never shrinks below this + idleTarget: "3GiB" # Target when VM is idle + cooldown: "30s" # Minimum time between balloon actions +``` + +When `enabled` is not specified, memory ballooning defaults to disabled. When enabled +with no other fields specified, sensible defaults are derived from the configured +`memory` value (e.g., `min` defaults to 25% of `memory`, `idleTarget` to 33%). + +The balloon controller also monitors container CPU/IO activity and swap-in rates to +avoid shrinking memory during active workloads. + ### Caveats - "vz" option is only supported on macOS 13 or above - Virtualization.framework doesn't support running "intel guest on arm" and vice versa From 15686517615b3a968e05137a2de7412c0733668d Mon Sep 17 00:00:00 2001 From: "Jason W. Ehrlich" Date: Fri, 17 Apr 2026 13:50:03 -0400 Subject: [PATCH 2/4] guestagent: add memory metrics collection Add a GetMemoryMetrics gRPC endpoint to the guest agent that reports memory pressure and container activity to the host. The collector reads /proc/meminfo, /proc/pressure/memory, and /proc/vmstat to report memory availability, PSI pressure, swap rates, and page fault rates. Container CPU and IO metrics are collected from cgroupfs (systemd cgroup hierarchy) to avoid a Docker API dependency. The collector is stateful, tracking deltas between polls to compute per-second rates. Counter wraps from reboots produce zero deltas via safeDelta(). Non-Linux platforms return a stub error since /proc is unavailable. Signed-off-by: Jason W. Ehrlich --- cmd/lima-guestagent/daemon_linux.go | 4 +- pkg/guestagent/api/client/client.go | 5 + pkg/guestagent/api/guestservice.pb.desc | 26 +- pkg/guestagent/api/guestservice.pb.go | 239 ++++++++++++- pkg/guestagent/api/guestservice.proto | 36 ++ pkg/guestagent/api/guestservice_grpc.pb.go | 50 ++- pkg/guestagent/api/server/server.go | 13 +- pkg/guestagent/metrics/cgroup_linux.go | 88 +++++ pkg/guestagent/metrics/cgroup_linux_test.go | 92 +++++ pkg/guestagent/metrics/collector_linux.go | 201 +++++++++++ pkg/guestagent/metrics/metrics_linux.go | 126 +++++++ pkg/guestagent/metrics/metrics_linux_test.go | 342 +++++++++++++++++++ pkg/guestagent/metrics/metrics_other.go | 34 ++ pkg/guestagent/metrics/vmstat_linux.go | 50 +++ 14 files changed, 1277 insertions(+), 29 deletions(-) create mode 100644 pkg/guestagent/metrics/cgroup_linux.go create mode 100644 pkg/guestagent/metrics/cgroup_linux_test.go create mode 100644 pkg/guestagent/metrics/collector_linux.go create mode 100644 pkg/guestagent/metrics/metrics_linux.go create mode 100644 pkg/guestagent/metrics/metrics_linux_test.go create mode 100644 pkg/guestagent/metrics/metrics_other.go create mode 100644 pkg/guestagent/metrics/vmstat_linux.go diff --git a/cmd/lima-guestagent/daemon_linux.go b/cmd/lima-guestagent/daemon_linux.go index fd1a72515e4..0a40466774f 100644 --- a/cmd/lima-guestagent/daemon_linux.go +++ b/cmd/lima-guestagent/daemon_linux.go @@ -17,6 +17,7 @@ import ( "github.com/lima-vm/lima/v2/pkg/guestagent" "github.com/lima-vm/lima/v2/pkg/guestagent/api/server" + "github.com/lima-vm/lima/v2/pkg/guestagent/metrics" "github.com/lima-vm/lima/v2/pkg/guestagent/serialport" "github.com/lima-vm/lima/v2/pkg/guestagent/ticker" "github.com/lima-vm/lima/v2/pkg/portfwdserver" @@ -145,5 +146,6 @@ func daemonAction(cmd *cobra.Command, _ []string) error { logrus.Infof("serving the guest agent on %q", socket) } defer logrus.Debug("exiting lima-guestagent daemon") - return server.StartServer(ctx, l, &server.GuestServer{Agent: agent, TunnelS: portfwdserver.NewTunnelServer()}) + collector := metrics.NewCollector() + return server.StartServer(ctx, l, &server.GuestServer{Agent: agent, TunnelS: portfwdserver.NewTunnelServer(), Collector: collector}) } diff --git a/pkg/guestagent/api/client/client.go b/pkg/guestagent/api/client/client.go index 5e37fa54229..a44df4e3c49 100644 --- a/pkg/guestagent/api/client/client.go +++ b/pkg/guestagent/api/client/client.go @@ -89,3 +89,8 @@ func (c *GuestAgentClient) SyncTime(ctx context.Context, hostTime time.Time) (*a } return c.cli.SyncTime(ctx, req) } + +// GetMemoryMetrics retrieves guest memory statistics for the balloon controller. +func (c *GuestAgentClient) GetMemoryMetrics(ctx context.Context) (*api.MemoryMetrics, error) { + return c.cli.GetMemoryMetrics(ctx, &emptypb.Empty{}) +} diff --git a/pkg/guestagent/api/guestservice.pb.desc b/pkg/guestagent/api/guestservice.pb.desc index fc2857a9904..f080c382767 100644 --- a/pkg/guestagent/api/guestservice.pb.desc +++ b/pkg/guestagent/api/guestservice.pb.desc @@ -1,5 +1,5 @@ -� +� guestservice.protogoogle/protobuf/empty.protogoogle/protobuf/timestamp.proto"0 Info( local_ports ( 2.IPPortR @@ -29,10 +29,30 @@ guest_addr ( R guestAddr& TimeSyncResponse adjusted (Radjusted drift_ms (RdriftMs -error ( Rerror2� +error ( Rerror"� + MemoryMetrics& +mem_total_bytes (R memTotalBytes. +mem_available_bytes (RmemAvailableBytes( +mem_cached_bytes (RmemCachedBytes( +swap_total_bytes (RswapTotalBytes& +swap_free_bytes (R swapFreeBytes$ +anon_rss_bytes (R anonRssBytes+ +psi_memory_some_10 (RpsiMemorySome10+ +psi_memory_full_10 (RpsiMemoryFull100 +swap_in_bytes_per_sec (RswapInBytesPerSec2 +swap_out_bytes_per_sec + (RswapOutBytesPerSec& +page_fault_rate (R pageFaultRate' +container_count (RcontainerCount2 +container_cpu_percent (RcontainerCpuPercent: +container_io_bytes_per_sec (RcontainerIoBytesPerSec! + oom_detected (R oomDetected+ +psi_memory_some_60 (RpsiMemorySome60+ +psi_memory_full_60 (RpsiMemoryFull602� GuestService( GetInfo.google.protobuf.Empty.Info- GetEvents.google.protobuf.Empty.Event01 PostInotify.Inotify.google.protobuf.Empty(, Tunnel.TunnelMessage.TunnelMessage(0/ -SyncTime.TimeSyncRequest.TimeSyncResponseB/Z-github.com/lima-vm/lima/v2/pkg/guestagent/apibproto3 \ No newline at end of file +SyncTime.TimeSyncRequest.TimeSyncResponse: +GetMemoryMetrics.google.protobuf.Empty.MemoryMetricsB/Z-github.com/lima-vm/lima/v2/pkg/guestagent/apibproto3 \ No newline at end of file diff --git a/pkg/guestagent/api/guestservice.pb.go b/pkg/guestagent/api/guestservice.pb.go index 90afe8792ec..4c670cc76d3 100644 --- a/pkg/guestagent/api/guestservice.pb.go +++ b/pkg/guestagent/api/guestservice.pb.go @@ -427,6 +427,186 @@ func (x *TimeSyncResponse) GetError() string { return "" } +// MemoryMetrics contains guest memory statistics for the balloon controller. +type MemoryMetrics struct { + state protoimpl.MessageState `protogen:"open.v1"` + // /proc/meminfo fields (bytes). + MemTotalBytes uint64 `protobuf:"varint,1,opt,name=mem_total_bytes,json=memTotalBytes,proto3" json:"mem_total_bytes,omitempty"` + MemAvailableBytes uint64 `protobuf:"varint,2,opt,name=mem_available_bytes,json=memAvailableBytes,proto3" json:"mem_available_bytes,omitempty"` + MemCachedBytes uint64 `protobuf:"varint,3,opt,name=mem_cached_bytes,json=memCachedBytes,proto3" json:"mem_cached_bytes,omitempty"` + SwapTotalBytes uint64 `protobuf:"varint,4,opt,name=swap_total_bytes,json=swapTotalBytes,proto3" json:"swap_total_bytes,omitempty"` + SwapFreeBytes uint64 `protobuf:"varint,5,opt,name=swap_free_bytes,json=swapFreeBytes,proto3" json:"swap_free_bytes,omitempty"` + AnonRssBytes uint64 `protobuf:"varint,6,opt,name=anon_rss_bytes,json=anonRssBytes,proto3" json:"anon_rss_bytes,omitempty"` + // /proc/pressure/memory PSI values (percentage 0.0-100.0 over 10s window). + PsiMemorySome_10 float64 `protobuf:"fixed64,7,opt,name=psi_memory_some_10,json=psiMemorySome10,proto3" json:"psi_memory_some_10,omitempty"` + PsiMemoryFull_10 float64 `protobuf:"fixed64,8,opt,name=psi_memory_full_10,json=psiMemoryFull10,proto3" json:"psi_memory_full_10,omitempty"` + // /proc/vmstat deltas (bytes/sec, computed by agent between polls). + SwapInBytesPerSec float64 `protobuf:"fixed64,9,opt,name=swap_in_bytes_per_sec,json=swapInBytesPerSec,proto3" json:"swap_in_bytes_per_sec,omitempty"` + SwapOutBytesPerSec float64 `protobuf:"fixed64,10,opt,name=swap_out_bytes_per_sec,json=swapOutBytesPerSec,proto3" json:"swap_out_bytes_per_sec,omitempty"` + PageFaultRate float64 `protobuf:"fixed64,11,opt,name=page_fault_rate,json=pageFaultRate,proto3" json:"page_fault_rate,omitempty"` + // Container activity. + ContainerCount int32 `protobuf:"varint,12,opt,name=container_count,json=containerCount,proto3" json:"container_count,omitempty"` + ContainerCpuPercent float64 `protobuf:"fixed64,13,opt,name=container_cpu_percent,json=containerCpuPercent,proto3" json:"container_cpu_percent,omitempty"` + // Container I/O rate in bytes per second across all containers. + ContainerIoBytesPerSec float64 `protobuf:"fixed64,14,opt,name=container_io_bytes_per_sec,json=containerIoBytesPerSec,proto3" json:"container_io_bytes_per_sec,omitempty"` + // OOM detection (edge-triggered: set once, cleared after reporting). + OomDetected bool `protobuf:"varint,15,opt,name=oom_detected,json=oomDetected,proto3" json:"oom_detected,omitempty"` + // /proc/pressure/memory PSI values (percentage 0.0-100.0 over 60s window). + PsiMemorySome_60 float64 `protobuf:"fixed64,16,opt,name=psi_memory_some_60,json=psiMemorySome60,proto3" json:"psi_memory_some_60,omitempty"` + PsiMemoryFull_60 float64 `protobuf:"fixed64,17,opt,name=psi_memory_full_60,json=psiMemoryFull60,proto3" json:"psi_memory_full_60,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *MemoryMetrics) Reset() { + *x = MemoryMetrics{} + mi := &file_guestservice_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *MemoryMetrics) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*MemoryMetrics) ProtoMessage() {} + +func (x *MemoryMetrics) ProtoReflect() protoreflect.Message { + mi := &file_guestservice_proto_msgTypes[7] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use MemoryMetrics.ProtoReflect.Descriptor instead. +func (*MemoryMetrics) Descriptor() ([]byte, []int) { + return file_guestservice_proto_rawDescGZIP(), []int{7} +} + +func (x *MemoryMetrics) GetMemTotalBytes() uint64 { + if x != nil { + return x.MemTotalBytes + } + return 0 +} + +func (x *MemoryMetrics) GetMemAvailableBytes() uint64 { + if x != nil { + return x.MemAvailableBytes + } + return 0 +} + +func (x *MemoryMetrics) GetMemCachedBytes() uint64 { + if x != nil { + return x.MemCachedBytes + } + return 0 +} + +func (x *MemoryMetrics) GetSwapTotalBytes() uint64 { + if x != nil { + return x.SwapTotalBytes + } + return 0 +} + +func (x *MemoryMetrics) GetSwapFreeBytes() uint64 { + if x != nil { + return x.SwapFreeBytes + } + return 0 +} + +func (x *MemoryMetrics) GetAnonRssBytes() uint64 { + if x != nil { + return x.AnonRssBytes + } + return 0 +} + +func (x *MemoryMetrics) GetPsiMemorySome_10() float64 { + if x != nil { + return x.PsiMemorySome_10 + } + return 0 +} + +func (x *MemoryMetrics) GetPsiMemoryFull_10() float64 { + if x != nil { + return x.PsiMemoryFull_10 + } + return 0 +} + +func (x *MemoryMetrics) GetSwapInBytesPerSec() float64 { + if x != nil { + return x.SwapInBytesPerSec + } + return 0 +} + +func (x *MemoryMetrics) GetSwapOutBytesPerSec() float64 { + if x != nil { + return x.SwapOutBytesPerSec + } + return 0 +} + +func (x *MemoryMetrics) GetPageFaultRate() float64 { + if x != nil { + return x.PageFaultRate + } + return 0 +} + +func (x *MemoryMetrics) GetContainerCount() int32 { + if x != nil { + return x.ContainerCount + } + return 0 +} + +func (x *MemoryMetrics) GetContainerCpuPercent() float64 { + if x != nil { + return x.ContainerCpuPercent + } + return 0 +} + +func (x *MemoryMetrics) GetContainerIoBytesPerSec() float64 { + if x != nil { + return x.ContainerIoBytesPerSec + } + return 0 +} + +func (x *MemoryMetrics) GetOomDetected() bool { + if x != nil { + return x.OomDetected + } + return false +} + +func (x *MemoryMetrics) GetPsiMemorySome_60() float64 { + if x != nil { + return x.PsiMemorySome_60 + } + return 0 +} + +func (x *MemoryMetrics) GetPsiMemoryFull_60() float64 { + if x != nil { + return x.PsiMemoryFull_60 + } + return 0 +} + var File_guestservice_proto protoreflect.FileDescriptor const file_guestservice_proto_rawDesc = "" + @@ -460,13 +640,33 @@ const file_guestservice_proto_rawDesc = "" + "\x10TimeSyncResponse\x12\x1a\n" + "\badjusted\x18\x01 \x01(\bR\badjusted\x12\x19\n" + "\bdrift_ms\x18\x02 \x01(\x03R\adriftMs\x12\x14\n" + - "\x05error\x18\x03 \x01(\tR\x05error2\xf9\x01\n" + + "\x05error\x18\x03 \x01(\tR\x05error\"\x87\x06\n" + + "\rMemoryMetrics\x12&\n" + + "\x0fmem_total_bytes\x18\x01 \x01(\x04R\rmemTotalBytes\x12.\n" + + "\x13mem_available_bytes\x18\x02 \x01(\x04R\x11memAvailableBytes\x12(\n" + + "\x10mem_cached_bytes\x18\x03 \x01(\x04R\x0ememCachedBytes\x12(\n" + + "\x10swap_total_bytes\x18\x04 \x01(\x04R\x0eswapTotalBytes\x12&\n" + + "\x0fswap_free_bytes\x18\x05 \x01(\x04R\rswapFreeBytes\x12$\n" + + "\x0eanon_rss_bytes\x18\x06 \x01(\x04R\fanonRssBytes\x12+\n" + + "\x12psi_memory_some_10\x18\a \x01(\x01R\x0fpsiMemorySome10\x12+\n" + + "\x12psi_memory_full_10\x18\b \x01(\x01R\x0fpsiMemoryFull10\x120\n" + + "\x15swap_in_bytes_per_sec\x18\t \x01(\x01R\x11swapInBytesPerSec\x122\n" + + "\x16swap_out_bytes_per_sec\x18\n" + + " \x01(\x01R\x12swapOutBytesPerSec\x12&\n" + + "\x0fpage_fault_rate\x18\v \x01(\x01R\rpageFaultRate\x12'\n" + + "\x0fcontainer_count\x18\f \x01(\x05R\x0econtainerCount\x122\n" + + "\x15container_cpu_percent\x18\r \x01(\x01R\x13containerCpuPercent\x12:\n" + + "\x1acontainer_io_bytes_per_sec\x18\x0e \x01(\x01R\x16containerIoBytesPerSec\x12!\n" + + "\foom_detected\x18\x0f \x01(\bR\voomDetected\x12+\n" + + "\x12psi_memory_some_60\x18\x10 \x01(\x01R\x0fpsiMemorySome60\x12+\n" + + "\x12psi_memory_full_60\x18\x11 \x01(\x01R\x0fpsiMemoryFull602\xb5\x02\n" + "\fGuestService\x12(\n" + "\aGetInfo\x12\x16.google.protobuf.Empty\x1a\x05.Info\x12-\n" + "\tGetEvents\x12\x16.google.protobuf.Empty\x1a\x06.Event0\x01\x121\n" + "\vPostInotify\x12\b.Inotify\x1a\x16.google.protobuf.Empty(\x01\x12,\n" + "\x06Tunnel\x12\x0e.TunnelMessage\x1a\x0e.TunnelMessage(\x010\x01\x12/\n" + - "\bSyncTime\x12\x10.TimeSyncRequest\x1a\x11.TimeSyncResponseB/Z-github.com/lima-vm/lima/v2/pkg/guestagent/apib\x06proto3" + "\bSyncTime\x12\x10.TimeSyncRequest\x1a\x11.TimeSyncResponse\x12:\n" + + "\x10GetMemoryMetrics\x12\x16.google.protobuf.Empty\x1a\x0e.MemoryMetricsB/Z-github.com/lima-vm/lima/v2/pkg/guestagent/apib\x06proto3" var ( file_guestservice_proto_rawDescOnce sync.Once @@ -480,7 +680,7 @@ func file_guestservice_proto_rawDescGZIP() []byte { return file_guestservice_proto_rawDescData } -var file_guestservice_proto_msgTypes = make([]protoimpl.MessageInfo, 7) +var file_guestservice_proto_msgTypes = make([]protoimpl.MessageInfo, 8) var file_guestservice_proto_goTypes = []any{ (*Info)(nil), // 0: Info (*Event)(nil), // 1: Event @@ -489,28 +689,31 @@ var file_guestservice_proto_goTypes = []any{ (*TunnelMessage)(nil), // 4: TunnelMessage (*TimeSyncRequest)(nil), // 5: TimeSyncRequest (*TimeSyncResponse)(nil), // 6: TimeSyncResponse - (*timestamppb.Timestamp)(nil), // 7: google.protobuf.Timestamp - (*emptypb.Empty)(nil), // 8: google.protobuf.Empty + (*MemoryMetrics)(nil), // 7: MemoryMetrics + (*timestamppb.Timestamp)(nil), // 8: google.protobuf.Timestamp + (*emptypb.Empty)(nil), // 9: google.protobuf.Empty } var file_guestservice_proto_depIdxs = []int32{ 2, // 0: Info.local_ports:type_name -> IPPort - 7, // 1: Event.time:type_name -> google.protobuf.Timestamp + 8, // 1: Event.time:type_name -> google.protobuf.Timestamp 2, // 2: Event.added_local_ports:type_name -> IPPort 2, // 3: Event.removed_local_ports:type_name -> IPPort - 7, // 4: Inotify.time:type_name -> google.protobuf.Timestamp - 7, // 5: TimeSyncRequest.host_time:type_name -> google.protobuf.Timestamp - 8, // 6: GuestService.GetInfo:input_type -> google.protobuf.Empty - 8, // 7: GuestService.GetEvents:input_type -> google.protobuf.Empty + 8, // 4: Inotify.time:type_name -> google.protobuf.Timestamp + 8, // 5: TimeSyncRequest.host_time:type_name -> google.protobuf.Timestamp + 9, // 6: GuestService.GetInfo:input_type -> google.protobuf.Empty + 9, // 7: GuestService.GetEvents:input_type -> google.protobuf.Empty 3, // 8: GuestService.PostInotify:input_type -> Inotify 4, // 9: GuestService.Tunnel:input_type -> TunnelMessage 5, // 10: GuestService.SyncTime:input_type -> TimeSyncRequest - 0, // 11: GuestService.GetInfo:output_type -> Info - 1, // 12: GuestService.GetEvents:output_type -> Event - 8, // 13: GuestService.PostInotify:output_type -> google.protobuf.Empty - 4, // 14: GuestService.Tunnel:output_type -> TunnelMessage - 6, // 15: GuestService.SyncTime:output_type -> TimeSyncResponse - 11, // [11:16] is the sub-list for method output_type - 6, // [6:11] is the sub-list for method input_type + 9, // 11: GuestService.GetMemoryMetrics:input_type -> google.protobuf.Empty + 0, // 12: GuestService.GetInfo:output_type -> Info + 1, // 13: GuestService.GetEvents:output_type -> Event + 9, // 14: GuestService.PostInotify:output_type -> google.protobuf.Empty + 4, // 15: GuestService.Tunnel:output_type -> TunnelMessage + 6, // 16: GuestService.SyncTime:output_type -> TimeSyncResponse + 7, // 17: GuestService.GetMemoryMetrics:output_type -> MemoryMetrics + 12, // [12:18] is the sub-list for method output_type + 6, // [6:12] is the sub-list for method input_type 6, // [6:6] is the sub-list for extension type_name 6, // [6:6] is the sub-list for extension extendee 0, // [0:6] is the sub-list for field type_name @@ -527,7 +730,7 @@ func file_guestservice_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_guestservice_proto_rawDesc), len(file_guestservice_proto_rawDesc)), NumEnums: 0, - NumMessages: 7, + NumMessages: 8, NumExtensions: 0, NumServices: 1, }, diff --git a/pkg/guestagent/api/guestservice.proto b/pkg/guestagent/api/guestservice.proto index f23218325af..acab6730790 100644 --- a/pkg/guestagent/api/guestservice.proto +++ b/pkg/guestagent/api/guestservice.proto @@ -12,6 +12,9 @@ service GuestService { rpc Tunnel(stream TunnelMessage) returns (stream TunnelMessage); rpc SyncTime(TimeSyncRequest) returns (TimeSyncResponse); + + // GetMemoryMetrics returns current guest memory metrics for the balloon controller. + rpc GetMemoryMetrics(google.protobuf.Empty) returns (MemoryMetrics); } message Info { @@ -53,3 +56,36 @@ message TimeSyncResponse { int64 drift_ms = 2; string error = 3; } + +// MemoryMetrics contains guest memory statistics for the balloon controller. +message MemoryMetrics { + // /proc/meminfo fields (bytes). + uint64 mem_total_bytes = 1; + uint64 mem_available_bytes = 2; + uint64 mem_cached_bytes = 3; + uint64 swap_total_bytes = 4; + uint64 swap_free_bytes = 5; + uint64 anon_rss_bytes = 6; + + // /proc/pressure/memory PSI values (percentage 0.0-100.0 over 10s window). + double psi_memory_some_10 = 7; + double psi_memory_full_10 = 8; + + // /proc/vmstat deltas (bytes/sec, computed by agent between polls). + double swap_in_bytes_per_sec = 9; + double swap_out_bytes_per_sec = 10; + double page_fault_rate = 11; + + // Container activity. + int32 container_count = 12; + double container_cpu_percent = 13; + // Container I/O rate in bytes per second across all containers. + double container_io_bytes_per_sec = 14; + + // OOM detection (edge-triggered: set once, cleared after reporting). + bool oom_detected = 15; + + // /proc/pressure/memory PSI values (percentage 0.0-100.0 over 60s window). + double psi_memory_some_60 = 16; + double psi_memory_full_60 = 17; +} diff --git a/pkg/guestagent/api/guestservice_grpc.pb.go b/pkg/guestagent/api/guestservice_grpc.pb.go index 3c73fcb0523..11849f37917 100644 --- a/pkg/guestagent/api/guestservice_grpc.pb.go +++ b/pkg/guestagent/api/guestservice_grpc.pb.go @@ -20,11 +20,12 @@ import ( const _ = grpc.SupportPackageIsVersion9 const ( - GuestService_GetInfo_FullMethodName = "/GuestService/GetInfo" - GuestService_GetEvents_FullMethodName = "/GuestService/GetEvents" - GuestService_PostInotify_FullMethodName = "/GuestService/PostInotify" - GuestService_Tunnel_FullMethodName = "/GuestService/Tunnel" - GuestService_SyncTime_FullMethodName = "/GuestService/SyncTime" + GuestService_GetInfo_FullMethodName = "/GuestService/GetInfo" + GuestService_GetEvents_FullMethodName = "/GuestService/GetEvents" + GuestService_PostInotify_FullMethodName = "/GuestService/PostInotify" + GuestService_Tunnel_FullMethodName = "/GuestService/Tunnel" + GuestService_SyncTime_FullMethodName = "/GuestService/SyncTime" + GuestService_GetMemoryMetrics_FullMethodName = "/GuestService/GetMemoryMetrics" ) // GuestServiceClient is the client API for GuestService service. @@ -36,6 +37,8 @@ type GuestServiceClient interface { PostInotify(ctx context.Context, opts ...grpc.CallOption) (grpc.ClientStreamingClient[Inotify, emptypb.Empty], error) Tunnel(ctx context.Context, opts ...grpc.CallOption) (grpc.BidiStreamingClient[TunnelMessage, TunnelMessage], error) SyncTime(ctx context.Context, in *TimeSyncRequest, opts ...grpc.CallOption) (*TimeSyncResponse, error) + // GetMemoryMetrics returns current guest memory metrics for the balloon controller. + GetMemoryMetrics(ctx context.Context, in *emptypb.Empty, opts ...grpc.CallOption) (*MemoryMetrics, error) } type guestServiceClient struct { @@ -111,6 +114,16 @@ func (c *guestServiceClient) SyncTime(ctx context.Context, in *TimeSyncRequest, return out, nil } +func (c *guestServiceClient) GetMemoryMetrics(ctx context.Context, in *emptypb.Empty, opts ...grpc.CallOption) (*MemoryMetrics, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(MemoryMetrics) + err := c.cc.Invoke(ctx, GuestService_GetMemoryMetrics_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + // GuestServiceServer is the server API for GuestService service. // All implementations must embed UnimplementedGuestServiceServer // for forward compatibility. @@ -120,6 +133,8 @@ type GuestServiceServer interface { PostInotify(grpc.ClientStreamingServer[Inotify, emptypb.Empty]) error Tunnel(grpc.BidiStreamingServer[TunnelMessage, TunnelMessage]) error SyncTime(context.Context, *TimeSyncRequest) (*TimeSyncResponse, error) + // GetMemoryMetrics returns current guest memory metrics for the balloon controller. + GetMemoryMetrics(context.Context, *emptypb.Empty) (*MemoryMetrics, error) mustEmbedUnimplementedGuestServiceServer() } @@ -145,6 +160,9 @@ func (UnimplementedGuestServiceServer) Tunnel(grpc.BidiStreamingServer[TunnelMes func (UnimplementedGuestServiceServer) SyncTime(context.Context, *TimeSyncRequest) (*TimeSyncResponse, error) { return nil, status.Errorf(codes.Unimplemented, "method SyncTime not implemented") } +func (UnimplementedGuestServiceServer) GetMemoryMetrics(context.Context, *emptypb.Empty) (*MemoryMetrics, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetMemoryMetrics not implemented") +} func (UnimplementedGuestServiceServer) mustEmbedUnimplementedGuestServiceServer() {} func (UnimplementedGuestServiceServer) testEmbeddedByValue() {} @@ -227,6 +245,24 @@ func _GuestService_SyncTime_Handler(srv interface{}, ctx context.Context, dec fu return interceptor(ctx, in, info, handler) } +func _GuestService_GetMemoryMetrics_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(emptypb.Empty) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(GuestServiceServer).GetMemoryMetrics(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: GuestService_GetMemoryMetrics_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(GuestServiceServer).GetMemoryMetrics(ctx, req.(*emptypb.Empty)) + } + return interceptor(ctx, in, info, handler) +} + // GuestService_ServiceDesc is the grpc.ServiceDesc for GuestService service. // It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) @@ -242,6 +278,10 @@ var GuestService_ServiceDesc = grpc.ServiceDesc{ MethodName: "SyncTime", Handler: _GuestService_SyncTime_Handler, }, + { + MethodName: "GetMemoryMetrics", + Handler: _GuestService_GetMemoryMetrics_Handler, + }, }, Streams: []grpc.StreamDesc{ { diff --git a/pkg/guestagent/api/server/server.go b/pkg/guestagent/api/server/server.go index 9b4aed4d2e8..eb5b5f480d7 100644 --- a/pkg/guestagent/api/server/server.go +++ b/pkg/guestagent/api/server/server.go @@ -15,6 +15,7 @@ import ( "github.com/lima-vm/lima/v2/pkg/guestagent" "github.com/lima-vm/lima/v2/pkg/guestagent/api" + "github.com/lima-vm/lima/v2/pkg/guestagent/metrics" "github.com/lima-vm/lima/v2/pkg/guestagent/timesync" "github.com/lima-vm/lima/v2/pkg/portfwdserver" ) @@ -46,8 +47,9 @@ func StartServer(ctx context.Context, lis net.Listener, guest *GuestServer) erro type GuestServer struct { api.UnimplementedGuestServiceServer - Agent guestagent.Agent - TunnelS *portfwdserver.TunnelServer + Agent guestagent.Agent + TunnelS *portfwdserver.TunnelServer + Collector *metrics.Collector } func (s *GuestServer) GetInfo(ctx context.Context, _ *emptypb.Empty) (*api.Info, error) { @@ -106,3 +108,10 @@ func (s *GuestServer) SyncTime(_ context.Context, req *api.TimeSyncRequest) (*ap return resp, nil } + +func (s *GuestServer) GetMemoryMetrics(ctx context.Context, _ *emptypb.Empty) (*api.MemoryMetrics, error) { + if s.Collector != nil { + return s.Collector.Collect(ctx) + } + return metrics.CollectMemoryMetrics() +} diff --git a/pkg/guestagent/metrics/cgroup_linux.go b/pkg/guestagent/metrics/cgroup_linux.go new file mode 100644 index 00000000000..b0513e7865d --- /dev/null +++ b/pkg/guestagent/metrics/cgroup_linux.go @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package metrics + +import ( + "bufio" + "bytes" + "os" + "path/filepath" + "strconv" + "strings" +) + +// containerCgroupPaths returns cgroup directories for running containers. +// It looks for scope units under well-known systemd slices used by +// Docker, Podman, and containerd (rootful only — cgroupv2). +func containerCgroupPaths() []string { + var paths []string + // Docker rootful: /sys/fs/cgroup/system.slice/docker-.scope + // Podman rootful: /sys/fs/cgroup/machine.slice/libpod-.scope + // containerd: /sys/fs/cgroup/system.slice/containerd-.scope (or under containerd.service) + for _, pattern := range []string{ + "/sys/fs/cgroup/system.slice/docker-*.scope", + "/sys/fs/cgroup/machine.slice/libpod-*.scope", + "/sys/fs/cgroup/system.slice/containerd-*.scope", + } { + matches, _ := filepath.Glob(pattern) + paths = append(paths, matches...) + } + return paths +} + +// cgroupCPUUsage reads cpu.stat from a cgroupv2 directory and returns +// the cumulative usage_usec value (microseconds of CPU time). +func cgroupCPUUsage(cgroupDir string) (uint64, error) { + data, err := os.ReadFile(filepath.Join(cgroupDir, "cpu.stat")) + if err != nil { + return 0, err + } + return parseCgroupCPUUsage(data) +} + +func parseCgroupCPUUsage(data []byte) (uint64, error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + fields := strings.Fields(scanner.Text()) + if len(fields) == 2 && fields[0] == "usage_usec" { + return strconv.ParseUint(fields[1], 10, 64) + } + } + return 0, scanner.Err() +} + +// cgroupIOBytes reads io.stat from a cgroupv2 directory and returns +// the total bytes (read + written) across all devices. +func cgroupIOBytes(cgroupDir string) uint64 { + data, err := os.ReadFile(filepath.Join(cgroupDir, "io.stat")) + if err != nil { + return 0 + } + return parseCgroupIOBytes(data) +} + +// parseCgroupIOBytes parses cgroupv2 io.stat format: +// +// : rbytes= wbytes= ... +func parseCgroupIOBytes(data []byte) uint64 { + var total uint64 + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + for _, field := range strings.Fields(scanner.Text()) { + if after, ok := strings.CutPrefix(field, "rbytes="); ok { + if v, err := strconv.ParseUint(after, 10, 64); err == nil { + total += v + } + } + if after, ok := strings.CutPrefix(field, "wbytes="); ok { + if v, err := strconv.ParseUint(after, 10, 64); err == nil { + total += v + } + } + } + } + return total +} diff --git a/pkg/guestagent/metrics/cgroup_linux_test.go b/pkg/guestagent/metrics/cgroup_linux_test.go new file mode 100644 index 00000000000..952ba4bc2e5 --- /dev/null +++ b/pkg/guestagent/metrics/cgroup_linux_test.go @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package metrics + +import ( + "testing" + + "gotest.tools/v3/assert" +) + +func TestParseCgroupCPUUsage(t *testing.T) { + tests := []struct { + name string + input string + want uint64 + wantErr bool + }{ + { + name: "typical", + input: `usage_usec 1234567 +user_usec 1000000 +system_usec 234567 +nr_periods 0 +nr_throttled 0 +throttled_usec 0 +`, + want: 1234567, + }, + { + name: "zero", + input: "usage_usec 0\n", + want: 0, + }, + { + name: "missing field", + input: "user_usec 1000000\nsystem_usec 234567\n", + wantErr: false, // Returns 0, nil when field not found. + want: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := parseCgroupCPUUsage([]byte(tt.input)) + if tt.wantErr { + assert.Assert(t, err != nil, "expected error") + return + } + assert.NilError(t, err) + assert.Equal(t, got, tt.want) + }) + } +} + +func TestParseCgroupIOBytes(t *testing.T) { + tests := []struct { + name string + input string + want uint64 + }{ + { + name: "single device", + input: "259:0 rbytes=1048576 wbytes=524288 rios=100 wios=50 dbytes=0 dios=0\n", + want: 1048576 + 524288, + }, + { + name: "multiple devices", + input: `259:0 rbytes=1000 wbytes=2000 rios=10 wios=20 dbytes=0 dios=0 +259:1 rbytes=3000 wbytes=4000 rios=30 wios=40 dbytes=0 dios=0 +`, + want: 1000 + 2000 + 3000 + 4000, + }, + { + name: "empty", + input: "", + want: 0, + }, + { + name: "no rbytes or wbytes", + input: "259:0 rios=100 wios=50\n", + want: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := parseCgroupIOBytes([]byte(tt.input)) + assert.Equal(t, got, tt.want) + }) + } +} diff --git a/pkg/guestagent/metrics/collector_linux.go b/pkg/guestagent/metrics/collector_linux.go new file mode 100644 index 00000000000..dccfe5ab26e --- /dev/null +++ b/pkg/guestagent/metrics/collector_linux.go @@ -0,0 +1,201 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package metrics + +import ( + "context" + "fmt" + "os" + "sync" + "time" + + "github.com/lima-vm/lima/v2/pkg/guestagent/api" +) + +const pageSize = 4096 // Linux page size in bytes. + +// Collector is a stateful memory metrics collector that tracks deltas +// between successive /proc/vmstat samples and reads cgroupfs for +// container activity. It is goroutine-safe. +type Collector struct { + mu sync.Mutex + + // Previous vmstat sample for delta computation. + prevVmstat vmstatCounters + prevVmstatTime time.Time + hasPrevVmstat bool + + // Computed rates (updated by computeVmstatDeltas). + swapInBytesPerSec float64 + swapOutBytesPerSec float64 + pageFaultRate float64 + oomDetected bool + + // Previous per-container cgroup CPU/IO samples for rate computation. + prevCgroupCPU map[string]uint64 // path → usage_usec + prevCgroupIO map[string]uint64 // path → total bytes + prevCgroupTime time.Time +} + +// NewCollector creates a new stateful metrics collector. +func NewCollector() *Collector { + return &Collector{ + prevCgroupCPU: make(map[string]uint64), + prevCgroupIO: make(map[string]uint64), + } +} + +// computeVmstatDeltas updates the collector's rate fields from a new +// vmstat sample. Must be called with c.mu held. +func (c *Collector) computeVmstatDeltas(vs vmstatCounters, now time.Time) { + if !c.hasPrevVmstat { + c.prevVmstat = vs + c.prevVmstatTime = now + c.hasPrevVmstat = true + return + } + + dt := now.Sub(c.prevVmstatTime).Seconds() + if dt <= 0 { + return + } + + // Compute deltas; treat counter decreases (reboot) as zero. + swapInDelta := safeDelta(vs.pswpin, c.prevVmstat.pswpin) + swapOutDelta := safeDelta(vs.pswpout, c.prevVmstat.pswpout) + pgfaultDelta := safeDelta(vs.pgfault, c.prevVmstat.pgfault) + oomDelta := safeDelta(vs.oomKill, c.prevVmstat.oomKill) + + c.swapInBytesPerSec = float64(swapInDelta) * pageSize / dt + c.swapOutBytesPerSec = float64(swapOutDelta) * pageSize / dt + c.pageFaultRate = float64(pgfaultDelta) / dt + + if oomDelta > 0 { + c.oomDetected = true + } + + c.prevVmstat = vs + c.prevVmstatTime = now +} + +// consumeOomDetected returns and clears the OOM detected flag. +// This implements edge-triggered semantics: the flag is set when a new +// OOM kill is detected and cleared after being read once. +func (c *Collector) consumeOomDetected() bool { + val := c.oomDetected + c.oomDetected = false + return val +} + +// safeDelta returns curr - prev if curr >= prev, else 0. +func safeDelta(curr, prev uint64) uint64 { + if curr >= prev { + return curr - prev + } + return 0 +} + +// Collect gathers all memory metrics and returns a MemoryMetrics protobuf. +// This is the main entry point called by the guest agent gRPC server. +func (c *Collector) Collect(ctx context.Context) (*api.MemoryMetrics, error) { + // 1. /proc/meminfo + /proc/pressure/memory (no lock needed — pure reads). + meminfo, err := os.ReadFile("/proc/meminfo") + if err != nil { + return nil, fmt.Errorf("failed to read /proc/meminfo: %w", err) + } + m, err := parseProcMeminfo(meminfo) + if err != nil { + return nil, err + } + + pressure, _ := os.ReadFile("/proc/pressure/memory") + psi, parseErr := parseProcPressureMemory(pressure) + if parseErr != nil { + return nil, parseErr + } + m.PsiMemorySome_10 = psi.Some10 + m.PsiMemoryFull_10 = psi.Full10 + m.PsiMemorySome_60 = psi.Some60 + m.PsiMemoryFull_60 = psi.Full60 + + // 2. /proc/vmstat for swap rates, page faults, OOM. + vmstatData, vmstatErr := os.ReadFile("/proc/vmstat") + + // 3. Container stats from cgroupfs (best-effort). + now := time.Now() + cgroupPaths := containerCgroupPaths() + containerCount := len(cgroupPaths) + var containerCPU, containerIO float64 + + // Hold lock for internal state updates. + c.mu.Lock() + defer c.mu.Unlock() + + if vmstatErr == nil { + vs, parseErr := parseProcVmstat(vmstatData) + if parseErr == nil { + c.computeVmstatDeltas(vs, now) + } + } + m.SwapInBytesPerSec = c.swapInBytesPerSec + m.SwapOutBytesPerSec = c.swapOutBytesPerSec + m.PageFaultRate = c.pageFaultRate + m.OomDetected = c.consumeOomDetected() + + // Compute container CPU% and IO rates from cgroup deltas. + if containerCount > 0 { + containerCPU, containerIO = c.computeCgroupDeltas(cgroupPaths, now) + } else { + // No containers — reset previous samples. + c.prevCgroupCPU = make(map[string]uint64) + c.prevCgroupIO = make(map[string]uint64) + } + + m.ContainerCount = int32(containerCount) + m.ContainerCpuPercent = containerCPU + m.ContainerIoBytesPerSec = containerIO + + return m, nil +} + +// computeCgroupDeltas reads current CPU/IO from each container cgroup, +// computes rates against previous samples, and stores current values. +// Must be called with c.mu held. +func (c *Collector) computeCgroupDeltas(paths []string, now time.Time) (cpuPercent, ioBytesPerSec float64) { + dt := now.Sub(c.prevCgroupTime).Seconds() + hasPrev := len(c.prevCgroupCPU) > 0 && dt > 0 + + newCPU := make(map[string]uint64, len(paths)) + newIO := make(map[string]uint64, len(paths)) + + for _, p := range paths { + cpu, err := cgroupCPUUsage(p) + if err != nil { + continue + } + io := cgroupIOBytes(p) + newCPU[p] = cpu + newIO[p] = io + + if hasPrev { + if prevCPU, ok := c.prevCgroupCPU[p]; ok { + // usage_usec delta → percentage of wall time. + cpuDelta := safeDelta(cpu, prevCPU) + // Convert microseconds to seconds, then to percent of wall time. + cpuPercent += float64(cpuDelta) / (dt * 1e6) * 100.0 + } + if prevIO, ok := c.prevCgroupIO[p]; ok { + ioDelta := safeDelta(io, prevIO) + ioBytesPerSec += float64(ioDelta) / dt + } + } + } + + c.prevCgroupCPU = newCPU + c.prevCgroupIO = newIO + c.prevCgroupTime = now + return cpuPercent, ioBytesPerSec +} diff --git a/pkg/guestagent/metrics/metrics_linux.go b/pkg/guestagent/metrics/metrics_linux.go new file mode 100644 index 00000000000..d6f4cffc3ea --- /dev/null +++ b/pkg/guestagent/metrics/metrics_linux.go @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +// Package metrics reads guest memory statistics from /proc for the balloon controller. +package metrics + +import ( + "bufio" + "bytes" + "fmt" + "os" + "strconv" + "strings" + + "github.com/lima-vm/lima/v2/pkg/guestagent/api" +) + +// CollectMemoryMetrics reads /proc/meminfo and /proc/pressure/memory +// and returns a MemoryMetrics protobuf message. +func CollectMemoryMetrics() (*api.MemoryMetrics, error) { + meminfo, err := os.ReadFile("/proc/meminfo") + if err != nil { + return nil, fmt.Errorf("failed to read /proc/meminfo: %w", err) + } + m, err := parseProcMeminfo(meminfo) + if err != nil { + return nil, err + } + + pressure, _ := os.ReadFile("/proc/pressure/memory") + psi, err := parseProcPressureMemory(pressure) + if err != nil { + return nil, err + } + m.PsiMemorySome_10 = psi.Some10 + m.PsiMemoryFull_10 = psi.Full10 + m.PsiMemorySome_60 = psi.Some60 + m.PsiMemoryFull_60 = psi.Full60 + + return m, nil +} + +func parseProcMeminfo(data []byte) (*api.MemoryMetrics, error) { + m := &api.MemoryMetrics{} + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + key := strings.TrimSuffix(parts[0], ":") + val, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + continue + } + // /proc/meminfo values are in kB. + valBytes := val * 1024 + switch key { + case "MemTotal": + m.MemTotalBytes = valBytes + case "MemAvailable": + m.MemAvailableBytes = valBytes + case "Cached": + m.MemCachedBytes = valBytes + case "SwapTotal": + m.SwapTotalBytes = valBytes + case "SwapFree": + m.SwapFreeBytes = valBytes + case "AnonPages": + m.AnonRssBytes = valBytes + } + } + return m, scanner.Err() +} + +// PressureStats holds parsed PSI values from /proc/pressure/memory. +type PressureStats struct { + Some10, Full10 float64 + Some60, Full60 float64 +} + +func parseProcPressureMemory(data []byte) (PressureStats, error) { + var ps PressureStats + if len(data) == 0 { + return ps, nil + } + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Fields(line) + if len(fields) < 2 { + continue + } + kind := fields[0] + for _, field := range fields[1:] { + if after, ok := strings.CutPrefix(field, "avg10="); ok { + val, parseErr := strconv.ParseFloat(after, 64) + if parseErr != nil { + continue + } + switch kind { + case "some": + ps.Some10 = val + case "full": + ps.Full10 = val + } + } + if after, ok := strings.CutPrefix(field, "avg60="); ok { + val, parseErr := strconv.ParseFloat(after, 64) + if parseErr != nil { + continue + } + switch kind { + case "some": + ps.Some60 = val + case "full": + ps.Full60 = val + } + } + } + } + return ps, scanner.Err() +} diff --git a/pkg/guestagent/metrics/metrics_linux_test.go b/pkg/guestagent/metrics/metrics_linux_test.go new file mode 100644 index 00000000000..7086e8536e9 --- /dev/null +++ b/pkg/guestagent/metrics/metrics_linux_test.go @@ -0,0 +1,342 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package metrics + +import ( + "testing" + "time" + + "gotest.tools/v3/assert" +) + +func TestParseProcMeminfo(t *testing.T) { + data := `MemTotal: 12288000 kB +MemFree: 1024000 kB +MemAvailable: 6144000 kB +Buffers: 512000 kB +Cached: 3072000 kB +SwapCached: 256000 kB +Active: 4096000 kB +Inactive: 2048000 kB +Active(anon): 3000000 kB +Inactive(anon): 1000000 kB +Active(file): 1096000 kB +Inactive(file): 1048000 kB +SwapTotal: 8192000 kB +SwapFree: 7168000 kB +AnonPages: 3500000 kB +` + m, err := parseProcMeminfo([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, m.MemTotalBytes, uint64(12288000*1024)) + assert.Equal(t, m.MemAvailableBytes, uint64(6144000*1024)) + assert.Equal(t, m.MemCachedBytes, uint64(3072000*1024)) + assert.Equal(t, m.SwapTotalBytes, uint64(8192000*1024)) + assert.Equal(t, m.SwapFreeBytes, uint64(7168000*1024)) + assert.Equal(t, m.AnonRssBytes, uint64(3500000*1024)) +} + +func TestParseProcPressureMemory(t *testing.T) { + data := `some avg10=5.50 avg60=3.20 avg300=1.10 total=123456 +full avg10=1.25 avg60=0.80 avg300=0.30 total=789012 +` + ps, err := parseProcPressureMemory([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 5.50) + assert.Equal(t, ps.Full10, 1.25) + assert.Equal(t, ps.Some60, 3.20) + assert.Equal(t, ps.Full60, 0.80) +} + +func TestParseProcPressureMemory_NoPSI(t *testing.T) { + // When PSI is not available, return zeros. + _, err := parseProcPressureMemory(nil) + assert.NilError(t, err) +} + +// --- Edge case tests --- + +func TestParseProcMeminfo_Empty(t *testing.T) { + m, err := parseProcMeminfo([]byte("")) + assert.NilError(t, err) + assert.Equal(t, m.MemTotalBytes, uint64(0)) + assert.Equal(t, m.AnonRssBytes, uint64(0)) +} + +func TestParseProcMeminfo_MalformedLines(t *testing.T) { + // Lines with no value, non-numeric value, single field. + data := `MemTotal: +MemFree: notanumber kB +Cached: 1024 kB +justoneword +: +` + m, err := parseProcMeminfo([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, m.MemCachedBytes, uint64(1024*1024)) + assert.Equal(t, m.MemTotalBytes, uint64(0)) // Could not parse. +} + +func TestParseProcMeminfo_MissingFields(t *testing.T) { + // Only MemTotal present; all other fields stay zero. + data := `MemTotal: 8000000 kB +` + m, err := parseProcMeminfo([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, m.MemTotalBytes, uint64(8000000*1024)) + assert.Equal(t, m.MemAvailableBytes, uint64(0)) + assert.Equal(t, m.SwapTotalBytes, uint64(0)) + assert.Equal(t, m.AnonRssBytes, uint64(0)) +} + +func TestParseProcMeminfo_ExtraWhitespace(t *testing.T) { + data := `MemTotal: 16384000 kB +AnonPages: 2000000 kB +` + m, err := parseProcMeminfo([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, m.MemTotalBytes, uint64(16384000*1024)) + assert.Equal(t, m.AnonRssBytes, uint64(2000000*1024)) +} + +func TestParseProcPressureMemory_Empty(t *testing.T) { + ps, err := parseProcPressureMemory([]byte("")) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 0.0) + assert.Equal(t, ps.Full10, 0.0) +} + +func TestParseProcPressureMemory_PartialData(t *testing.T) { + // Only "some" line, no "full" line. + data := `some avg10=3.14 avg60=2.00 avg300=1.00 total=99999 +` + ps, err := parseProcPressureMemory([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 3.14) + assert.Equal(t, ps.Full10, 0.0) // Not present. + assert.Equal(t, ps.Some60, 2.00) +} + +func TestParseProcPressureMemory_MalformedAvg(t *testing.T) { + // avg10= has non-numeric value — should be silently skipped. + data := `some avg10=notfloat avg60=2.00 avg300=1.00 total=99999 +full avg10=1.50 avg60=0.80 avg300=0.30 total=789012 +` + ps, err := parseProcPressureMemory([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 0.0) // Could not parse. + assert.Equal(t, ps.Full10, 1.50) +} + +func TestParseProcPressureMemory_NoAvg10Field(t *testing.T) { + // Lines without avg10= field. + data := `some avg60=2.00 avg300=1.00 total=99999 +full avg60=0.80 avg300=0.30 total=789012 +` + ps, err := parseProcPressureMemory([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 0.0) + assert.Equal(t, ps.Full10, 0.0) +} + +func TestParseProcPressureMemory_ShortLine(t *testing.T) { + // Line with only one field. + data := `some +` + ps, err := parseProcPressureMemory([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 0.0) + assert.Equal(t, ps.Full10, 0.0) +} + +func TestParseProcMeminfo_AllFieldsPopulated(t *testing.T) { + data := []byte(`MemTotal: 16384000 kB +MemFree: 1024000 kB +MemAvailable: 8192000 kB +Buffers: 512000 kB +Cached: 4096000 kB +SwapTotal: 8192000 kB +SwapFree: 4096000 kB +AnonPages: 2048000 kB +`) + m, err := parseProcMeminfo(data) + assert.NilError(t, err) + assert.Equal(t, m.MemTotalBytes, uint64(16384000*1024)) + assert.Equal(t, m.MemAvailableBytes, uint64(8192000*1024)) + assert.Equal(t, m.MemCachedBytes, uint64(4096000*1024)) + assert.Equal(t, m.SwapTotalBytes, uint64(8192000*1024)) + assert.Equal(t, m.SwapFreeBytes, uint64(4096000*1024)) + assert.Equal(t, m.AnonRssBytes, uint64(2048000*1024)) +} + +func TestParseProcPressureMemory_BothMissing(t *testing.T) { + // Data with neither "some" nor "full" lines. + data := []byte("random_type avg10=1.23 avg60=0.50 avg300=0.25 total=12345\n") + ps, err := parseProcPressureMemory(data) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 0.0) + assert.Equal(t, ps.Full10, 0.0) +} + +func TestParseProcPressureMemory_OnlySome(t *testing.T) { + data := []byte("some avg10=5.50 avg60=3.20 avg300=1.10 total=999\n") + ps, err := parseProcPressureMemory(data) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 5.50) + assert.Equal(t, ps.Full10, 0.0) +} + +func TestParseProcPressureMemory_OnlyFull(t *testing.T) { + data := []byte("full avg10=2.30 avg60=1.50 avg300=0.80 total=555\n") + ps, err := parseProcPressureMemory(data) + assert.NilError(t, err) + assert.Equal(t, ps.Some10, 0.0) + assert.Equal(t, ps.Full10, 2.30) +} + +// --- /proc/vmstat parsing tests --- + +func TestParseProcVmstat(t *testing.T) { + data := `nr_free_pages 262144 +pswpin 1000 +pswpout 2500 +pgfault 500000 +pgmajfault 200 +oom_kill 3 +nr_dirty 50 +` + vs, err := parseProcVmstat([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, vs.pswpin, uint64(1000)) + assert.Equal(t, vs.pswpout, uint64(2500)) + assert.Equal(t, vs.pgfault, uint64(500000)) + assert.Equal(t, vs.oomKill, uint64(3)) +} + +func TestParseProcVmstat_Empty(t *testing.T) { + vs, err := parseProcVmstat([]byte("")) + assert.NilError(t, err) + assert.Equal(t, vs.pswpin, uint64(0)) + assert.Equal(t, vs.pswpout, uint64(0)) + assert.Equal(t, vs.pgfault, uint64(0)) + assert.Equal(t, vs.oomKill, uint64(0)) +} + +func TestParseProcVmstat_MalformedLines(t *testing.T) { + data := `pswpin notanumber +pswpout 100 +singleword +` + vs, err := parseProcVmstat([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, vs.pswpin, uint64(0)) // Malformed, skipped. + assert.Equal(t, vs.pswpout, uint64(100)) // Valid. +} + +func TestParseProcVmstat_MissingFields(t *testing.T) { + // Only oom_kill present; swap counters stay zero. + data := `oom_kill 5 +` + vs, err := parseProcVmstat([]byte(data)) + assert.NilError(t, err) + assert.Equal(t, vs.pswpin, uint64(0)) + assert.Equal(t, vs.oomKill, uint64(5)) +} + +// --- Collector tests --- + +func TestCollector_SwapRates(t *testing.T) { + c := NewCollector() + // First sample: sets baseline, rates are zero. + vs1 := vmstatCounters{pswpin: 100, pswpout: 200, pgfault: 1000} + now := time.Now() + c.computeVmstatDeltas(vs1, now) + assert.Equal(t, c.swapInBytesPerSec, 0.0) + assert.Equal(t, c.swapOutBytesPerSec, 0.0) + + // Second sample: 10 seconds later, pswpin increased by 50 pages. + vs2 := vmstatCounters{pswpin: 150, pswpout: 300, pgfault: 2000} + later := now.Add(10 * time.Second) + c.computeVmstatDeltas(vs2, later) + // 50 pages / 10 seconds * 4096 bytes/page = 20480 bytes/sec. + assert.Equal(t, c.swapInBytesPerSec, 50.0*4096.0/10.0) + // 100 pages / 10 seconds * 4096 bytes/page = 40960 bytes/sec. + assert.Equal(t, c.swapOutBytesPerSec, 100.0*4096.0/10.0) + // 1000 faults / 10 seconds = 100 faults/sec. + assert.Equal(t, c.pageFaultRate, 100.0) +} + +func TestCollector_SwapRates_ZeroDuration(t *testing.T) { + c := NewCollector() + vs := vmstatCounters{pswpin: 100, pswpout: 200} + now := time.Now() + c.computeVmstatDeltas(vs, now) + // Same timestamp — should not divide by zero, rates stay at zero. + c.computeVmstatDeltas(vs, now) + assert.Equal(t, c.swapInBytesPerSec, 0.0) +} + +func TestCollector_OomDetected(t *testing.T) { + c := NewCollector() + now := time.Now() + // First sample: baseline oom_kill=0. + vs1 := vmstatCounters{oomKill: 0} + c.computeVmstatDeltas(vs1, now) + assert.Equal(t, c.oomDetected, false) + + // Second sample: oom_kill increased to 1. + vs2 := vmstatCounters{oomKill: 1} + c.computeVmstatDeltas(vs2, now.Add(10*time.Second)) + assert.Equal(t, c.oomDetected, true) + + // Read clears the flag (edge-triggered). + assert.Equal(t, c.consumeOomDetected(), true) + assert.Equal(t, c.oomDetected, false) + + // Third sample: oom_kill unchanged — no new OOM. + vs3 := vmstatCounters{oomKill: 1} + c.computeVmstatDeltas(vs3, now.Add(20*time.Second)) + assert.Equal(t, c.oomDetected, false) +} + +func TestCollector_CounterWrap(t *testing.T) { + // If counters decrease (e.g., system reboot), treat as reset. + c := NewCollector() + now := time.Now() + vs1 := vmstatCounters{pswpin: 1000, pswpout: 2000, pgfault: 5000, oomKill: 2} + c.computeVmstatDeltas(vs1, now) + + // Counter decreased — treat delta as zero, not negative. + vs2 := vmstatCounters{pswpin: 500, pswpout: 800, pgfault: 100, oomKill: 0} + c.computeVmstatDeltas(vs2, now.Add(10*time.Second)) + assert.Equal(t, c.swapInBytesPerSec, 0.0) + assert.Equal(t, c.swapOutBytesPerSec, 0.0) + assert.Equal(t, c.pageFaultRate, 0.0) + assert.Equal(t, c.oomDetected, false) +} + +// --- Cgroup delta computation tests --- + +func TestCollector_CgroupDeltas_FirstSample(t *testing.T) { + c := NewCollector() + // First call has no previous data — rates should be zero. + paths := []string{"/sys/fs/cgroup/system.slice/docker-abc.scope"} + // Inject test data by priming the maps (simulate cgroupCPUUsage/cgroupIOBytes calls). + // Since we can't inject real cgroup paths in unit tests, test the delta + // computation directly by pre-populating prevCgroupCPU/IO. + now := time.Now() + cpu, io := c.computeCgroupDeltas(paths, now) + // First sample: no previous data, rates are zero. + // (cgroupCPUUsage will fail since paths don't exist, so nothing is recorded.) + assert.Equal(t, cpu, 0.0) + assert.Equal(t, io, 0.0) +} + +func TestSafeDelta(t *testing.T) { + assert.Equal(t, safeDelta(100, 50), uint64(50)) + assert.Equal(t, safeDelta(50, 100), uint64(0)) + assert.Equal(t, safeDelta(100, 100), uint64(0)) +} diff --git a/pkg/guestagent/metrics/metrics_other.go b/pkg/guestagent/metrics/metrics_other.go new file mode 100644 index 00000000000..48eb03b0526 --- /dev/null +++ b/pkg/guestagent/metrics/metrics_other.go @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build !linux + +// Package metrics reads guest memory statistics from /proc for the balloon controller. +// On non-Linux platforms, all functions return errors since /proc is not available. +package metrics + +import ( + "context" + "errors" + + "github.com/lima-vm/lima/v2/pkg/guestagent/api" +) + +// Collector is a stateful memory metrics collector. On non-Linux platforms +// it is a no-op stub. +type Collector struct{} + +// NewCollector creates a no-op collector on non-Linux platforms. +func NewCollector() *Collector { + return &Collector{} +} + +// Collect returns an error on non-Linux platforms. +func (c *Collector) Collect(_ context.Context) (*api.MemoryMetrics, error) { + return nil, errors.New("memory metrics collection requires Linux") +} + +// CollectMemoryMetrics returns an error on non-Linux platforms. +func CollectMemoryMetrics() (*api.MemoryMetrics, error) { + return nil, errors.New("memory metrics collection requires Linux") +} diff --git a/pkg/guestagent/metrics/vmstat_linux.go b/pkg/guestagent/metrics/vmstat_linux.go new file mode 100644 index 00000000000..9baf5ea2a12 --- /dev/null +++ b/pkg/guestagent/metrics/vmstat_linux.go @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package metrics + +import ( + "bufio" + "bytes" + "strconv" + "strings" +) + +// vmstatCounters holds the cumulative counters from /proc/vmstat +// that are relevant for the balloon controller. +type vmstatCounters struct { + pswpin uint64 // Pages swapped in (cumulative). + pswpout uint64 // Pages swapped out (cumulative). + pgfault uint64 // Page faults (cumulative, major + minor). + oomKill uint64 // OOM kills (cumulative). +} + +// parseProcVmstat extracts swap and OOM counters from /proc/vmstat data. +func parseProcVmstat(data []byte) (vmstatCounters, error) { + var vs vmstatCounters + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + val, err := strconv.ParseUint(parts[1], 10, 64) + if err != nil { + continue + } + switch parts[0] { + case "pswpin": + vs.pswpin = val + case "pswpout": + vs.pswpout = val + case "pgfault": + vs.pgfault = val + case "oom_kill": + vs.oomKill = val + } + } + return vs, scanner.Err() +} From 6b6434673811e0bd5cb73e279364bce9759ffddd Mon Sep 17 00:00:00 2001 From: "Jason W. Ehrlich" Date: Fri, 17 Apr 2026 13:51:09 -0400 Subject: [PATCH 3/4] hostagent: add balloon controller state machine Add a BalloonController that manages VM memory allocation through a six-state machine: Bootstrap, LearningDescend, Steady, OOMRecovery, CircuitBreaker, and AgentFailure. The controller uses guest metrics (PSI pressure, swap rates, container activity) and host pressure (macOS memory_pressure) to decide when to grow or shrink the balloon. Key behaviors: - OOM detection triggers immediate 20% grow with circuit breaker - PSI-based pressure monitoring with configurable thresholds - MemAvailable heuristic when PSI is unavailable - Shrink guards: swap-in rate, container CPU/IO, page faults - Learned floor persisted to disk across instance restarts - Host pressure integration prevents shrinking under host memory stress - Cooldown enforcement between balloon actions Include comprehensive test coverage (~1300 lines) for all states, transitions, edge cases, and guard conditions. Signed-off-by: Jason W. Ehrlich --- pkg/driver/driver.go | 6 + pkg/hostagent/balloon_controller.go | 581 +++++++++ pkg/hostagent/balloon_controller_test.go | 1365 ++++++++++++++++++++ pkg/hostagent/host_pressure.go | 26 + pkg/hostagent/host_pressure_darwin.go | 85 ++ pkg/hostagent/host_pressure_darwin_test.go | 101 ++ pkg/hostagent/host_pressure_other.go | 20 + pkg/hostagent/host_pressure_test.go | 33 + pkg/hostagent/hostagent.go | 176 +++ pkg/store/learned_floor.go | 58 + pkg/store/learned_floor_test.go | 107 ++ 11 files changed, 2558 insertions(+) create mode 100644 pkg/hostagent/balloon_controller.go create mode 100644 pkg/hostagent/balloon_controller_test.go create mode 100644 pkg/hostagent/host_pressure.go create mode 100644 pkg/hostagent/host_pressure_darwin.go create mode 100644 pkg/hostagent/host_pressure_darwin_test.go create mode 100644 pkg/hostagent/host_pressure_other.go create mode 100644 pkg/hostagent/host_pressure_test.go create mode 100644 pkg/store/learned_floor.go create mode 100644 pkg/store/learned_floor_test.go diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index f0ee529c1e3..80ac7bd2467 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -77,6 +77,12 @@ type GuestAgent interface { GuestAgentConn(_ context.Context) (net.Conn, string, error) } +// Ballooner is an optional interface for drivers that support memory ballooning. +// Use type assertion to check if a driver implements this interface. +type Ballooner interface { + SetBalloonTarget(targetBytes uint64) error +} + // Driver interface is used by hostagent for managing vm. type Driver interface { Lifecycle diff --git a/pkg/hostagent/balloon_controller.go b/pkg/hostagent/balloon_controller.go new file mode 100644 index 00000000000..1d83431c1e4 --- /dev/null +++ b/pkg/hostagent/balloon_controller.go @@ -0,0 +1,581 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +package hostagent + +import ( + "sync" + "time" + + "github.com/docker/go-units" + "github.com/sirupsen/logrus" + + "github.com/lima-vm/lima/v2/pkg/guestagent/api" + "github.com/lima-vm/lima/v2/pkg/store" +) + +const ( + learnConfirmDuration = 5 * time.Minute // How long to confirm a candidate floor. + learnMaxConfirmFails = 3 // Max failed confirmations before fallback. +) + +// BalloonState represents the state machine state of the balloon controller. +type BalloonState string + +const ( + BalloonStateBootstrap BalloonState = "bootstrap" + BalloonStateSteady BalloonState = "steady" + BalloonStateAgentFailure BalloonState = "agent-failure" + BalloonStateCircuitBreaker BalloonState = "circuit-breaker" + BalloonStateShutdown BalloonState = "shutdown" + BalloonStateLearningDescend BalloonState = "learning-descend" + BalloonStateLearningConfirm BalloonState = "learning-confirm" +) + +// BalloonActionType describes what the controller wants to do. +type BalloonActionType string + +const ( + BalloonActionNone BalloonActionType = "none" + BalloonActionGrow BalloonActionType = "grow" + BalloonActionShrink BalloonActionType = "shrink" +) + +// BalloonAction is the output of the controller's evaluation. +type BalloonAction struct { + Type BalloonActionType + TargetBytes uint64 + Reason string +} + +// BalloonConfig holds the configuration for the balloon controller. +type BalloonConfig struct { + MaxMemoryBytes uint64 + MinBytes uint64 + IdleTargetBytes uint64 + GrowStepPercent int + ShrinkStepPercent int + HighPressureThreshold float64 + LowPressureThreshold float64 + Cooldown time.Duration + IdleGracePeriod time.Duration + MaxSwapInPerSec uint64 + MaxSwapOutPerSec uint64 // Swap-out rate that blocks shrinking (bytes/sec). + MaxPageFaultRate uint64 // Page-fault rate that blocks shrinking (faults/sec). + ShrinkReserveBytes uint64 // Minimum MemAvailable margin required before shrinking (bytes). + SettleWindow time.Duration // Sustained low pressure required before shrink. + MaxContainerCPU float64 + MaxContainerIO uint64 + FloorStaleness time.Duration // Max age of learned floor before re-learning (0 = never stale). +} + +// BalloonController implements the balloon state machine. +// It is NOT safe for concurrent use except for SetHostPressure, which is +// protected by a mutex. All other methods must be called from a single goroutine +// (typically the balloon polling loop in hostagent). +type BalloonController struct { + cfg BalloonConfig + state BalloonState + currentBytes uint64 + + mu sync.Mutex // Protects hostPressure for concurrent access. + hostPressure HostPressure + lastActionTime time.Time + lowPressureSince time.Time // When sustained low pressure started (zero = not settled). + pollFailures int + oomTimes []time.Time + circuitBreakerT time.Time + + // E5: Learned stable floor fields. + learnedFloor uint64 // Discovered stable floor (bytes); 0 = not yet learned. + learnedAt time.Time // When the floor was learned (zero = unknown/stale). + candidateFloor uint64 // Candidate floor being confirmed. + confirmStartTime time.Time // When confirmation started. + confirmFails int // Number of failed confirmations at current candidate. + instDir string // Instance directory for persisting learned floor. + + // E10-1: PSI availability tracking. + psiAvailable bool // Set true on first non-zero PSI reading. +} + +// NewBalloonController creates a controller starting in bootstrap state at max memory. +func NewBalloonController(cfg BalloonConfig) *BalloonController { + return &BalloonController{ + cfg: cfg, + state: BalloonStateBootstrap, + currentBytes: cfg.MaxMemoryBytes, + } +} + +// effectiveFloor returns the learned floor if it is still fresh, or 0 if stale/unset. +func (c *BalloonController) effectiveFloor() uint64 { + if c.learnedFloor == 0 { + return 0 + } + if c.cfg.FloorStaleness > 0 && !c.learnedAt.IsZero() && time.Since(c.learnedAt) > c.cfg.FloorStaleness { + logrus.Infof("balloon: learned floor %d bytes is stale (age %s > %s), discarding", + c.learnedFloor, time.Since(c.learnedAt).Round(time.Second), c.cfg.FloorStaleness) + c.learnedFloor = 0 + c.learnedAt = time.Time{} + return 0 + } + return c.learnedFloor +} + +// TransitionTo changes the controller state. +func (c *BalloonController) TransitionTo(state BalloonState) { + logrus.Debugf("balloon: state %s -> %s", c.state, state) + c.state = state +} + +// SetHostPressure updates the host memory pressure level (thread-safe). +func (c *BalloonController) SetHostPressure(p HostPressure) { + c.mu.Lock() + defer c.mu.Unlock() + c.hostPressure = p +} + +// RecordAction records that an action was taken at the given time. +func (c *BalloonController) RecordAction(action BalloonAction, t time.Time) { + c.lastActionTime = t + if action.Type != BalloonActionNone { + c.currentBytes = action.TargetBytes + } +} + +// RecordOOM records an OOM event for circuit breaker tracking. +func (c *BalloonController) RecordOOM(t time.Time) { + c.lowPressureSince = time.Time{} // Reset settle window on OOM. + c.learnedFloor = 0 // Reset learned floor — workload may have changed. + c.oomTimes = append(c.oomTimes, t) + // Keep only OOMs in the last 10 minutes. + cutoff := t.Add(-10 * time.Minute) + filtered := c.oomTimes[:0] + for _, oomT := range c.oomTimes { + if oomT.After(cutoff) { + filtered = append(filtered, oomT) + } + } + c.oomTimes = filtered + + // Circuit breaker: 3+ OOMs in 10 minutes. + if len(c.oomTimes) >= 3 { + c.state = BalloonStateCircuitBreaker + c.circuitBreakerT = t + c.currentBytes = c.cfg.MaxMemoryBytes + logrus.Warnf("balloon: circuit breaker triggered (%d OOMs in 10 min), locked at max", len(c.oomTimes)) + } +} + +// RecordPollFailure records a failed metrics poll with graduated response. +// 1-2 failures: transient, no action. 3-5: grow by 50% of headroom. +// 6+: full expansion to max, enter agent failure state. +// Returns a non-nil action when memory should be expanded. +func (c *BalloonController) RecordPollFailure() *BalloonAction { + c.pollFailures++ + switch { + case c.pollFailures >= 6: + // 60s of failures — full expansion, enter agent failure state. + if c.state != BalloonStateAgentFailure { + c.TransitionTo(BalloonStateAgentFailure) + c.currentBytes = c.cfg.MaxMemoryBytes + logrus.Warn("balloon: 6 poll failures, expanding to max memory") + return &BalloonAction{Type: BalloonActionGrow, TargetBytes: c.currentBytes, Reason: "agent failure"} + } + case c.pollFailures >= 3: + // 30s of failures — grow by 50% of remaining headroom. + headroom := c.cfg.MaxMemoryBytes - c.currentBytes + if headroom > 0 { + grow := headroom / 2 + c.currentBytes = min(c.currentBytes+grow, c.cfg.MaxMemoryBytes) + logrus.Warnf("balloon: %d poll failures, growing to %s", + c.pollFailures, units.BytesSize(float64(c.currentBytes))) + return &BalloonAction{Type: BalloonActionGrow, TargetBytes: c.currentBytes, Reason: "poll failure recovery"} + } + } + // 1-2 failures: do nothing (transient). + return nil +} + +// RecordPollSuccess resets the poll failure counter and recovers from agent failure. +func (c *BalloonController) RecordPollSuccess() { + if c.pollFailures > 0 { + c.pollFailures = 0 + if c.state == BalloonStateAgentFailure { + c.TransitionTo(BalloonStateSteady) + logrus.Info("balloon: agent recovered, resuming steady state") + } + } +} + +// PrepareShutdown grows memory to max before VM stop. +func (c *BalloonController) PrepareShutdown() BalloonAction { + c.state = BalloonStateShutdown + return BalloonAction{ + Type: BalloonActionGrow, + TargetBytes: c.cfg.MaxMemoryBytes, + Reason: "graceful shutdown", + } +} + +// Evaluate examines metrics and returns the balloon action to take. +// bootTime is when the VM started (used for idle grace period). +func (c *BalloonController) Evaluate(m *api.MemoryMetrics, bootTime time.Time) BalloonAction { + none := BalloonAction{Type: BalloonActionNone, TargetBytes: c.currentBytes} + + if m == nil { + return none + } + + switch c.state { + case BalloonStateBootstrap: + return none + case BalloonStateCircuitBreaker: + // Stay at max until circuit breaker timeout (30 min). + if time.Since(c.circuitBreakerT) > 30*time.Minute { + c.state = BalloonStateSteady + c.oomTimes = nil + logrus.Info("balloon: circuit breaker reset") + } + return none + case BalloonStateAgentFailure: + return none + case BalloonStateShutdown: + return none + case BalloonStateLearningDescend: + return c.evaluateLearningDescend(m, none) + case BalloonStateLearningConfirm: + return c.evaluateLearningConfirm(m, none) + } + + // OOM handling — immediate grow, no cooldown. + if m.OomDetected { + target := min(c.currentBytes+c.currentBytes/5, c.cfg.MaxMemoryBytes) + c.RecordOOM(time.Now()) + return BalloonAction{ + Type: BalloonActionGrow, + TargetBytes: target, + Reason: "OOM detected", + } + } + + // E10-1: Track PSI availability. Set true on first non-zero reading. + if !c.psiAvailable { + if m.PsiMemorySome_10 > 0 || m.PsiMemoryFull_10 > 0 { + c.psiAvailable = true + } + } + + // E10-1: PSI unavailable guard — prevent aggressive shrinking without PSI data. + // When PSI returns all zeros (disabled kernel or first boot), fall back to + // MemAvailable-only heuristic: block shrink if MemAvailable < 40% of currentBytes, + // and trigger grow if MemAvailable < 15% (critical pressure without PSI). + if !c.psiAvailable { + if m.MemAvailableBytes < c.currentBytes*15/100 { + // Critical: MemAvailable < 15% of current balloon — grow immediately. + // Use 3× cooldown to avoid oscillation: after a shrink, the kernel needs + // time to stabilize MemAvailable before we evaluate pressure again. + noPSICooldown := c.cfg.Cooldown * 3 + if !c.lastActionTime.IsZero() && time.Since(c.lastActionTime) < noPSICooldown { + return none + } + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.GrowStepPercent) / 100 + target := min(c.currentBytes+step, c.cfg.MaxMemoryBytes) + if target > c.currentBytes { + return BalloonAction{ + Type: BalloonActionGrow, + TargetBytes: target, + Reason: "low MemAvailable (no PSI)", + } + } + return none // Already at max, nothing to do. + } + if m.MemAvailableBytes < c.currentBytes*40/100 { + return none // Block shrink but no grow needed yet. + } + } + + // E4: Host pressure modifiers — checked BEFORE E1 guest distress. + c.mu.Lock() + hp := c.hostPressure + c.mu.Unlock() + switch hp { + case HostPressureCritical: + if m.PsiMemoryFull_10 > 5.0 { + // Severe guest distress even by host-critical standards — hold steady. + if m.PsiMemorySome_10 < c.cfg.HighPressureThreshold { + return none + } + // some >= high → fall through to grow path. + } else if time.Since(c.lastActionTime) >= c.cfg.Cooldown { + // Guest not severely distressed — aggressive 2× shrink. + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) * 2 / 100 + var target uint64 + if c.currentBytes > step { + target = max(c.cfg.MinBytes, c.currentBytes-step) + } else { + target = c.cfg.MinBytes + } + return BalloonAction{ + Type: BalloonActionShrink, TargetBytes: target, + Reason: "host critical pressure", + } + } + case HostPressureWarning: + // Bypass settle window and E2 guards; cooldown still applies. + guestDistressed := m.PsiMemoryFull_10 > 0 + if !guestDistressed && m.PsiMemorySome_10 < c.cfg.HighPressureThreshold { + if time.Since(c.lastActionTime) >= c.cfg.Cooldown { + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) / 100 + var target uint64 + if c.currentBytes > step { + target = max(c.cfg.MinBytes, c.currentBytes-step) + } else { + target = c.cfg.MinBytes + } + return BalloonAction{ + Type: BalloonActionShrink, TargetBytes: target, + Reason: "host warning pressure", + } + } + } + case HostPressureNormal: + // Existing behavior — settle window and cooldown apply. + } + + // E1: Guest distress detection — only under Normal/Warning. + // Under Critical, already handled above with relaxed full > 5.0 threshold. + if hp == HostPressureNormal || hp == HostPressureWarning { + if m.PsiMemoryFull_10 > 0 { + if m.PsiMemorySome_10 < c.cfg.HighPressureThreshold { + return none + } + } + } + + // E3: Reset settle window when pressure rises above low threshold. + if m.PsiMemorySome_10 >= c.cfg.LowPressureThreshold { + c.lowPressureSince = time.Time{} + } + + // High pressure — grow immediately (no cooldown for grow). + if m.PsiMemorySome_10 >= c.cfg.HighPressureThreshold { + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.GrowStepPercent) / 100 + if m.PsiMemoryFull_10 > 0 { + step = step * 3 / 2 // 1.5× step when all tasks are stalled. + } + target := min(c.currentBytes+step, c.cfg.MaxMemoryBytes) + return BalloonAction{ + Type: BalloonActionGrow, + TargetBytes: target, + Reason: "high memory pressure", + } + } + + // Low pressure — consider shrinking. + if m.PsiMemorySome_10 < c.cfg.LowPressureThreshold { + // Check cooldown. + if !c.lastActionTime.IsZero() && time.Since(c.lastActionTime) < c.cfg.Cooldown { + return none + } + + // Check idle grace period. + if time.Since(bootTime) < c.cfg.IdleGracePeriod { + return none + } + + // E3: Check settle window — require sustained low pressure before shrinking. + if c.cfg.SettleWindow > 0 { + if c.lowPressureSince.IsZero() { + c.lowPressureSince = time.Now() + return none // Just started settling. + } + if time.Since(c.lowPressureSince) < c.cfg.SettleWindow { + return none // Still settling. + } + } + + // Check swap activity guard. + if c.cfg.MaxSwapInPerSec > 0 && m.SwapInBytesPerSec > float64(c.cfg.MaxSwapInPerSec) { + return none + } + + // E2: Check swap-out activity guard. + if c.cfg.MaxSwapOutPerSec > 0 && m.SwapOutBytesPerSec > float64(c.cfg.MaxSwapOutPerSec) { + return none + } + // E2: Check page-fault rate guard. + if c.cfg.MaxPageFaultRate > 0 && m.PageFaultRate > float64(c.cfg.MaxPageFaultRate) { + return none + } + // E2: Check MemAvailable reserve — do not shrink if available memory is too thin. + if c.cfg.ShrinkReserveBytes > 0 { + shrinkStep := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) / 100 + if m.MemAvailableBytes < c.cfg.ShrinkReserveBytes+shrinkStep { + return none + } + } + + // Check container activity guards (skip if no containers). + if m.ContainerCount > 0 { + if c.cfg.MaxContainerCPU > 0 && m.ContainerCpuPercent > c.cfg.MaxContainerCPU { + return none + } + if c.cfg.MaxContainerIO > 0 && m.ContainerIoBytesPerSec > float64(c.cfg.MaxContainerIO) { + return none + } + } + + // Compute shrink target (multiply before divide to avoid truncation to zero). + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) / 100 + target := c.currentBytes + if step < target { + target -= step + } else { + target = c.cfg.MinBytes + } + + // E10-8: Adaptive AnonRss margin based on container activity. + var marginPct uint64 + switch { + case m.ContainerCount == 0: + marginPct = 5 // Idle: kernel + system processes only. + case m.ContainerCount <= 5: + marginPct = 15 // Light workload. + default: + marginPct = 20 // Heavy workload: more headroom. + } + // Apply hard floor: max(min, anon_rss * (1+margin), effectiveFloor). + floor := c.effectiveFloor() + hardFloor := max(m.AnonRssBytes+m.AnonRssBytes*marginPct/100, c.cfg.MinBytes, floor) + if target < hardFloor { + target = hardFloor + } + + // Cap at idleTarget if we're above it. + if target > c.cfg.IdleTargetBytes && c.currentBytes > c.cfg.IdleTargetBytes { + target = c.cfg.IdleTargetBytes + } else if c.currentBytes <= c.cfg.IdleTargetBytes { + return none // Already at or below idle target. + } + + // Apply hard floor AFTER idleTarget cap so floor always wins. + if target < hardFloor { + target = hardFloor + } + + // Only shrink if target is actually less than current. + if target >= c.currentBytes { + return none + } + + // No-PSI safety: prevent shrink if it would push MemAvailable below the + // critical grow threshold (15%). Without PSI, this is the only signal we + // have, and we must avoid shrink→grow oscillation. + if !c.psiAvailable { + shrinkAmount := c.currentBytes - target + if shrinkAmount > m.MemAvailableBytes { + return none // Shrink would consume more than all available memory. + } + expectedAvail := m.MemAvailableBytes - shrinkAmount + if expectedAvail < target*20/100 { + return none // Would leave < 20% headroom, risking oscillation. + } + } + + // E3: Reset settle window so consecutive shrinks each require a fresh window. + c.lowPressureSince = time.Time{} + + // E5: After first settle-window shrink, start learning the stable floor. + if c.effectiveFloor() == 0 && m.ContainerCount == 0 { + c.TransitionTo(BalloonStateLearningDescend) + } + + return BalloonAction{ + Type: BalloonActionShrink, + TargetBytes: target, + Reason: "low pressure shrink", + } + } + + return none +} + +// evaluateLearningDescend handles the descend phase of floor learning. +func (c *BalloonController) evaluateLearningDescend(m *api.MemoryMetrics, none BalloonAction) BalloonAction { + // Suspend descend under host pressure Warning/Critical. + c.mu.Lock() + hp := c.hostPressure + c.mu.Unlock() + if hp != HostPressureNormal { + return none + } + + // Detect instability: PSI spike, full stall, or excessive swap-out. + unstable := m.PsiMemorySome_10 >= c.cfg.HighPressureThreshold || m.PsiMemoryFull_10 > 0 || + (c.cfg.MaxSwapOutPerSec > 0 && m.SwapOutBytesPerSec > float64(c.cfg.MaxSwapOutPerSec)) + if unstable { + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) / 100 + c.candidateFloor = min(c.currentBytes+step, c.cfg.IdleTargetBytes) + c.confirmStartTime = time.Now() + c.confirmFails = 0 + c.TransitionTo(BalloonStateLearningConfirm) + return BalloonAction{ + Type: BalloonActionGrow, TargetBytes: c.candidateFloor, + Reason: "learning: instability detected, confirming floor", + } + } + + if time.Since(c.lastActionTime) >= c.cfg.Cooldown { + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) / 100 + if c.currentBytes <= c.cfg.MinBytes+step { + // Reached min with no instability — set floor just above min. + c.learnedFloor = c.cfg.MinBytes + step + c.learnedAt = time.Now() + c.TransitionTo(BalloonStateSteady) + _ = store.WriteLearnedFloor(c.instDir, c.learnedFloor, c.learnedAt) + logrus.Infof("balloon: learned floor at min boundary: %d bytes", c.learnedFloor) + return none + } + target := c.currentBytes - step + return BalloonAction{ + Type: BalloonActionShrink, TargetBytes: target, + Reason: "learning: descending to find floor", + } + } + return none +} + +// evaluateLearningConfirm handles the confirm phase of floor learning. +func (c *BalloonController) evaluateLearningConfirm(m *api.MemoryMetrics, none BalloonAction) BalloonAction { + // Detect instability using same signals as descend. + unstable := m.PsiMemorySome_10 >= c.cfg.HighPressureThreshold || m.PsiMemoryFull_10 > 0 || + (c.cfg.MaxSwapOutPerSec > 0 && m.SwapOutBytesPerSec > float64(c.cfg.MaxSwapOutPerSec)) + if unstable { + c.confirmFails++ + if c.confirmFails >= learnMaxConfirmFails { + c.learnedFloor = c.cfg.IdleTargetBytes // Conservative fallback. + c.learnedAt = time.Now() + c.TransitionTo(BalloonStateSteady) + _ = store.WriteLearnedFloor(c.instDir, c.learnedFloor, c.learnedAt) + logrus.Infof("balloon: learning failed %d times, using idleTarget as floor", c.confirmFails) + return none + } + step := c.cfg.MaxMemoryBytes * uint64(c.cfg.ShrinkStepPercent) / 100 + c.candidateFloor = min(c.candidateFloor+step, c.cfg.IdleTargetBytes) + c.confirmStartTime = time.Now() + return BalloonAction{ + Type: BalloonActionGrow, TargetBytes: c.candidateFloor, + Reason: "learning: raising candidate after instability", + } + } + if time.Since(c.confirmStartTime) >= learnConfirmDuration { + c.learnedFloor = c.candidateFloor + c.learnedAt = time.Now() + c.TransitionTo(BalloonStateSteady) + _ = store.WriteLearnedFloor(c.instDir, c.learnedFloor, c.learnedAt) + logrus.Infof("balloon: learned stable floor at %d bytes", c.learnedFloor) + return none + } + return none +} diff --git a/pkg/hostagent/balloon_controller_test.go b/pkg/hostagent/balloon_controller_test.go new file mode 100644 index 00000000000..5b0b26f6da6 --- /dev/null +++ b/pkg/hostagent/balloon_controller_test.go @@ -0,0 +1,1365 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +package hostagent + +import ( + "strings" + "testing" + "time" + + "gotest.tools/v3/assert" + + "github.com/lima-vm/lima/v2/pkg/guestagent/api" + "github.com/lima-vm/lima/v2/pkg/store" +) + +func newTestConfig() BalloonConfig { + return BalloonConfig{ + MaxMemoryBytes: 12 * 1024 * 1024 * 1024, // 12 GiB + MinBytes: 3 * 1024 * 1024 * 1024, // 3 GiB + IdleTargetBytes: 4 * 1024 * 1024 * 1024, // 4 GiB + GrowStepPercent: 25, + ShrinkStepPercent: 10, + HighPressureThreshold: 0.88, + LowPressureThreshold: 0.35, + Cooldown: 30 * time.Second, + IdleGracePeriod: 5 * time.Minute, + } +} + +func idleMetrics() *api.MemoryMetrics { + return &api.MemoryMetrics{ + MemTotalBytes: 12 * 1024 * 1024 * 1024, + MemAvailableBytes: 10 * 1024 * 1024 * 1024, + PsiMemorySome_10: 0.1, + PsiMemoryFull_10: 0.0, + AnonRssBytes: 1 * 1024 * 1024 * 1024, + ContainerCount: 0, + } +} + +func pressureMetrics() *api.MemoryMetrics { + return &api.MemoryMetrics{ + MemTotalBytes: 12 * 1024 * 1024 * 1024, + MemAvailableBytes: 1 * 1024 * 1024 * 1024, + PsiMemorySome_10: 0.95, + PsiMemoryFull_10: 0.80, + AnonRssBytes: 10 * 1024 * 1024 * 1024, + ContainerCount: 5, + } +} + +func TestBalloonController_IdleShrink(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + + // After bootstrap, feed idle metrics. Current allocation should trend toward idleTarget. + ctrl.TransitionTo(BalloonStateSteady) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Assert(t, action.TargetBytes <= cfg.IdleTargetBytes) + assert.Assert(t, action.TargetBytes >= cfg.MinBytes) +} + +func TestBalloonController_NoOscillation(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // Grow due to pressure. + action := ctrl.Evaluate(pressureMetrics(), time.Now()) + assert.Equal(t, action.Type, BalloonActionGrow) + + // Immediately after grow, idle metrics should NOT trigger shrink (cooldown). + ctrl.RecordAction(action, time.Now()) + action2 := ctrl.Evaluate(idleMetrics(), time.Now()) + assert.Equal(t, action2.Type, BalloonActionNone) +} + +func TestBalloonController_FastGrow(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // High pressure should trigger immediate grow. + action := ctrl.Evaluate(pressureMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Assert(t, action.TargetBytes > cfg.IdleTargetBytes) +} + +func TestBalloonController_OOMRecovery(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 + + // OOM detected should grow to previous + 20%. + oomMetrics := pressureMetrics() + oomMetrics.OomDetected = true + action := ctrl.Evaluate(oomMetrics, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + expectedMin := uint64(float64(ctrl.currentBytes) * 1.20) + assert.Assert(t, action.TargetBytes >= expectedMin) +} + +func TestBalloonController_CircuitBreaker(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // 3 OOMs in 10 minutes should lock at max. + // Note: Evaluate() calls RecordOOM() internally when OomDetected is true. + now := time.Now() + for i := range 3 { + oomMetrics := pressureMetrics() + oomMetrics.OomDetected = true + action := ctrl.Evaluate(oomMetrics, now.Add(-6*time.Minute)) + ctrl.RecordAction(action, now.Add(time.Duration(i)*time.Minute)) + } + assert.Equal(t, ctrl.state, BalloonStateCircuitBreaker) + assert.Equal(t, ctrl.currentBytes, cfg.MaxMemoryBytes) +} + +func TestBalloonController_HardFloor(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 10 * 1024 * 1024 * 1024 // 10 GiB — well above idle target + + // Hard floor = max(min, anon_rss * margin). + // With 0 containers, adaptive margin is 5%. + metrics := idleMetrics() + metrics.AnonRssBytes = 4 * 1024 * 1024 * 1024 // 4 GiB anon RSS + action := ctrl.Evaluate(metrics, time.Now().Add(-6*time.Minute)) + + hardFloor := max(uint64(float64(metrics.AnonRssBytes)*1.05), cfg.MinBytes) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Assert(t, action.TargetBytes >= hardFloor) +} + +func TestBalloonController_GracefulShutdown(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // Shutdown should grow to max. + action := ctrl.PrepareShutdown() + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Equal(t, action.TargetBytes, cfg.MaxMemoryBytes) +} + +func TestBalloonController_BootstrapTimeout(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + + // In bootstrap state, should stay at max. + assert.Equal(t, ctrl.state, BalloonStateBootstrap) + assert.Equal(t, ctrl.currentBytes, cfg.MaxMemoryBytes) +} + +func TestBalloonController_AgentFailure(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // 6 consecutive poll failures should expand to max (graduated: 3=partial, 6=full). + for range 6 { + ctrl.RecordPollFailure() + } + assert.Equal(t, ctrl.state, BalloonStateAgentFailure) + assert.Equal(t, ctrl.currentBytes, cfg.MaxMemoryBytes) +} + +// --- Edge case tests --- + +func TestBalloonController_ZeroMetrics(t *testing.T) { + // All-zero metrics should not panic or produce invalid actions. + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + m := &api.MemoryMetrics{} // All zeros. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + // Zero PSI means psiAvailable=false, and MemAvailable=0 < 15% of currentBytes. + // The no-PSI fallback would trigger grow, but currentBytes == MaxMemoryBytes, + // so grow is capped and returns None instead. + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_PressureExactlyAtThreshold(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // PSI exactly at high threshold should trigger grow. + m := idleMetrics() + m.PsiMemorySome_10 = cfg.HighPressureThreshold + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) +} + +func TestBalloonController_PressureJustBelowHigh(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // PSI just below high threshold: no grow, no shrink (in between thresholds). + m := idleMetrics() + m.PsiMemorySome_10 = cfg.HighPressureThreshold - 0.01 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + // Between low and high → no action. + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_PressureExactlyAtLow(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // PSI exactly at low threshold should NOT trigger shrink (< not <=). + m := idleMetrics() + m.PsiMemorySome_10 = cfg.LowPressureThreshold + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_AlreadyAtMin(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.MinBytes // Already at min. + + m := idleMetrics() + m.AnonRssBytes = 1 * 1024 * 1024 * 1024 // 1 GiB anon RSS. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + // currentBytes <= idleTarget → return none (already at or below idle target). + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_AlreadyAtMax(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.MaxMemoryBytes + + // Grow when already at max should cap at max. + m := pressureMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Equal(t, action.TargetBytes, cfg.MaxMemoryBytes) +} + +func TestBalloonController_OOMAtMax(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.MaxMemoryBytes + + // OOM when already at max: target is 120% of max, capped to max. + m := pressureMetrics() + m.OomDetected = true + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Equal(t, action.TargetBytes, cfg.MaxMemoryBytes) +} + +func TestBalloonController_CircuitBreakerRecovery(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // Trigger circuit breaker via Evaluate (matches production code path). + now := time.Now() + for i := range 3 { + oomMetrics := pressureMetrics() + oomMetrics.OomDetected = true + action := ctrl.Evaluate(oomMetrics, now.Add(-6*time.Minute)) + ctrl.RecordAction(action, now.Add(time.Duration(i)*time.Minute)) + } + assert.Equal(t, ctrl.state, BalloonStateCircuitBreaker) + + // Before 30 min, should stay locked. + ctrl.circuitBreakerT = now.Add(-29 * time.Minute) + action := ctrl.Evaluate(idleMetrics(), now.Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) + assert.Equal(t, ctrl.state, BalloonStateCircuitBreaker) + + // After 30 min, should recover to steady. + ctrl.circuitBreakerT = now.Add(-31 * time.Minute) + action = ctrl.Evaluate(idleMetrics(), now.Add(-6*time.Minute)) + assert.Equal(t, ctrl.state, BalloonStateSteady) +} + +func TestBalloonController_CooldownExactlyExpired(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // Record action exactly cooldown ago. + ctrl.RecordAction(BalloonAction{Type: BalloonActionShrink, TargetBytes: 8 * 1024 * 1024 * 1024}, time.Now().Add(-cfg.Cooldown)) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + // Cooldown check uses `<`, so exactly at cooldown should pass. + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloonController_SwapActivityBlocksShrink(t *testing.T) { + cfg := newTestConfig() + cfg.MaxSwapInPerSec = 64 * 1024 * 1024 // 64 MiB/s. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + m := idleMetrics() + m.SwapInBytesPerSec = 100 * 1024 * 1024 // 100 MiB/s — above threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_ContainerActivityBlocksShrink(t *testing.T) { + cfg := newTestConfig() + cfg.MaxContainerCPU = 10.0 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + m := idleMetrics() + m.ContainerCount = 2 + m.ContainerCpuPercent = 15.0 // Above threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_PollFailureResets(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // 2 failures, then success — should stay in steady (transient, no change). + ctrl.RecordPollFailure() + ctrl.RecordPollFailure() + ctrl.RecordPollSuccess() + assert.Equal(t, ctrl.state, BalloonStateSteady) + + // Need 3 more failures to trigger partial grow. + ctrl.RecordPollFailure() + ctrl.RecordPollFailure() + assert.Equal(t, ctrl.state, BalloonStateSteady) // Not yet (only 2). +} + +func TestBalloonController_ShrinkStepLargerThanCurrent(t *testing.T) { + // When shrinkStep > currentBytes, target should clamp to min. + cfg := newTestConfig() + cfg.ShrinkStepPercent = 100 // 100% of max = 12 GiB step. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 5 * 1024 * 1024 * 1024 // 5 GiB. + + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Assert(t, action.TargetBytes >= cfg.MinBytes) +} + +func TestBalloonController_IdleGracePeriodBlocksShrink(t *testing.T) { + cfg := newTestConfig() + cfg.IdleGracePeriod = 5 * time.Minute + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + + // Boot time is recent — within grace period. + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-2*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_NilMetrics(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + // nil metrics should not panic; returns no-op. + action := ctrl.Evaluate(nil, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_IntegerMathOOM(t *testing.T) { + // Verify OOM growth uses integer math (currentBytes + currentBytes/5). + cfg := newTestConfig() + cfg.MaxMemoryBytes = 20 * 1024 * 1024 * 1024 // 20 GiB. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + // Set current to 10 GiB via RecordAction. + ctrl.RecordAction(BalloonAction{Type: BalloonActionGrow, TargetBytes: 10 * 1024 * 1024 * 1024}, time.Time{}) + + m := &api.MemoryMetrics{ + OomDetected: true, + PsiMemorySome_10: 0.1, + } + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + // 10 GiB + 10 GiB/5 = 12 GiB. + expected := uint64(12 * 1024 * 1024 * 1024) + assert.Equal(t, action.TargetBytes, expected) +} + +func TestBalloonController_IntegerMathGrowStep(t *testing.T) { + // Verify grow step uses integer math (maxBytes * percent / 100). + cfg := newTestConfig() + cfg.MaxMemoryBytes = 12 * 1024 * 1024 * 1024 // 12 GiB. + cfg.GrowStepPercent = 25 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + // Set current to 8 GiB via RecordAction. + ctrl.RecordAction(BalloonAction{Type: BalloonActionGrow, TargetBytes: 8 * 1024 * 1024 * 1024}, time.Time{}) + + m := &api.MemoryMetrics{PsiMemorySome_10: 0.95} // High pressure. + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + // step = 12 GiB * 25 / 100 (multiply-first for consistency with shrink). + step := uint64(12*1024*1024*1024) * 25 / 100 + expected := uint64(8*1024*1024*1024) + step + assert.Equal(t, action.TargetBytes, expected) +} + +// --- E1: PSI Full Hard Stop tests --- + +func TestBalloon_PsiFullBlocksShrink(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 // Above idle target. + + // PSI some below low threshold (would normally shrink), but full > 0. + m := idleMetrics() + m.PsiMemorySome_10 = 0.1 + m.PsiMemoryFull_10 = 0.05 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_PsiFullAllowsGrow(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 4 * 1024 * 1024 * 1024 + + // PSI some above high threshold AND full > 0 — should grow. + m := pressureMetrics() + m.PsiMemorySome_10 = 1.0 + m.PsiMemoryFull_10 = 0.9 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) +} + +func TestBalloon_PsiFullGrowBoost(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 4 * 1024 * 1024 * 1024 + + // High pressure with PSI full > 0 — should get 1.5× grow step. + m := pressureMetrics() + m.PsiMemorySome_10 = 0.95 + m.PsiMemoryFull_10 = 0.5 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + + // Verify boosted step: step = MaxMem * GrowPct / 100 * 3/2. + normalStep := cfg.MaxMemoryBytes * uint64(cfg.GrowStepPercent) / 100 + boostedStep := normalStep * 3 / 2 + expected := uint64(4*1024*1024*1024) + boostedStep + assert.Equal(t, action.TargetBytes, expected) +} + +func TestBalloon_PsiFullZeroNoEffect(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // PSI full = 0 — normal shrink behavior, E1 has no effect. + m := idleMetrics() + m.PsiMemorySome_10 = 0.1 + m.PsiMemoryFull_10 = 0.0 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +// --- E2: Swap-Out Rate Signal tests --- + +func TestBalloon_SwapOutBlocksShrink(t *testing.T) { + cfg := newTestConfig() + cfg.MaxSwapOutPerSec = 32 * 1024 * 1024 // 32 MiB/s. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.SwapOutBytesPerSec = 40 * 1024 * 1024 // 40 MiB/s — above threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_SwapOutBelowThreshold(t *testing.T) { + cfg := newTestConfig() + cfg.MaxSwapOutPerSec = 32 * 1024 * 1024 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.SwapOutBytesPerSec = 5 * 1024 * 1024 // 5 MiB/s — below threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloon_SwapOutZeroThreshold(t *testing.T) { + cfg := newTestConfig() + cfg.MaxSwapOutPerSec = 0 // Disabled. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.SwapOutBytesPerSec = 100 * 1024 * 1024 // High swap-out, but check disabled. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloon_PageFaultRateBlocksShrink(t *testing.T) { + cfg := newTestConfig() + cfg.MaxPageFaultRate = 5000 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.PageFaultRate = 6000 // Above threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_PageFaultRateBelowThreshold(t *testing.T) { + cfg := newTestConfig() + cfg.MaxPageFaultRate = 5000 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.PageFaultRate = 1000 // Below threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloon_MemAvailableReserveBlocksShrink(t *testing.T) { + cfg := newTestConfig() + cfg.ShrinkReserveBytes = 128 * 1024 * 1024 // 128 MiB. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.MemAvailableBytes = 100 * 1024 * 1024 // 100 MiB — below reserve + step. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_MemAvailableReserveAllowsShrink(t *testing.T) { + cfg := newTestConfig() + cfg.ShrinkReserveBytes = 128 * 1024 * 1024 // 128 MiB. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.MemAvailableBytes = 2 * 1024 * 1024 * 1024 // 2 GiB — well above reserve + step. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +// --- E3: Settle Window tests --- + +func TestBalloon_SettleWindowPreventsEarlyShrink(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + + // First evaluate: starts settle window, returns none. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) + assert.Assert(t, !ctrl.lowPressureSince.IsZero()) + + // Second evaluate: still within 30s, returns none. + action = ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) + + // Simulate 30s passing. + ctrl.lowPressureSince = time.Now().Add(-31 * time.Second) + action = ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloon_SettleWindowResetOnHighPressure(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // Start settle window. + ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Assert(t, !ctrl.lowPressureSince.IsZero()) + + // High pressure resets settle window. + ctrl.Evaluate(pressureMetrics(), time.Now().Add(-6*time.Minute)) + assert.Assert(t, ctrl.lowPressureSince.IsZero()) +} + +func TestBalloon_SettleWindowZeroDisabled(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 0 // Disabled. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloon_SettleWindowWithCooldown(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + cfg.Cooldown = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // Settle window passed but cooldown hasn't — cooldown blocks. + ctrl.lowPressureSince = time.Now().Add(-31 * time.Second) + ctrl.RecordAction(BalloonAction{Type: BalloonActionShrink, TargetBytes: 8 * 1024 * 1024 * 1024}, time.Now()) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_SettleResetOnOOM(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // Accumulate settle window. + ctrl.lowPressureSince = time.Now().Add(-20 * time.Second) + assert.Assert(t, !ctrl.lowPressureSince.IsZero()) + + // OOM resets settle window. + ctrl.RecordOOM(time.Now()) + assert.Assert(t, ctrl.lowPressureSince.IsZero()) +} + +func TestBalloon_SettlePsiFullInteraction(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // Settle window has passed. + ctrl.lowPressureSince = time.Now().Add(-31 * time.Second) + + // But PSI full > 0 — E1 guard fires before settle check. + m := idleMetrics() + m.PsiMemorySome_10 = 0.1 + m.PsiMemoryFull_10 = 0.05 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +// --- E4: Host-Side Memory Pressure tests --- + +func TestBalloon_HostCriticalShrinks(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureCritical) + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, action.Reason, "host critical pressure") + // 2× step: MaxMem * ShrinkPct * 2 / 100. + assert.Assert(t, action.TargetBytes < ctrl.currentBytes) + assert.Assert(t, action.TargetBytes >= cfg.MinBytes) +} + +func TestBalloon_HostCriticalRespectsDistress(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureCritical) + m := idleMetrics() + m.PsiMemoryFull_10 = 8.0 // Severe guest distress > 5.0. + m.PsiMemorySome_10 = 0.1 // Below high threshold. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_HostCriticalAllowsMildFullStall(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureCritical) + m := idleMetrics() + m.PsiMemoryFull_10 = 3.0 // Mild full stall <= 5.0 — tolerable under critical. + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, action.Reason, "host critical pressure") +} + +func TestBalloon_HostCriticalRespectsFloor(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.MinBytes + 100*1024*1024 // Just above min. + + ctrl.SetHostPressure(HostPressureCritical) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, action.TargetBytes, cfg.MinBytes) +} + +func TestBalloon_HostCriticalUnderflowGuard(t *testing.T) { + cfg := newTestConfig() + cfg.ShrinkStepPercent = 80 // 80% of 12 GiB = 9.6 GiB; 2× = 19.2 GiB > currentBytes. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 5 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureCritical) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, action.TargetBytes, cfg.MinBytes) // Clamped to min, not underflowed. +} + +func TestBalloon_HostCriticalRespectsCooldown(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureCritical) + ctrl.RecordAction(BalloonAction{Type: BalloonActionShrink, TargetBytes: 8 * 1024 * 1024 * 1024}, time.Now()) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) // Cooldown blocks. +} + +func TestBalloon_HostWarningBypassesSettle(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureWarning) + // No settle window accumulated — Warning bypasses it. + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, action.Reason, "host warning pressure") +} + +func TestBalloon_HostWarningRespectsCooldown(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureWarning) + ctrl.RecordAction(BalloonAction{Type: BalloonActionShrink, TargetBytes: 8 * 1024 * 1024 * 1024}, time.Now()) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) // Cooldown blocks. +} + +func TestBalloon_HostWarningDoesNotResetLowPressureSince(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + settleTime := time.Now().Add(-20 * time.Second) + ctrl.lowPressureSince = settleTime + + ctrl.SetHostPressure(HostPressureWarning) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, ctrl.lowPressureSince, settleTime) // Not reset by Warning shrink. +} + +func TestBalloon_HostNormalSettleRequired(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + ctrl.SetHostPressure(HostPressureNormal) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) // Settle window not met. +} + +func TestBalloon_HostCriticalToNormalRecovery(t *testing.T) { + cfg := newTestConfig() + cfg.SettleWindow = 30 * time.Second + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // Critical shrink. + ctrl.SetHostPressure(HostPressureCritical) + action := ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + ctrl.RecordAction(action, time.Now().Add(-31*time.Second)) // Pretend action was 31s ago. + + // Switch to Normal — settle window required. + ctrl.SetHostPressure(HostPressureNormal) + action = ctrl.Evaluate(idleMetrics(), time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) // Settle window just started. + assert.Assert(t, !ctrl.lowPressureSince.IsZero()) +} + +// --- E5: Learned Stable Floor tests --- + +func TestBalloon_LearnDescendOnLowPressure(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateLearningDescend) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 // 6 GiB. + + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + assert.Equal(t, action.Reason, "learning: descending to find floor") +} + +func TestBalloon_LearnDetectInstability(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateLearningDescend) + ctrl.currentBytes = 5 * 1024 * 1024 * 1024 + + // PSI spike during descend — should set candidate floor. + m := pressureMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Equal(t, ctrl.state, BalloonStateLearningConfirm) + assert.Assert(t, ctrl.candidateFloor > 0) +} + +func TestBalloon_LearnConfirmStable(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.instDir = t.TempDir() + ctrl.TransitionTo(BalloonStateLearningConfirm) + ctrl.currentBytes = 5 * 1024 * 1024 * 1024 + ctrl.candidateFloor = 5 * 1024 * 1024 * 1024 + ctrl.confirmStartTime = time.Now().Add(-6 * time.Minute) // 6 min ago > learnConfirmDuration. + + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) + assert.Equal(t, ctrl.state, BalloonStateSteady) + assert.Equal(t, ctrl.learnedFloor, uint64(5*1024*1024*1024)) + + // Verify persistence. + v, _, err := store.ReadLearnedFloor(ctrl.instDir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(5*1024*1024*1024)) +} + +func TestBalloon_LearnConfirmUnstable(t *testing.T) { + cfg := newTestConfig() + cfg.IdleTargetBytes = 8 * 1024 * 1024 * 1024 // Room for candidate to grow. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateLearningConfirm) + ctrl.currentBytes = 5 * 1024 * 1024 * 1024 + ctrl.candidateFloor = 5 * 1024 * 1024 * 1024 + ctrl.confirmStartTime = time.Now() + + // PSI spike during confirm — should raise candidate. + m := pressureMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Equal(t, ctrl.confirmFails, 1) + assert.Assert(t, ctrl.candidateFloor > uint64(5*1024*1024*1024)) +} + +func TestBalloon_LearnedFloorBlocksShrink(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 + ctrl.learnedFloor = 5 * 1024 * 1024 * 1024 // Learned floor at 5 GiB. + + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + // Should shrink but respect learnedFloor as hardFloor. + if action.Type == BalloonActionShrink { + assert.Assert(t, action.TargetBytes >= ctrl.learnedFloor) + } +} + +func TestBalloon_LearnedFloorReset(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.learnedFloor = 5 * 1024 * 1024 * 1024 + + // OOM should reset learned floor. + ctrl.RecordOOM(time.Now()) + assert.Equal(t, ctrl.learnedFloor, uint64(0)) +} + +func TestBalloon_LearnedFloorPersistence(t *testing.T) { + dir := t.TempDir() + err := store.WriteLearnedFloor(dir, 4*1024*1024*1024, time.Now()) + assert.NilError(t, err) + + v, _, err := store.ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(4*1024*1024*1024)) +} + +func TestBalloon_LearnedFloorPersistenceCorrupt(t *testing.T) { + dir := t.TempDir() + v, _, err := store.ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(0)) // Not found → 0. +} + +func TestBalloon_LearnNotStartedAtBoot(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + // Containers running — learning should NOT trigger. + m := idleMetrics() + m.ContainerCount = 3 + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) + // State should remain Steady (not LearningDescend). + assert.Equal(t, ctrl.state, BalloonStateSteady) +} + +func TestBalloon_LearnDescendReachesMin(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.instDir = t.TempDir() + ctrl.TransitionTo(BalloonStateLearningDescend) + // Set currentBytes near min. + step := cfg.MaxMemoryBytes * uint64(cfg.ShrinkStepPercent) / 100 + ctrl.currentBytes = cfg.MinBytes + step - 1 // Just below min + step. + + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) + assert.Equal(t, ctrl.state, BalloonStateSteady) + assert.Equal(t, ctrl.learnedFloor, cfg.MinBytes+step) +} + +func TestBalloon_LearnedFloorInvalidatedOnResize(t *testing.T) { + cfg := newTestConfig() + // Learned floor > idleTarget should be discarded. + floor := cfg.IdleTargetBytes + 1024 + assert.Assert(t, floor > cfg.IdleTargetBytes) + // Simulate startup validation. + if floor > cfg.IdleTargetBytes || floor < cfg.MinBytes { + floor = 0 + } + assert.Equal(t, floor, uint64(0)) + + // Learned floor < min should be discarded. + floor2 := cfg.MinBytes - 1 + if floor2 > cfg.IdleTargetBytes || floor2 < cfg.MinBytes { + floor2 = 0 + } + assert.Equal(t, floor2, uint64(0)) +} + +func TestBalloon_LearnSuspendsUnderHostPressure(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateLearningDescend) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 + + // Host Warning — learning should suspend. + ctrl.SetHostPressure(HostPressureWarning) + m := idleMetrics() + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) + assert.Equal(t, ctrl.state, BalloonStateLearningDescend) // Still in learning, just suspended. +} + +// --- E10-3: Graduated poll failure tests --- + +func TestBalloon_GraduatedPollFailure_TransientNoChange(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // 1-2 failures should not change memory and return nil (no action). + action := ctrl.RecordPollFailure() + assert.Assert(t, action == nil) + assert.Equal(t, ctrl.currentBytes, cfg.IdleTargetBytes) + assert.Equal(t, ctrl.state, BalloonStateSteady) + + action = ctrl.RecordPollFailure() + assert.Assert(t, action == nil) + assert.Equal(t, ctrl.currentBytes, cfg.IdleTargetBytes) + assert.Equal(t, ctrl.state, BalloonStateSteady) +} + +func TestBalloon_GraduatedPollFailure_ThreeGrowsHalfHeadroom(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes // 4 GiB + + // 3 failures should grow by 50% of headroom. + // headroom = 12 GiB - 4 GiB = 8 GiB; half = 4 GiB; target = 8 GiB. + var action *BalloonAction + for range 3 { + action = ctrl.RecordPollFailure() + } + assert.Assert(t, action != nil) + assert.Equal(t, action.Type, BalloonActionGrow) + expectedTarget := cfg.IdleTargetBytes + (cfg.MaxMemoryBytes-cfg.IdleTargetBytes)/2 + assert.Equal(t, action.TargetBytes, expectedTarget) + assert.Equal(t, ctrl.currentBytes, expectedTarget) + assert.Equal(t, ctrl.state, BalloonStateSteady) // NOT agent failure yet. +} + +func TestBalloon_GraduatedPollFailure_SixFullExpansion(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // 6 failures should expand to max and enter agent failure. + var action *BalloonAction + for range 6 { + action = ctrl.RecordPollFailure() + } + assert.Assert(t, action != nil) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Equal(t, action.TargetBytes, cfg.MaxMemoryBytes) + assert.Equal(t, ctrl.currentBytes, cfg.MaxMemoryBytes) + assert.Equal(t, ctrl.state, BalloonStateAgentFailure) +} + +func TestBalloon_GraduatedPollFailure_RecoveryResets(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // 4 failures (partial grow), then success resets. + for range 4 { + ctrl.RecordPollFailure() + } + assert.Assert(t, ctrl.currentBytes > cfg.IdleTargetBytes) + assert.Equal(t, ctrl.state, BalloonStateSteady) + + ctrl.RecordPollSuccess() + assert.Equal(t, ctrl.pollFailures, 0) + + // Need 3 more failures to trigger partial grow again. + ctrl.RecordPollFailure() + ctrl.RecordPollFailure() + assert.Equal(t, ctrl.state, BalloonStateSteady) +} + +// --- E10-1: PSI availability fallback tests --- + +func TestBalloon_PsiUnavailable_BlocksShrinkWhenMemTight(t *testing.T) { + // When PSI is unavailable (all zeros) and MemAvailable < 40% of currentBytes, + // the balloon should NOT shrink. + cfg := newTestConfig() + cfg.SettleWindow = 0 // Disable settle window for simplicity. + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 // 8 GiB + + m := &api.MemoryMetrics{ + MemTotalBytes: 12 * 1024 * 1024 * 1024, + MemAvailableBytes: 2 * 1024 * 1024 * 1024, // 2 GiB < 40% of 8 GiB (3.2 GiB) + PsiMemorySome_10: 0.0, // All zeros — PSI not available. + PsiMemoryFull_10: 0.0, + AnonRssBytes: 5 * 1024 * 1024 * 1024, + } + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloon_PsiUnavailable_AllowsShrinkWhenMemPlentiful(t *testing.T) { + // When PSI is unavailable but MemAvailable >= 40% of currentBytes, + // the balloon CAN shrink (safe to do so). + cfg := newTestConfig() + cfg.SettleWindow = 0 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 // 8 GiB + + m := &api.MemoryMetrics{ + MemTotalBytes: 12 * 1024 * 1024 * 1024, + MemAvailableBytes: 6 * 1024 * 1024 * 1024, // 6 GiB > 40% of 8 GiB (3.2 GiB) + PsiMemorySome_10: 0.0, + PsiMemoryFull_10: 0.0, + AnonRssBytes: 1 * 1024 * 1024 * 1024, + } + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +func TestBalloon_PsiBecomesAvailable(t *testing.T) { + // Once PSI produces a non-zero value, psiAvailable should be set true + // and the fallback guard no longer applies. + cfg := newTestConfig() + cfg.SettleWindow = 0 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + + assert.Assert(t, !ctrl.psiAvailable) + + // First poll with non-zero PSI sets psiAvailable. + m := &api.MemoryMetrics{ + MemTotalBytes: 12 * 1024 * 1024 * 1024, + MemAvailableBytes: 2 * 1024 * 1024 * 1024, + PsiMemorySome_10: 0.05, // Non-zero — PSI is working. + PsiMemoryFull_10: 0.0, + AnonRssBytes: 5 * 1024 * 1024 * 1024, + } + ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Assert(t, ctrl.psiAvailable) +} + +func TestBalloon_PsiAvailable_NormalBehavior(t *testing.T) { + // After PSI becomes available, the controller uses normal PSI-based + // logic (no MemAvailable guard). + cfg := newTestConfig() + cfg.SettleWindow = 0 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 8 * 1024 * 1024 * 1024 + ctrl.psiAvailable = true // Already discovered PSI works. + + // Low PSI + low MemAvailable — with PSI available, normal shrink logic applies + // (PSI says no pressure → shrink is permitted). + m := &api.MemoryMetrics{ + MemTotalBytes: 12 * 1024 * 1024 * 1024, + MemAvailableBytes: 2 * 1024 * 1024 * 1024, + PsiMemorySome_10: 0.1, // Below low threshold — low pressure. + PsiMemoryFull_10: 0.0, + AnonRssBytes: 1 * 1024 * 1024 * 1024, + } + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + assert.Equal(t, action.Type, BalloonActionShrink) +} + +// --- E10-2: Floor staleness tests --- + +func TestBalloon_FloorStaleness_StaleFloorDiscarded(t *testing.T) { + // A floor older than FloorStaleness is discarded (effectiveFloor returns 0). + cfg := newTestConfig() + cfg.FloorStaleness = 24 * time.Hour + ctrl := NewBalloonController(cfg) + ctrl.learnedFloor = 5 * 1024 * 1024 * 1024 + ctrl.learnedAt = time.Now().Add(-25 * time.Hour) // 25h ago — stale. + + assert.Equal(t, ctrl.effectiveFloor(), uint64(0)) + // After discard, internal state is cleared. + assert.Equal(t, ctrl.learnedFloor, uint64(0)) + assert.Assert(t, ctrl.learnedAt.IsZero()) +} + +func TestBalloon_FloorStaleness_FreshFloorKept(t *testing.T) { + // A floor newer than FloorStaleness is kept. + cfg := newTestConfig() + cfg.FloorStaleness = 24 * time.Hour + ctrl := NewBalloonController(cfg) + ctrl.learnedFloor = 5 * 1024 * 1024 * 1024 + ctrl.learnedAt = time.Now().Add(-12 * time.Hour) // 12h ago — fresh. + + assert.Equal(t, ctrl.effectiveFloor(), uint64(5*1024*1024*1024)) +} + +func TestBalloon_FloorStaleness_ZeroMeansNeverStale(t *testing.T) { + // FloorStaleness=0 means the floor never becomes stale. + cfg := newTestConfig() + cfg.FloorStaleness = 0 // Disabled. + ctrl := NewBalloonController(cfg) + ctrl.learnedFloor = 5 * 1024 * 1024 * 1024 + ctrl.learnedAt = time.Now().Add(-1000 * time.Hour) // Very old. + + assert.Equal(t, ctrl.effectiveFloor(), uint64(5*1024*1024*1024)) +} + +func TestBalloon_FloorStaleness_ZeroTimeTreatedAsKept(t *testing.T) { + // Zero learnedAt (old format) with FloorStaleness set: the staleness + // check requires !learnedAt.IsZero(), so zero time skips the check. + cfg := newTestConfig() + cfg.FloorStaleness = 24 * time.Hour + ctrl := NewBalloonController(cfg) + ctrl.learnedFloor = 5 * 1024 * 1024 * 1024 + ctrl.learnedAt = time.Time{} // Zero — unknown timestamp. + + // With zero learnedAt, the staleness check is skipped (we can't know the age). + // The floor is kept as-is — the hostagent.go range check already validated it. + assert.Equal(t, ctrl.effectiveFloor(), uint64(5*1024*1024*1024)) +} + +// --- E10-8: Adaptive AnonRss margin tests --- + +func TestBalloon_AdaptiveMargin_NoContainers(t *testing.T) { + // 0 containers → 5% margin. + cfg := newTestConfig() + cfg.SettleWindow = 0 + cfg.MinBytes = 1 * 1024 * 1024 * 1024 + cfg.IdleTargetBytes = 12*1024*1024*1024 - 1 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.AnonRssBytes = 4 * 1024 * 1024 * 1024 // 4 GiB AnonRss. + m.ContainerCount = 0 // No containers → 5% margin. + + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + if action.Type == BalloonActionShrink { + // Floor = 4 GiB * 1.05 = 4.2 GiB. Target must be >= floor. + expectedFloor := uint64(4*1024*1024*1024) + uint64(4*1024*1024*1024)*5/100 + assert.Assert(t, action.TargetBytes >= expectedFloor, + "target %d should be >= 5%% floor %d", action.TargetBytes, expectedFloor) + } +} + +func TestBalloon_AdaptiveMargin_FewContainers(t *testing.T) { + // 1-5 containers → 15% margin. + cfg := newTestConfig() + cfg.SettleWindow = 0 + cfg.MinBytes = 1 * 1024 * 1024 * 1024 + cfg.IdleTargetBytes = 12*1024*1024*1024 - 1 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.AnonRssBytes = 4 * 1024 * 1024 * 1024 + m.ContainerCount = 3 // Few containers → 15% margin. + + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + if action.Type == BalloonActionShrink { + expectedFloor := uint64(4*1024*1024*1024) + uint64(4*1024*1024*1024)*15/100 + assert.Assert(t, action.TargetBytes >= expectedFloor, + "target %d should be >= 15%% floor %d", action.TargetBytes, expectedFloor) + } +} + +func TestBalloon_AdaptiveMargin_ManyContainers(t *testing.T) { + // 6+ containers → 20% margin. + cfg := newTestConfig() + cfg.SettleWindow = 0 + cfg.MinBytes = 1 * 1024 * 1024 * 1024 + cfg.IdleTargetBytes = 12*1024*1024*1024 - 1 + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = 6 * 1024 * 1024 * 1024 + + m := idleMetrics() + m.AnonRssBytes = 4 * 1024 * 1024 * 1024 + m.ContainerCount = 8 // Many containers → 20% margin. + + action := ctrl.Evaluate(m, time.Now().Add(-6*time.Minute)) + if action.Type == BalloonActionShrink { + expectedFloor := uint64(4*1024*1024*1024) + uint64(4*1024*1024*1024)*20/100 + assert.Assert(t, action.TargetBytes >= expectedFloor, + "target %d should be >= 20%% floor %d", action.TargetBytes, expectedFloor) + } +} + +func TestBalloonController_NoPSI_GrowOnCriticalMemAvailable(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + // Simulate a prior shrink. + ctrl.currentBytes = cfg.IdleTargetBytes + // psiAvailable remains false (default). + + // MemAvailable at 10% of currentBytes — below the 15% critical threshold. + m := idleMetrics() + m.MemAvailableBytes = ctrl.currentBytes * 10 / 100 + // PSI is zero (unavailable). + m.PsiMemorySome_10 = 0 + m.PsiMemoryFull_10 = 0 + + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionGrow) + assert.Assert(t, strings.Contains(action.Reason, "no PSI"), + "expected 'no PSI' in reason, got %q", action.Reason) + assert.Assert(t, action.TargetBytes > ctrl.currentBytes, + "grow should increase target") +} + +func TestBalloonController_NoPSI_BlockShrinkAt30Percent(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + ctrl.currentBytes = cfg.IdleTargetBytes + + // MemAvailable at 30% — below 40% but above 15%. + // Should return none (block shrink, no grow). + m := idleMetrics() + m.MemAvailableBytes = ctrl.currentBytes * 30 / 100 + m.PsiMemorySome_10 = 0 + m.PsiMemoryFull_10 = 0 + + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone) +} + +func TestBalloonController_NoPSI_NoOscillation(t *testing.T) { + cfg := newTestConfig() + ctrl := NewBalloonController(cfg) + ctrl.TransitionTo(BalloonStateSteady) + // Simulate state after a grow: at 7 GiB with moderate MemAvailable. + ctrl.currentBytes = 7 * 1024 * 1024 * 1024 + ctrl.lastActionTime = time.Now().Add(-5 * time.Minute) // Well past cooldown. + + // MemAvailable = 3 GiB — healthy at 7 GiB, but would be critical at 4 GiB. + // Shrinking to idleTarget (4 GiB) shrinks by 3 GiB. + // Expected MemAvail after = 3 - 3 = 0 GiB. + // 0 / 4 GiB = 0% — below 20% headroom, so shrink should be blocked. + m := idleMetrics() + m.MemAvailableBytes = 3 * 1024 * 1024 * 1024 // 3 GiB. + m.PsiMemorySome_10 = 0 + m.PsiMemoryFull_10 = 0 + + action := ctrl.Evaluate(m, time.Now().Add(-10*time.Minute)) + assert.Equal(t, action.Type, BalloonActionNone, + "should block shrink that would cause oscillation") +} diff --git a/pkg/hostagent/host_pressure.go b/pkg/hostagent/host_pressure.go new file mode 100644 index 00000000000..027c2e903c7 --- /dev/null +++ b/pkg/hostagent/host_pressure.go @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +package hostagent + +// HostPressure represents the macOS host memory pressure level. +type HostPressure int + +const ( + HostPressureNormal HostPressure = iota + HostPressureWarning + HostPressureCritical +) + +// classifyLevel maps a kern.memorystatus_level value (0-100, higher = more free) +// to a HostPressure level. +func classifyLevel(level uint32) HostPressure { + switch { + case level <= 10: + return HostPressureCritical + case level <= 25: + return HostPressureWarning + default: + return HostPressureNormal + } +} diff --git a/pkg/hostagent/host_pressure_darwin.go b/pkg/hostagent/host_pressure_darwin.go new file mode 100644 index 00000000000..3ca361d6500 --- /dev/null +++ b/pkg/hostagent/host_pressure_darwin.go @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build darwin + +package hostagent + +import ( + "context" + "sync" + "time" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// HostPressureMonitor polls macOS kern.memorystatus_level to track host memory pressure. +// Uses hysteresis: transitions require 2 consecutive samples at the new level (except +// transitions TO Critical, which are immediate). +type HostPressureMonitor struct { + mu sync.RWMutex + current HostPressure + pending HostPressure // Candidate state from latest poll. + pendingCount int // Consecutive polls at pending state. + confirmSamples int // Required consecutive samples before transition. +} + +// NewHostPressureMonitor creates a monitor that defaults to HostPressureNormal. +func NewHostPressureMonitor() *HostPressureMonitor { + return &HostPressureMonitor{ + confirmSamples: 2, // Require 2 consecutive samples (10s at 5s poll). + } +} + +// Run polls kern.memorystatus_level every 5 seconds until ctx is cancelled. +func (m *HostPressureMonitor) Run(ctx context.Context) { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + for { + select { + case <-ticker.C: + m.poll() + case <-ctx.Done(): + return + } + } +} + +func (m *HostPressureMonitor) poll() { + level, err := unix.SysctlUint32("kern.memorystatus_level") + if err != nil { + logrus.Debugf("host pressure: kern.memorystatus_level unavailable: %v", err) + return + } + candidate := classifyLevel(level) + m.mu.Lock() + defer m.mu.Unlock() + m.transition(candidate) +} + +// transition applies a candidate pressure level with hysteresis. +// Must be called with m.mu held. +func (m *HostPressureMonitor) transition(candidate HostPressure) { + if candidate == m.current { + m.pendingCount = 0 + return + } + if candidate == m.pending { + m.pendingCount++ + } else { + m.pending = candidate + m.pendingCount = 1 + } + if candidate == HostPressureCritical || m.pendingCount >= m.confirmSamples { + m.current = candidate + m.pendingCount = 0 + } +} + +// Current returns the latest host pressure reading. +func (m *HostPressureMonitor) Current() HostPressure { + m.mu.RLock() + defer m.mu.RUnlock() + return m.current +} diff --git a/pkg/hostagent/host_pressure_darwin_test.go b/pkg/hostagent/host_pressure_darwin_test.go new file mode 100644 index 00000000000..154a33b19ab --- /dev/null +++ b/pkg/hostagent/host_pressure_darwin_test.go @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build darwin + +package hostagent + +import ( + "testing" + + "gotest.tools/v3/assert" +) + +func TestHostPressure_Hysteresis_NormalToWarning(t *testing.T) { + m := NewHostPressureMonitor() + assert.Equal(t, m.Current(), HostPressureNormal) + + // First Warning reading — not enough, stays Normal. + m.mu.Lock() + m.transition(HostPressureWarning) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureNormal) + + // Second consecutive Warning reading — transitions. + m.mu.Lock() + m.transition(HostPressureWarning) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureWarning) +} + +func TestHostPressure_Hysteresis_WarningToNormal(t *testing.T) { + m := NewHostPressureMonitor() + // Get to Warning state first. + m.mu.Lock() + m.current = HostPressureWarning + m.mu.Unlock() + + // First Normal reading — not enough. + m.mu.Lock() + m.transition(HostPressureNormal) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureWarning) + + // Second consecutive Normal reading — transitions. + m.mu.Lock() + m.transition(HostPressureNormal) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureNormal) +} + +func TestHostPressure_Hysteresis_ImmediateCritical(t *testing.T) { + m := NewHostPressureMonitor() + assert.Equal(t, m.Current(), HostPressureNormal) + + // Single Critical reading — immediate transition. + m.mu.Lock() + m.transition(HostPressureCritical) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureCritical) +} + +func TestHostPressure_Hysteresis_CriticalToWarning(t *testing.T) { + m := NewHostPressureMonitor() + m.mu.Lock() + m.current = HostPressureCritical + m.mu.Unlock() + + // First Warning — not enough. + m.mu.Lock() + m.transition(HostPressureWarning) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureCritical) + + // Second consecutive Warning — transitions. + m.mu.Lock() + m.transition(HostPressureWarning) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureWarning) +} + +func TestHostPressure_Hysteresis_InterruptResets(t *testing.T) { + m := NewHostPressureMonitor() + + // One Warning reading. + m.mu.Lock() + m.transition(HostPressureWarning) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureNormal) + + // Interrupted by Normal — resets pending count. + m.mu.Lock() + m.transition(HostPressureNormal) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureNormal) + + // One Warning again — counter restarted, not enough. + m.mu.Lock() + m.transition(HostPressureWarning) + m.mu.Unlock() + assert.Equal(t, m.Current(), HostPressureNormal) +} diff --git a/pkg/hostagent/host_pressure_other.go b/pkg/hostagent/host_pressure_other.go new file mode 100644 index 00000000000..3c1e296a943 --- /dev/null +++ b/pkg/hostagent/host_pressure_other.go @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build !darwin + +package hostagent + +import "context" + +// HostPressureMonitor is a no-op on non-Darwin platforms. +type HostPressureMonitor struct{} + +// NewHostPressureMonitor returns a stub monitor that always reports normal pressure. +func NewHostPressureMonitor() *HostPressureMonitor { return &HostPressureMonitor{} } + +// Run is a no-op on non-Darwin. +func (m *HostPressureMonitor) Run(_ context.Context) {} + +// Current always returns HostPressureNormal on non-Darwin. +func (m *HostPressureMonitor) Current() HostPressure { return HostPressureNormal } diff --git a/pkg/hostagent/host_pressure_test.go b/pkg/hostagent/host_pressure_test.go new file mode 100644 index 00000000000..6731e42b02c --- /dev/null +++ b/pkg/hostagent/host_pressure_test.go @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +package hostagent + +import ( + "testing" + + "gotest.tools/v3/assert" +) + +func TestHostPressureMonitor_ParseLevels(t *testing.T) { + tests := []struct { + name string + level uint32 + want HostPressure + }{ + {"critical low", 0, HostPressureCritical}, + {"critical boundary", 10, HostPressureCritical}, + {"warning boundary low", 11, HostPressureWarning}, + {"warning mid", 20, HostPressureWarning}, + {"warning boundary high", 25, HostPressureWarning}, + {"normal boundary", 26, HostPressureNormal}, + {"normal mid", 50, HostPressureNormal}, + {"normal full", 100, HostPressureNormal}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := classifyLevel(tt.level) + assert.Equal(t, got, tt.want) + }) + } +} diff --git a/pkg/hostagent/hostagent.go b/pkg/hostagent/hostagent.go index 08e9d48007a..380e6514983 100644 --- a/pkg/hostagent/hostagent.go +++ b/pkg/hostagent/hostagent.go @@ -21,6 +21,7 @@ import ( "sync" "time" + "github.com/docker/go-units" "github.com/lima-vm/sshocker/pkg/ssh" "github.com/sethvargo/go-password/password" "github.com/sirupsen/logrus" @@ -82,6 +83,8 @@ type HostAgent struct { showProgress bool // whether to show cloud-init progress + balloonCtrl *BalloonController // nil when memory ballooning is not enabled + statusMu sync.RWMutex currentStatus events.Status } @@ -643,6 +646,62 @@ sudo chown -R "${USER}" /run/host-services` if err := a.waitForRequirements("final", a.finalRequirements()); err != nil { errs = append(errs, err) } + + // Start balloon controller if the driver supports ballooning and memoryBalloon is enabled. + // Unwrap ConfiguredDriver to access the underlying driver's Ballooner interface. + balloonDriver := a.driver + if cd, ok := a.driver.(*driver.ConfiguredDriver); ok { + balloonDriver = cd.Driver + } + if ballooner, ok := balloonDriver.(driver.Ballooner); ok { + if a.instConfig.VMOpts != nil { + var vzOpts limatype.VZOpts + if convErr := limayaml.Convert(a.instConfig.VMOpts[limatype.VZ], &vzOpts, ""); convErr == nil { + if vzOpts.MemoryBalloon.Enabled != nil && *vzOpts.MemoryBalloon.Enabled { + cfg, cfgErr := parseBalloonConfig(a.instConfig, &vzOpts.MemoryBalloon) + if cfgErr != nil { + logrus.WithError(cfgErr).Warn("Failed to parse balloon config, ballooning disabled") + } else { + ctrl := NewBalloonController(cfg) + ctrl.instDir = a.instDir + // Load persisted learned floor with timestamp. + floor, learnedAt, floorErr := store.ReadLearnedFloor(a.instDir) + if floorErr != nil { + logrus.Warnf("balloon: failed to read learned floor: %v", floorErr) + } + if floor > 0 && (floor > cfg.IdleTargetBytes || floor < cfg.MinBytes) { + logrus.Infof("balloon: discarding out-of-range learned floor %d (min=%d, idle=%d)", + floor, cfg.MinBytes, cfg.IdleTargetBytes) + floor = 0 + learnedAt = time.Time{} + } + ctrl.learnedFloor = floor + ctrl.learnedAt = learnedAt + a.balloonCtrl = ctrl + monitor := NewHostPressureMonitor() + go monitor.Run(ctx) + go func() { + defer func() { + if r := recover(); r != nil { + logrus.Errorf("Balloon controller panicked: %v", r) + } + }() + a.runBalloonLoop(ctx, ctrl, ballooner, monitor) + }() + minStr, idleStr := "0", "0" + if vzOpts.MemoryBalloon.Min != nil { + minStr = *vzOpts.MemoryBalloon.Min + } + if vzOpts.MemoryBalloon.IdleTarget != nil { + idleStr = *vzOpts.MemoryBalloon.IdleTarget + } + logrus.Infof("Memory ballooning enabled: min %s, idle target %s", minStr, idleStr) + } + } + } + } + } + // Copy all config files _after_ the requirements are done for _, rule := range a.instConfig.CopyToHost { sshAddress, sshPort := a.sshAddressPort() @@ -1161,3 +1220,120 @@ func copyToHost(ctx context.Context, sshConfig *ssh.SSHConfig, sshAddress string } return nil } + +// parseBalloonConfig converts the YAML MemoryBalloon config into a BalloonConfig. +func parseBalloonConfig(instConfig *limatype.LimaYAML, balloon *limatype.MemoryBalloon) (BalloonConfig, error) { + var cfg BalloonConfig + + if instConfig.Memory != nil { + memBytes, err := units.RAMInBytes(*instConfig.Memory) + if err != nil { + return cfg, fmt.Errorf("invalid memory value: %w", err) + } + cfg.MaxMemoryBytes = uint64(memBytes) + } + if balloon.Min != nil { + minBytes, err := units.RAMInBytes(*balloon.Min) + if err != nil { + return cfg, fmt.Errorf("invalid balloon min: %w", err) + } + cfg.MinBytes = uint64(minBytes) + } + if balloon.IdleTarget != nil { + idleBytes, err := units.RAMInBytes(*balloon.IdleTarget) + if err != nil { + return cfg, fmt.Errorf("invalid balloon idleTarget: %w", err) + } + cfg.IdleTargetBytes = uint64(idleBytes) + } + if balloon.Cooldown != nil { + d, err := time.ParseDuration(*balloon.Cooldown) + if err != nil { + return cfg, fmt.Errorf("invalid balloon cooldown: %w", err) + } + cfg.Cooldown = d + } + + // Hardcoded internal defaults for tuning knobs. + cfg.GrowStepPercent = 25 + cfg.ShrinkStepPercent = 20 + cfg.HighPressureThreshold = 5.0 + cfg.LowPressureThreshold = 0.5 + cfg.IdleGracePeriod = 5 * time.Minute + cfg.MaxSwapInPerSec = 64 * 1024 * 1024 // 64 MiB/s. + cfg.MaxSwapOutPerSec = 32 * 1024 * 1024 // 32 MiB/s. + cfg.MaxPageFaultRate = 5000 + cfg.ShrinkReserveBytes = 128 * 1024 * 1024 // 128 MiB. + cfg.SettleWindow = 30 * time.Second + cfg.MaxContainerCPU = 10.0 + cfg.MaxContainerIO = 10 * 1024 * 1024 // 10 MiB/s. + cfg.FloorStaleness = 24 * time.Hour + + return cfg, nil +} + +// runBalloonLoop periodically polls guest memory metrics and adjusts the balloon. +func (a *HostAgent) runBalloonLoop(ctx context.Context, ctrl *BalloonController, ballooner driver.Ballooner, monitor *HostPressureMonitor) { + // Wait for guest agent to be ready. + select { + case <-a.guestAgentAliveCh: + case <-ctx.Done(): + return + } + + bootTime := time.Now() + ctrl.TransitionTo(BalloonStateSteady) + logrus.Info("Balloon controller: transitioned to steady state") + + // 10s poll interval balances responsiveness with guest agent overhead. + // Each poll fetches /proc/pressure/memory + container stats via gRPC. + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + action := ctrl.PrepareShutdown() + if err := ballooner.SetBalloonTarget(action.TargetBytes); err != nil { + logrus.WithError(err).Warn("Balloon controller: failed to grow on shutdown") + } + return + + case <-ticker.C: + client, err := a.getOrCreateClient(ctx) + if err != nil { + if action := ctrl.RecordPollFailure(); action != nil { + if err := ballooner.SetBalloonTarget(action.TargetBytes); err != nil { + logrus.WithError(err).Warnf("Balloon controller: failed to grow on poll failure") + } + } + logrus.WithError(err).Debug("Balloon controller: failed to get guest agent client") + continue + } + metrics, err := client.GetMemoryMetrics(ctx) + if err != nil { + if action := ctrl.RecordPollFailure(); action != nil { + if err := ballooner.SetBalloonTarget(action.TargetBytes); err != nil { + logrus.WithError(err).Warnf("Balloon controller: failed to grow on poll failure") + } + } + logrus.WithError(err).Debug("Balloon controller: failed to get memory metrics") + continue + } + ctrl.RecordPollSuccess() + + ctrl.SetHostPressure(monitor.Current()) + action := ctrl.Evaluate(metrics, bootTime) + if action.Type != BalloonActionNone { + logrus.Infof("Balloon controller: %s -> %s (%s)", + action.Type, units.BytesSize(float64(action.TargetBytes)), action.Reason) + if err := ballooner.SetBalloonTarget(action.TargetBytes); err != nil { + logrus.WithError(err).Warnf("Balloon controller: failed to set target to %s", + units.BytesSize(float64(action.TargetBytes))) + } else { + ctrl.RecordAction(action, time.Now()) + } + } + } + } +} diff --git a/pkg/store/learned_floor.go b/pkg/store/learned_floor.go new file mode 100644 index 00000000000..5e1f3e75cae --- /dev/null +++ b/pkg/store/learned_floor.go @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +package store + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "time" +) + +// WriteLearnedFloor atomically persists the balloon controller's learned stable floor +// with a timestamp for staleness tracking. +// Format: "\n". +func WriteLearnedFloor(instDir string, bytes uint64, learnedAt time.Time) error { + content := strconv.FormatInt(learnedAt.Unix(), 10) + "\n" + strconv.FormatUint(bytes, 10) + tmp := filepath.Join(instDir, "learned-floor.tmp") + if err := os.WriteFile(tmp, []byte(content), 0o600); err != nil { + return fmt.Errorf("writing learned floor: %w", err) + } + return os.Rename(tmp, filepath.Join(instDir, "learned-floor")) +} + +// ReadLearnedFloor reads the persisted learned floor and its timestamp. +// Returns (0, zero-time, nil) if not found or corrupt. +// Supports old format (bare uint64) by returning zero time (immediately stale). +func ReadLearnedFloor(instDir string) (uint64, time.Time, error) { + data, err := os.ReadFile(filepath.Join(instDir, "learned-floor")) + if errors.Is(err, os.ErrNotExist) { + return 0, time.Time{}, nil + } + if err != nil { + return 0, time.Time{}, fmt.Errorf("reading learned floor: %w", err) + } + content := strings.TrimSpace(string(data)) + lines := strings.SplitN(content, "\n", 2) + + if len(lines) == 2 { + // New format: "\n". + ts, tsErr := strconv.ParseInt(lines[0], 10, 64) + floor, fErr := strconv.ParseUint(lines[1], 10, 64) + if tsErr != nil || fErr != nil { + return 0, time.Time{}, nil // Corrupt. + } + return floor, time.Unix(ts, 0), nil + } + + // Old format: bare uint64 — treat as zero time (immediately stale). + v, err := strconv.ParseUint(content, 10, 64) + if err != nil { + return 0, time.Time{}, nil // Corrupt. + } + return v, time.Time{}, nil +} diff --git a/pkg/store/learned_floor_test.go b/pkg/store/learned_floor_test.go new file mode 100644 index 00000000000..97ff498c64c --- /dev/null +++ b/pkg/store/learned_floor_test.go @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: Copyright The Lima Authors +// SPDX-License-Identifier: Apache-2.0 + +package store + +import ( + "os" + "path/filepath" + "testing" + "time" + + "gotest.tools/v3/assert" +) + +func TestLearnedFloor_WriteRead(t *testing.T) { + dir := t.TempDir() + now := time.Now() + err := WriteLearnedFloor(dir, 4*1024*1024*1024, now) + assert.NilError(t, err) + + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(4*1024*1024*1024)) + assert.Equal(t, learnedAt.Unix(), now.Unix()) +} + +func TestLearnedFloor_NotFound(t *testing.T) { + dir := t.TempDir() + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(0)) + assert.Assert(t, learnedAt.IsZero()) +} + +func TestLearnedFloor_Corrupt(t *testing.T) { + dir := t.TempDir() + err := os.WriteFile(filepath.Join(dir, "learned-floor"), []byte("garbage"), 0o600) + assert.NilError(t, err) + + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(0)) + assert.Assert(t, learnedAt.IsZero()) +} + +func TestLearnedFloor_Overwrite(t *testing.T) { + dir := t.TempDir() + t1 := time.Now().Add(-1 * time.Hour) + err := WriteLearnedFloor(dir, 3*1024*1024*1024, t1) + assert.NilError(t, err) + + t2 := time.Now() + err = WriteLearnedFloor(dir, 5*1024*1024*1024, t2) + assert.NilError(t, err) + + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(5*1024*1024*1024)) + assert.Equal(t, learnedAt.Unix(), t2.Unix()) +} + +// --- E10-2: Learned floor with timestamp --- + +func TestLearnedFloor_WriteReadWithTimestamp(t *testing.T) { + dir := t.TempDir() + now := time.Now() + err := WriteLearnedFloor(dir, 4*1024*1024*1024, now) + assert.NilError(t, err) + + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(4*1024*1024*1024)) + // Unix timestamp precision: seconds. + assert.Assert(t, learnedAt.Unix() == now.Unix()) +} + +func TestLearnedFloor_OldFormatTreatedAsStale(t *testing.T) { + dir := t.TempDir() + // Write old format (bare uint64, no timestamp). + err := os.WriteFile(filepath.Join(dir, "learned-floor"), []byte("4294967296"), 0o600) + assert.NilError(t, err) + + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(4294967296)) + // Old format has zero time → treated as stale. + assert.Assert(t, learnedAt.IsZero()) +} + +func TestLearnedFloor_NotFoundWithTimestamp(t *testing.T) { + dir := t.TempDir() + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(0)) + assert.Assert(t, learnedAt.IsZero()) +} + +func TestLearnedFloor_CorruptWithTimestamp(t *testing.T) { + dir := t.TempDir() + err := os.WriteFile(filepath.Join(dir, "learned-floor"), []byte("garbage"), 0o600) + assert.NilError(t, err) + + v, learnedAt, err := ReadLearnedFloor(dir) + assert.NilError(t, err) + assert.Equal(t, v, uint64(0)) + assert.Assert(t, learnedAt.IsZero()) +} From 849a3f330dc798816728e451f04cc00ea72cd3e2 Mon Sep 17 00:00:00 2001 From: "Jason W. Ehrlich" Date: Fri, 17 Apr 2026 13:58:21 -0400 Subject: [PATCH 4/4] vz: wire balloon controller to VZ driver Add the Ballooner interface to the driver package with SetBalloonTarget for adjusting guest memory at runtime. Implement Ballooner in the VZ driver: store the VirtIO balloon device reference during VM configuration, and expose SetBalloonTarget which calls the Virtualization.framework API under a mutex. Wire the balloon controller into hostagent: parseBalloonConfig converts the 4 public YAML fields into internal BalloonConfig with hardcoded operational defaults, setupBalloon initializes the controller after SSH is ready, and runBalloonLoop polls guest metrics every 10 seconds to feed the controller's Evaluate method. Fill sensible defaults in the VZ driver's FillConfig: min at 25% of memory, idleTarget at 33%, and cooldown at 30 seconds. Signed-off-by: Jason W. Ehrlich --- pkg/driver/vz/vm_darwin.go | 14 +++++++++-- pkg/driver/vz/vz_driver_darwin.go | 41 +++++++++++++++++++++++++++++++ pkg/hostagent/hostagent.go | 4 +++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/pkg/driver/vz/vm_darwin.go b/pkg/driver/vz/vm_darwin.go index b9761131afe..fd8a091f2c7 100644 --- a/pkg/driver/vz/vm_darwin.go +++ b/pkg/driver/vz/vm_darwin.go @@ -48,8 +48,9 @@ const diskImageCachingMode = vz.DiskImageCachingModeCached type virtualMachineWrapper struct { *vz.VirtualMachine - mu sync.Mutex - stopped bool + mu sync.Mutex + stopped bool + balloonDevice *vz.VirtioTraditionalMemoryBalloonDevice } // Hold all *os.File created via socketpair() so that they won't get garbage collected. f.FD() gets invalid if f gets garbage collected. @@ -72,6 +73,15 @@ func startVM(ctx context.Context, inst *limatype.Instance, sshLocalPort int, onV } wrapper := &virtualMachineWrapper{VirtualMachine: machine, stopped: false} + + // Capture the balloon device reference for runtime memory control. + for _, dev := range machine.MemoryBalloonDevices() { + if bd := vz.AsVirtioTraditionalMemoryBalloonDevice(dev); bd != nil { + wrapper.balloonDevice = bd + break + } + } + notifySSHLocalPortAccessible := make(chan any) sendErrCh := make(chan error) diff --git a/pkg/driver/vz/vz_driver_darwin.go b/pkg/driver/vz/vz_driver_darwin.go index 87bb60938ee..48d5d6fb64c 100644 --- a/pkg/driver/vz/vz_driver_darwin.go +++ b/pkg/driver/vz/vz_driver_darwin.go @@ -207,6 +207,32 @@ func (l *LimaVzDriver) FillConfig(ctx context.Context, cfg *limatype.LimaYAML, _ } } + // Memory balloon defaults — only set when enabled. + if vzOpts.MemoryBalloon.Enabled == nil { + vzOpts.MemoryBalloon.Enabled = ptr.Of(false) + } + if *vzOpts.MemoryBalloon.Enabled { + if vzOpts.MemoryBalloon.Min == nil { + // 25% of configured memory. + if cfg.Memory != nil { + if memBytes, err := units.RAMInBytes(*cfg.Memory); err == nil { + vzOpts.MemoryBalloon.Min = ptr.Of(units.BytesSize(float64(memBytes) * 0.25)) + } + } + } + if vzOpts.MemoryBalloon.IdleTarget == nil { + // 33% of configured memory. + if cfg.Memory != nil { + if memBytes, err := units.RAMInBytes(*cfg.Memory); err == nil { + vzOpts.MemoryBalloon.IdleTarget = ptr.Of(units.BytesSize(float64(memBytes) * 0.33)) + } + } + } + if vzOpts.MemoryBalloon.Cooldown == nil { + vzOpts.MemoryBalloon.Cooldown = ptr.Of("30s") + } + } + var opts any if err := limayaml.Convert(vzOpts, &opts, ""); err != nil { logrus.WithError(err).Warnf("Couldn't convert %+v", vzOpts) @@ -536,6 +562,21 @@ func (l *LimaVzDriver) GuestAgentConn(_ context.Context) (net.Conn, string, erro return nil, "", errors.New("unable to connect to guest agent via vsock port 2222") } +// SetBalloonTarget adjusts the balloon device to set the target memory size in bytes. +// The balloon inflates or deflates to control memory available to the guest. +func (l *LimaVzDriver) SetBalloonTarget(targetBytes uint64) error { + if l.machine == nil { + return errors.New("vz: VM is not running") + } + l.machine.mu.Lock() + defer l.machine.mu.Unlock() + if l.machine.balloonDevice == nil { + return errors.New("vz: no balloon device available") + } + l.machine.balloonDevice.SetTargetVirtualMachineMemorySize(targetBytes) + return nil +} + func (l *LimaVzDriver) Info() driver.Info { var info driver.Info diff --git a/pkg/hostagent/hostagent.go b/pkg/hostagent/hostagent.go index 380e6514983..95cea4d4458 100644 --- a/pkg/hostagent/hostagent.go +++ b/pkg/hostagent/hostagent.go @@ -1305,6 +1305,8 @@ func (a *HostAgent) runBalloonLoop(ctx context.Context, ctrl *BalloonController, if action := ctrl.RecordPollFailure(); action != nil { if err := ballooner.SetBalloonTarget(action.TargetBytes); err != nil { logrus.WithError(err).Warnf("Balloon controller: failed to grow on poll failure") + } else { + ctrl.RecordAction(*action, time.Now()) } } logrus.WithError(err).Debug("Balloon controller: failed to get guest agent client") @@ -1315,6 +1317,8 @@ func (a *HostAgent) runBalloonLoop(ctx context.Context, ctrl *BalloonController, if action := ctrl.RecordPollFailure(); action != nil { if err := ballooner.SetBalloonTarget(action.TargetBytes); err != nil { logrus.WithError(err).Warnf("Balloon controller: failed to grow on poll failure") + } else { + ctrl.RecordAction(*action, time.Now()) } } logrus.WithError(err).Debug("Balloon controller: failed to get memory metrics")