Skip to content

Commit af2b6d1

Browse files
committed
hostagent,guestagent: adaptive balloon enhancements (E1-E10)
Add 10 targeted improvements to the balloon controller, auto-pause manager, guest metrics, and template provisioning: E1: PSI fallback guard — detect kernels without PSI and fall back to MemAvailable-based decisions. Add mode:boot script to enable PSI on Alpine kernels with CONFIG_PSI_DEFAULT_DISABLED=y. E2: Floor staleness timeout — learned floors expire after 24h (config: floorStaleness) to adapt to workload drift. Store format extended to include timestamp. E3: Graduated poll failure recovery — replace immediate max-expansion with graduated steps: 1-2 failures no-op, 3-5 half headroom, 6+ full. E4: Balloon and auto-pause coordination — halve shrink step when SSH/socket connections are active. Skip balloon polling while VM is paused. Reset poll failure counter on resume. E5: Parallel Docker stats — collect per-container stats concurrently with 3s overall and 1s per-container timeouts. E6: Dynamic cgroup memory.high — recalculate memory.high every 5min based on MemAvailable+AnonPages with 512MiB floor. E7: Host pressure hysteresis — require 2 consecutive samples before level transitions (except immediate Critical) to prevent oscillation. E8: Adaptive AnonRss margin — replace fixed 15% with container-count tiers: 5% (0 containers), 15% (1-5), 20% (6+). E9: PSI trend detection — add avg60 proto fields. Pre-emptive half-step grow when avg10 > 1.5*avg60 (opt-in via enableTrendDetection). E10: Demand-driven compaction — buddyinfo-gated compact, slab drop threshold, guarded MGLRU/KSM re-enable. Signed-off-by: Jason W. Ehrlich <jwehrlich@outlook.com>
1 parent 6c6bb5b commit af2b6d1

22 files changed

Lines changed: 2246 additions & 131 deletions

pkg/driver/vz/vz_driver_darwin.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,12 +312,30 @@ func (l *LimaVzDriver) FillConfig(ctx context.Context, cfg *limatype.LimaYAML, _
312312
if vzOpts.MemoryBalloon.MaxSwapInPerSec == nil {
313313
vzOpts.MemoryBalloon.MaxSwapInPerSec = ptr.Of("64MiB")
314314
}
315+
if vzOpts.MemoryBalloon.MaxSwapOutPerSec == nil {
316+
vzOpts.MemoryBalloon.MaxSwapOutPerSec = ptr.Of("32MiB")
317+
}
318+
if vzOpts.MemoryBalloon.MaxPageFaultRate == nil {
319+
vzOpts.MemoryBalloon.MaxPageFaultRate = ptr.Of(uint64(5000))
320+
}
321+
if vzOpts.MemoryBalloon.ShrinkReserveBytes == nil {
322+
vzOpts.MemoryBalloon.ShrinkReserveBytes = ptr.Of("128MiB")
323+
}
324+
if vzOpts.MemoryBalloon.SettleWindow == nil {
325+
vzOpts.MemoryBalloon.SettleWindow = ptr.Of("30s")
326+
}
315327
if vzOpts.MemoryBalloon.MaxContainerCPU == nil {
316328
vzOpts.MemoryBalloon.MaxContainerCPU = ptr.Of(10.0)
317329
}
318330
if vzOpts.MemoryBalloon.MaxContainerIO == nil {
319331
vzOpts.MemoryBalloon.MaxContainerIO = ptr.Of("10MiB")
320332
}
333+
if vzOpts.MemoryBalloon.FloorStaleness == nil {
334+
vzOpts.MemoryBalloon.FloorStaleness = ptr.Of("24h")
335+
}
336+
if vzOpts.MemoryBalloon.EnableTrendDetection == nil {
337+
vzOpts.MemoryBalloon.EnableTrendDetection = ptr.Of(false)
338+
}
321339
}
322340

323341
// Auto-pause defaults — only set when enabled.

pkg/guestagent/api/guestservice.pb.desc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-

2+

33
guestservice.protogoogle/protobuf/empty.protogoogle/protobuf/timestamp.proto"0
44
Info(
55
local_ports ( 2.IPPortR
@@ -29,7 +29,7 @@ guest_addr ( R guestAddr&
2929
TimeSyncResponse
3030
adjusted (Radjusted
3131
drift_ms (RdriftMs
32-
error ( Rerror"�
32+
error ( Rerror"�
3333
MemoryMetrics&
3434
mem_total_bytes (RmemTotalBytes.
3535
mem_available_bytes (RmemAvailableBytes(
@@ -46,7 +46,9 @@ guest_addr ( R guestAddr&
4646
container_count (RcontainerCount2
4747
container_cpu_percent (RcontainerCpuPercent:
4848
container_io_bytes_per_sec (RcontainerIoBytesPerSec!
49-
oom_detected (R oomDetected2�
49+
oom_detected (R oomDetected+
50+
psi_memory_some_60 (RpsiMemorySome60+
51+
psi_memory_full_60 (RpsiMemoryFull602�
5052
GuestService(
5153
GetInfo.google.protobuf.Empty.Info-
5254
GetEvents.google.protobuf.Empty.Event01

pkg/guestagent/api/guestservice.pb.go

Lines changed: 24 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/guestagent/api/guestservice.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,8 @@ message MemoryMetrics {
8484

8585
// OOM detection (edge-triggered: set once, cleared after reporting).
8686
bool oom_detected = 15;
87+
88+
// /proc/pressure/memory PSI values (percentage 0.0-100.0 over 60s window).
89+
double psi_memory_some_60 = 16;
90+
double psi_memory_full_60 = 17;
8791
}

pkg/guestagent/api/server/server.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,9 @@ func (s *GuestServer) SyncTime(_ context.Context, req *api.TimeSyncRequest) (*ap
109109
return resp, nil
110110
}
111111

112-
func (s *GuestServer) GetMemoryMetrics(_ context.Context, _ *emptypb.Empty) (*api.MemoryMetrics, error) {
112+
func (s *GuestServer) GetMemoryMetrics(ctx context.Context, _ *emptypb.Empty) (*api.MemoryMetrics, error) {
113113
if s.Collector != nil {
114-
return s.Collector.Collect()
114+
return s.Collector.Collect(ctx)
115115
}
116116
return metrics.CollectMemoryMetrics()
117117
}

pkg/guestagent/metrics/collector.go

Lines changed: 60 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ func NewCollector(dockerSocket *string) *Collector {
5454
return (&net.Dialer{}).DialContext(ctx, "unix", c.dockerSocket)
5555
},
5656
},
57-
Timeout: 5 * time.Second,
57+
// No client-level timeout — per-request context deadlines control timeouts.
5858
}
5959
}
6060
return c
@@ -119,7 +119,7 @@ func safeDelta(curr, prev uint64) uint64 {
119119

120120
// Collect gathers all memory metrics and returns a MemoryMetrics protobuf.
121121
// This is the main entry point called by the guest agent gRPC server.
122-
func (c *Collector) Collect() (*api.MemoryMetrics, error) {
122+
func (c *Collector) Collect(ctx context.Context) (*api.MemoryMetrics, error) {
123123
// 1. /proc/meminfo + /proc/pressure/memory (no lock needed — pure reads).
124124
meminfo, err := os.ReadFile("/proc/meminfo")
125125
if err != nil {
@@ -131,12 +131,14 @@ func (c *Collector) Collect() (*api.MemoryMetrics, error) {
131131
}
132132

133133
pressure, _ := os.ReadFile("/proc/pressure/memory")
134-
some10, full10, parseErr := parseProcPressureMemory(pressure)
134+
psi, parseErr := parseProcPressureMemory(pressure)
135135
if parseErr != nil {
136136
return nil, parseErr
137137
}
138-
m.PsiMemorySome_10 = some10
139-
m.PsiMemoryFull_10 = full10
138+
m.PsiMemorySome_10 = psi.Some10
139+
m.PsiMemoryFull_10 = psi.Full10
140+
m.PsiMemorySome_60 = psi.Some60
141+
m.PsiMemoryFull_60 = psi.Full60
140142

141143
// 2. /proc/vmstat for swap rates, page faults, OOM.
142144
vmstatData, vmstatErr := os.ReadFile("/proc/vmstat")
@@ -145,7 +147,7 @@ func (c *Collector) Collect() (*api.MemoryMetrics, error) {
145147
var dockerCount int
146148
var dockerCPU, dockerIO float64
147149
if c.httpClient != nil {
148-
dockerCount, dockerCPU, dockerIO = c.collectDockerStats()
150+
dockerCount, dockerCPU, dockerIO = c.collectDockerStats(ctx)
149151
}
150152

151153
// Hold lock only for internal state updates and reads.
@@ -171,10 +173,15 @@ func (c *Collector) Collect() (*api.MemoryMetrics, error) {
171173
}
172174

173175
// collectDockerStats queries the Docker socket for container count,
174-
// aggregate CPU%, and aggregate IO bytes/sec. Returns zeros on error.
175-
func (c *Collector) collectDockerStats() (count int, cpuPercent, ioBytesPerSec float64) {
176+
// aggregate CPU%, and aggregate IO bytes/sec. Containers are polled
177+
// in parallel with a 3-second overall timeout. Returns zeros on error.
178+
func (c *Collector) collectDockerStats(ctx context.Context) (count int, cpuPercent, ioBytesPerSec float64) {
179+
// Overall timeout: 3 seconds fits within the 10-second balloon poll interval.
180+
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
181+
defer cancel()
182+
176183
// List running containers.
177-
listReq, err := http.NewRequestWithContext(context.Background(), http.MethodGet,
184+
listReq, err := http.NewRequestWithContext(ctx, http.MethodGet,
178185
"http://localhost/containers/json?filters=%7B%22status%22%3A%5B%22running%22%5D%7D", http.NoBody)
179186
if err != nil {
180187
logrus.Debugf("Docker stats: failed to create list request: %v", err)
@@ -201,30 +208,53 @@ func (c *Collector) collectDockerStats() (count int, cpuPercent, ioBytesPerSec f
201208
return 0, 0, 0
202209
}
203210

204-
// Aggregate stats from each container (best-effort, skip failures).
211+
// Poll all containers in parallel with per-container 1-second timeout.
212+
type result struct {
213+
cpuPct float64
214+
ioBytes uint64
215+
}
216+
results := make(chan result, len(ids))
217+
for _, id := range ids {
218+
go func(cid string) {
219+
cctx, ccancel := context.WithTimeout(ctx, 1*time.Second)
220+
defer ccancel()
221+
statsReq, reqErr := http.NewRequestWithContext(cctx, http.MethodGet,
222+
"http://localhost/containers/"+cid+"/stats?stream=false&one-shot=true", http.NoBody)
223+
if reqErr != nil {
224+
results <- result{}
225+
return
226+
}
227+
statsResp, doErr := c.httpClient.Do(statsReq)
228+
if doErr != nil {
229+
results <- result{}
230+
return
231+
}
232+
statsBody, readErr := io.ReadAll(statsResp.Body)
233+
statsResp.Body.Close()
234+
if readErr != nil {
235+
results <- result{}
236+
return
237+
}
238+
cpuPct, ioBytes, parseErr := parseDockerStats(statsBody)
239+
if parseErr != nil {
240+
results <- result{}
241+
return
242+
}
243+
results <- result{cpuPct, ioBytes}
244+
}(id)
245+
}
246+
247+
// Collect results, using partial data if overall timeout hits.
205248
var totalCPU float64
206249
var totalIO uint64
207-
for _, id := range ids {
208-
statsReq, reqErr := http.NewRequestWithContext(context.Background(), http.MethodGet,
209-
"http://localhost/containers/"+id+"/stats?stream=false&one-shot=true", http.NoBody)
210-
if reqErr != nil {
211-
continue
212-
}
213-
statsResp, doErr := c.httpClient.Do(statsReq)
214-
if doErr != nil {
215-
continue
216-
}
217-
statsBody, readErr := io.ReadAll(statsResp.Body)
218-
statsResp.Body.Close()
219-
if readErr != nil {
220-
continue
221-
}
222-
cpuPct, ioBytes, parseErr := parseDockerStats(statsBody)
223-
if parseErr != nil {
224-
continue
250+
for range ids {
251+
select {
252+
case r := <-results:
253+
totalCPU += r.cpuPct
254+
totalIO += r.ioBytes
255+
case <-ctx.Done():
256+
return count, totalCPU, float64(totalIO)
225257
}
226-
totalCPU += cpuPct
227-
totalIO += ioBytes
228258
}
229259

230260
return count, totalCPU, float64(totalIO)

pkg/guestagent/metrics/metrics.go

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@ func CollectMemoryMetrics() (*api.MemoryMetrics, error) {
2828
}
2929

3030
pressure, _ := os.ReadFile("/proc/pressure/memory")
31-
some10, full10, err := parseProcPressureMemory(pressure)
31+
psi, err := parseProcPressureMemory(pressure)
3232
if err != nil {
3333
return nil, err
3434
}
35-
m.PsiMemorySome_10 = some10
36-
m.PsiMemoryFull_10 = full10
35+
m.PsiMemorySome_10 = psi.Some10
36+
m.PsiMemoryFull_10 = psi.Full10
37+
m.PsiMemorySome_60 = psi.Some60
38+
m.PsiMemoryFull_60 = psi.Full60
3739

3840
return m, nil
3941
}
@@ -72,9 +74,16 @@ func parseProcMeminfo(data []byte) (*api.MemoryMetrics, error) {
7274
return m, scanner.Err()
7375
}
7476

75-
func parseProcPressureMemory(data []byte) (some10, full10 float64, err error) {
77+
// PressureStats holds parsed PSI values from /proc/pressure/memory.
78+
type PressureStats struct {
79+
Some10, Full10 float64
80+
Some60, Full60 float64
81+
}
82+
83+
func parseProcPressureMemory(data []byte) (PressureStats, error) {
84+
var ps PressureStats
7685
if len(data) == 0 {
77-
return 0, 0, nil
86+
return ps, nil
7887
}
7988
scanner := bufio.NewScanner(bytes.NewReader(data))
8089
for scanner.Scan() {
@@ -92,12 +101,24 @@ func parseProcPressureMemory(data []byte) (some10, full10 float64, err error) {
92101
}
93102
switch kind {
94103
case "some":
95-
some10 = val
104+
ps.Some10 = val
105+
case "full":
106+
ps.Full10 = val
107+
}
108+
}
109+
if after, ok := strings.CutPrefix(field, "avg60="); ok {
110+
val, parseErr := strconv.ParseFloat(after, 64)
111+
if parseErr != nil {
112+
continue
113+
}
114+
switch kind {
115+
case "some":
116+
ps.Some60 = val
96117
case "full":
97-
full10 = val
118+
ps.Full60 = val
98119
}
99120
}
100121
}
101122
}
102-
return some10, full10, scanner.Err()
123+
return ps, scanner.Err()
103124
}

0 commit comments

Comments
 (0)