@@ -54,7 +54,7 @@ func NewCollector(dockerSocket *string) *Collector {
5454 return (& net.Dialer {}).DialContext (ctx , "unix" , c .dockerSocket )
5555 },
5656 },
57- Timeout : 5 * time . Second ,
57+ // No client-level timeout — per-request context deadlines control timeouts.
5858 }
5959 }
6060 return c
@@ -119,7 +119,7 @@ func safeDelta(curr, prev uint64) uint64 {
119119
120120// Collect gathers all memory metrics and returns a MemoryMetrics protobuf.
121121// This is the main entry point called by the guest agent gRPC server.
122- func (c * Collector ) Collect () (* api.MemoryMetrics , error ) {
122+ func (c * Collector ) Collect (ctx context. Context ) (* api.MemoryMetrics , error ) {
123123 // 1. /proc/meminfo + /proc/pressure/memory (no lock needed — pure reads).
124124 meminfo , err := os .ReadFile ("/proc/meminfo" )
125125 if err != nil {
@@ -131,12 +131,14 @@ func (c *Collector) Collect() (*api.MemoryMetrics, error) {
131131 }
132132
133133 pressure , _ := os .ReadFile ("/proc/pressure/memory" )
134- some10 , full10 , parseErr := parseProcPressureMemory (pressure )
134+ psi , parseErr := parseProcPressureMemory (pressure )
135135 if parseErr != nil {
136136 return nil , parseErr
137137 }
138- m .PsiMemorySome_10 = some10
139- m .PsiMemoryFull_10 = full10
138+ m .PsiMemorySome_10 = psi .Some10
139+ m .PsiMemoryFull_10 = psi .Full10
140+ m .PsiMemorySome_60 = psi .Some60
141+ m .PsiMemoryFull_60 = psi .Full60
140142
141143 // 2. /proc/vmstat for swap rates, page faults, OOM.
142144 vmstatData , vmstatErr := os .ReadFile ("/proc/vmstat" )
@@ -145,7 +147,7 @@ func (c *Collector) Collect() (*api.MemoryMetrics, error) {
145147 var dockerCount int
146148 var dockerCPU , dockerIO float64
147149 if c .httpClient != nil {
148- dockerCount , dockerCPU , dockerIO = c .collectDockerStats ()
150+ dockerCount , dockerCPU , dockerIO = c .collectDockerStats (ctx )
149151 }
150152
151153 // Hold lock only for internal state updates and reads.
@@ -171,10 +173,15 @@ func (c *Collector) Collect() (*api.MemoryMetrics, error) {
171173}
172174
173175// collectDockerStats queries the Docker socket for container count,
174- // aggregate CPU%, and aggregate IO bytes/sec. Returns zeros on error.
175- func (c * Collector ) collectDockerStats () (count int , cpuPercent , ioBytesPerSec float64 ) {
176+ // aggregate CPU%, and aggregate IO bytes/sec. Containers are polled
177+ // in parallel with a 3-second overall timeout. Returns zeros on error.
178+ func (c * Collector ) collectDockerStats (ctx context.Context ) (count int , cpuPercent , ioBytesPerSec float64 ) {
179+ // Overall timeout: 3 seconds fits within the 10-second balloon poll interval.
180+ ctx , cancel := context .WithTimeout (ctx , 3 * time .Second )
181+ defer cancel ()
182+
176183 // List running containers.
177- listReq , err := http .NewRequestWithContext (context . Background () , http .MethodGet ,
184+ listReq , err := http .NewRequestWithContext (ctx , http .MethodGet ,
178185 "http://localhost/containers/json?filters=%7B%22status%22%3A%5B%22running%22%5D%7D" , http .NoBody )
179186 if err != nil {
180187 logrus .Debugf ("Docker stats: failed to create list request: %v" , err )
@@ -201,30 +208,53 @@ func (c *Collector) collectDockerStats() (count int, cpuPercent, ioBytesPerSec f
201208 return 0 , 0 , 0
202209 }
203210
204- // Aggregate stats from each container (best-effort, skip failures).
211+ // Poll all containers in parallel with per-container 1-second timeout.
212+ type result struct {
213+ cpuPct float64
214+ ioBytes uint64
215+ }
216+ results := make (chan result , len (ids ))
217+ for _ , id := range ids {
218+ go func (cid string ) {
219+ cctx , ccancel := context .WithTimeout (ctx , 1 * time .Second )
220+ defer ccancel ()
221+ statsReq , reqErr := http .NewRequestWithContext (cctx , http .MethodGet ,
222+ "http://localhost/containers/" + cid + "/stats?stream=false&one-shot=true" , http .NoBody )
223+ if reqErr != nil {
224+ results <- result {}
225+ return
226+ }
227+ statsResp , doErr := c .httpClient .Do (statsReq )
228+ if doErr != nil {
229+ results <- result {}
230+ return
231+ }
232+ statsBody , readErr := io .ReadAll (statsResp .Body )
233+ statsResp .Body .Close ()
234+ if readErr != nil {
235+ results <- result {}
236+ return
237+ }
238+ cpuPct , ioBytes , parseErr := parseDockerStats (statsBody )
239+ if parseErr != nil {
240+ results <- result {}
241+ return
242+ }
243+ results <- result {cpuPct , ioBytes }
244+ }(id )
245+ }
246+
247+ // Collect results, using partial data if overall timeout hits.
205248 var totalCPU float64
206249 var totalIO uint64
207- for _ , id := range ids {
208- statsReq , reqErr := http .NewRequestWithContext (context .Background (), http .MethodGet ,
209- "http://localhost/containers/" + id + "/stats?stream=false&one-shot=true" , http .NoBody )
210- if reqErr != nil {
211- continue
212- }
213- statsResp , doErr := c .httpClient .Do (statsReq )
214- if doErr != nil {
215- continue
216- }
217- statsBody , readErr := io .ReadAll (statsResp .Body )
218- statsResp .Body .Close ()
219- if readErr != nil {
220- continue
221- }
222- cpuPct , ioBytes , parseErr := parseDockerStats (statsBody )
223- if parseErr != nil {
224- continue
250+ for range ids {
251+ select {
252+ case r := <- results :
253+ totalCPU += r .cpuPct
254+ totalIO += r .ioBytes
255+ case <- ctx .Done ():
256+ return count , totalCPU , float64 (totalIO )
225257 }
226- totalCPU += cpuPct
227- totalIO += ioBytes
228258 }
229259
230260 return count , totalCPU , float64 (totalIO )
0 commit comments