Skip to content

Commit 34ea61f

Browse files
committed
suppress metrics for short-lived containers
1 parent d6dae5c commit 34ea61f

2 files changed

Lines changed: 32 additions & 5 deletions

File tree

containers/container.go

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,26 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
243243
c.lock.Lock()
244244
defer c.lock.Unlock()
245245

246+
if taskstatsClient != nil {
247+
deadPids := c.updateDelaysLocked()
248+
for _, pid := range deadPids {
249+
c.onProcessExitLocked(pid, false)
250+
}
251+
}
252+
253+
if minAge := *flags.MinContainerAge; minAge > 0 {
254+
if c.startedAt.IsZero() {
255+
return
256+
}
257+
end := time.Now()
258+
if !c.zombieAt.IsZero() && c.zombieAt.Before(end) {
259+
end = c.zombieAt
260+
}
261+
if end.Sub(c.startedAt) < minAge {
262+
return
263+
}
264+
}
265+
246266
if c.metadata.image != "" || !c.metadata.systemd.IsEmpty() {
247267
ch <- gauge(metrics.ContainerInfo, 1, c.metadata.image, c.metadata.systemd.TriggeredBy, c.metadata.systemd.Type)
248268
}
@@ -258,7 +278,6 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
258278
}
259279

260280
if taskstatsClient != nil {
261-
c.updateDelays()
262281
ch <- counter(metrics.CPUDelay, float64(c.delays.cpu)/float64(time.Second))
263282
ch <- counter(metrics.DiskDelay, float64(c.delays.disk)/float64(time.Second))
264283
}
@@ -475,9 +494,7 @@ func (c *Container) onProcessStart(pid uint32) *Process {
475494
return p
476495
}
477496

478-
func (c *Container) onProcessExit(pid uint32, oomKill bool) {
479-
c.lock.Lock()
480-
defer c.lock.Unlock()
497+
func (c *Container) onProcessExitLocked(pid uint32, oomKill bool) {
481498
if p := c.processes[pid]; p != nil {
482499
p.Close()
483500
}
@@ -491,6 +508,12 @@ func (c *Container) onProcessExit(pid uint32, oomKill bool) {
491508
}
492509
}
493510

511+
// onProcessExit is the self-locking entry point for handling a process exit:
// it acquires c.lock, defers the unlock, and delegates to onProcessExitLocked
// with the same pid and oomKill arguments. Code that already holds c.lock
// (as Collect does when reaping pids returned by updateDelaysLocked) calls
// onProcessExitLocked directly instead of going through this wrapper.
func (c *Container) onProcessExit(pid uint32, oomKill bool) {
512+
c.lock.Lock()
513+
defer c.lock.Unlock()
514+
c.onProcessExitLocked(pid, oomKill)
515+
}
516+
494517
func (c *Container) onFileOpen(pid uint32, fd uint64, mnt uint64, log bool) {
495518
if mnt > 0 && !log {
496519
c.lock.Lock()
@@ -852,10 +875,12 @@ func (c *Container) onRetransmission(src netaddr.IPPort, dst netaddr.IPPort) boo
852875
return true
853876
}
854877

855-
func (c *Container) updateDelays() {
878+
func (c *Container) updateDelaysLocked() []uint32 {
879+
var deadPids []uint32
856880
for pid := range c.processes {
857881
stats, err := TaskstatsTGID(pid)
858882
if err != nil {
883+
deadPids = append(deadPids, pid)
859884
continue
860885
}
861886
d := c.delaysByPid[pid]
@@ -865,6 +890,7 @@ func (c *Container) updateDelays() {
865890
d.disk = stats.BlockIODelay
866891
c.delaysByPid[pid] = d
867892
}
893+
return deadPids
868894
}
869895

870896
func (c *Container) updateJvmProfilingStats(u *ProfilingUpdate) {

flags/flags.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ var (
2323

2424
ContainerAllowlist = kingpin.Flag("container-allowlist", "List of allowed containers (regex patterns)").Envar("CONTAINER_ALLOWLIST").Strings()
2525
ContainerDenylist = kingpin.Flag("container-denylist", "List of denied containers (regex patterns)").Envar("CONTAINER_DENYLIST").Strings()
26+
MinContainerAge = kingpin.Flag("min-container-age", "Don't report metrics for containers younger than this. Suppresses short-lived job/cronjob pods that produce high-cardinality series. 0 disables.").Default("30s").Envar("MIN_CONTAINER_AGE").Duration()
2627

2728
SkipSystemdSystemServices = kingpin.Flag("skip-systemd-system-services", "Skip well-known systemd system services (apt, motd, udev, etc.)").Default("true").Envar("SKIP_SYSTEMD_SYSTEM_SERVICES").Bool()
2829

0 commit comments

Comments
 (0)