Skip to content

Commit 4ae70a8

Browse files
committed
metrics: Add debug metric for non-critical errors
Currently, tetragon_errors_total also counts conditions that are not real errors, such as ProcessMetadataUsernameIgnoredNotInHost. This condition occurs frequently in containers and usually requires no action, but it inflates the error metrics. This change adds a new tetragon_debug_events_total metric for such benign, expected conditions, and moves ProcessMetadataUsernameIgnoredNotInHost to it so that the main error count remains useful for spotting real problems. Signed-off-by: Aritra Dey <[email protected]>
1 parent 2649af9 commit 4ae70a8

File tree

4 files changed

+75
-9
lines changed

4 files changed

+75
-9
lines changed

docs/content/en/docs/reference/metrics.md

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// Copyright Authors of Tetragon
3+
4+
package errormetrics
5+
6+
import (
7+
"maps"
8+
"slices"
9+
10+
"github.com/prometheus/client_golang/prometheus"
11+
12+
"github.com/cilium/tetragon/pkg/metrics"
13+
"github.com/cilium/tetragon/pkg/metrics/consts"
14+
)
15+
16+
// DebugType identifies a non-critical condition that is counted by the debug
// events metric rather than by the error metric.
type DebugType int

const (
	// ProcessMetadataUsernameIgnoredNotInHost means the username resolution
	// was skipped since the process is not in host namespaces.
	ProcessMetadataUsernameIgnoredNotInHost DebugType = iota
)

// debugTypeLabelValues maps every known DebugType to the string used as its
// metric label value.
var debugTypeLabelValues = map[DebugType]string{
	ProcessMetadataUsernameIgnoredNotInHost: "process_metadata_username_ignored_not_in_host_namespaces",
}

// String returns the label value registered for e in debugTypeLabelValues, or
// the empty string when e has no entry there.
func (e DebugType) String() string {
	return debugTypeLabelValues[e]
}
31+
32+
var (
33+
// Constrained label for debug type
34+
debugTypeLabel = metrics.ConstrainedLabel{
35+
Name: "type",
36+
Values: slices.Collect(maps.Values(debugTypeLabelValues)),
37+
}
38+
39+
DebugTotal = metrics.MustNewCounter(
40+
metrics.NewOpts(
41+
consts.MetricsNamespace, "", "debug_events_total",
42+
"The total number of Tetragon debug events. For internal use only.",
43+
nil, []metrics.ConstrainedLabel{debugTypeLabel}, nil,
44+
),
45+
nil,
46+
)
47+
)
48+
49+
// Get a new handle on a DebugTotal metric for a DebugType
50+
func GetDebugTotal(er DebugType) prometheus.Counter {
51+
return DebugTotal.WithLabelValues(er.String())
52+
}
53+
54+
// Increment a DebugTotal for a DebugType
55+
func DebugTotalInc(er DebugType) {
56+
GetDebugTotal(er).Inc()
57+
}

pkg/metrics/errormetrics/errormetrics.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,12 @@ const (
2424
EventFinalizeProcessInfoFailed
2525
// Failed to resolve Process uid to username
2626
ProcessMetadataUsernameFailed
27-
// The username resolution was skipped since the process is not in host
28-
// namespaces.
29-
ProcessMetadataUsernameIgnoredNotInHost
3027
)
3128

3229
var errorTypeLabelValues = map[ErrorType]string{
33-
ProcessPidTidMismatch: "process_pid_tid_mismatch",
34-
EventFinalizeProcessInfoFailed: "event_finalize_process_info_failed",
35-
ProcessMetadataUsernameFailed: "process_metadata_username_failed",
36-
ProcessMetadataUsernameIgnoredNotInHost: "process_metadata_username_ignored_not_in_host_namespaces",
30+
ProcessPidTidMismatch: "process_pid_tid_mismatch",
31+
EventFinalizeProcessInfoFailed: "event_finalize_process_info_failed",
32+
ProcessMetadataUsernameFailed: "process_metadata_username_failed",
3733
}
3834

3935
func (e ErrorType) String() string {
@@ -106,6 +102,7 @@ var (
106102
// RegisterMetrics registers ErrorTotal, HandlerErrors and DebugTotal with group.
func RegisterMetrics(group metrics.Group) {
107103
group.MustRegister(ErrorTotal)
108104
group.MustRegister(HandlerErrors)
105+
group.MustRegister(DebugTotal)
109106
}
110107

111108
func InitMetrics() {
@@ -121,6 +118,10 @@ func InitMetrics() {
121118
// NB: We initialize only ops.MSG_OP_UNDEF here, but unknown_opcode can occur for any opcode
122119
// that is not explicitly handled.
123120
GetHandlerErrors(ops.MSG_OP_UNDEF, HandlePerfUnknownOp).Add(0)
121+
122+
for er := range debugTypeLabelValues {
123+
GetDebugTotal(er).Add(0)
124+
}
124125
}
125126

126127
// Get a new handle on an ErrorTotal metric for an ErrorType

pkg/sensors/exec/userinfo/userinfo.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func MsgToExecveAccountUnix(unix *processapi.MsgExecveEventUnix) error {
4242
}
4343

4444
if errors.Is(err, ErrNotInHostNs) {
45-
errormetrics.ErrorTotalInc(errormetrics.ProcessMetadataUsernameIgnoredNotInHost)
45+
errormetrics.DebugTotalInc(errormetrics.ProcessMetadataUsernameIgnoredNotInHost)
4646
} else {
4747
errormetrics.ErrorTotalInc(errormetrics.ProcessMetadataUsernameFailed)
4848
}

0 commit comments

Comments
 (0)