Skip to content

Commit 0719073

Browse files
ryanrolds and nfuden authored
Remove status metrics for deleted resources (#10597)
Co-authored-by: Nathan Fudenberg <[email protected]> Co-authored-by: changelog-bot <changelog-bot>
1 parent e1a5574 commit 0719073

25 files changed

+520
-27
lines changed

.github/workflows/pr-kubernetes-tests.yaml

+15-8
Original file line numberDiff line numberDiff line change
@@ -55,40 +55,47 @@ jobs:
5555
# NOTE: We use the GitHub action step time (as opposed to the `go test` time), because it is easier to capture
5656

5757
test:
58-
# Dec 4, 2024: 22 minutes
58+
# 2024-12-04: 22m
59+
# 2025-02-13: 29m3s
5960
- cluster-name: 'cluster-one'
6061
go-test-args: '-v -timeout=25m'
6162
go-test-run-regex: '^TestK8sGateway$$/^RouteDelegation$$|^TestGlooctlGlooGatewayEdgeGateway$$|^TestGlooctlK8sGateway$$'
6263

63-
# Dec 4, 2024: 23 minutes
64+
# 2024-12-04: 23m
65+
# 2025-02-13: 30m30s
6466
- cluster-name: 'cluster-two'
6567
go-test-args: '-v -timeout=25m'
6668
go-test-run-regex: '^TestK8sGatewayIstioRevision$$|^TestRevisionIstioRegression$$|^TestK8sGateway$$/^Deployer$$|^TestK8sGateway$$/^RouteOptions$$|^TestK8sGateway$$/^VirtualHostOptions$$|^TestK8sGateway$$/^Upstreams$$|^TestK8sGateway$$/^HeadlessSvc$$|^TestK8sGateway$$/^PortRouting$$|^TestK8sGatewayMinimalDefaultGatewayParameters$$|^TestK8sGateway$$/^DirectResponse$$|^TestK8sGateway$$/^HttpListenerOptions$$|^TestK8sGateway$$/^ListenerOptions$$|^TestK8sGateway$$/^GlooAdminServer$$'
6769

68-
# Dec 4, 2024: 24 minutes
70+
# 2024-12-04: 24m
71+
# 2025-02-13: 31m49s
6972
- cluster-name: 'cluster-three'
7073
go-test-args: '-v -timeout=30m'
7174
go-test-run-regex: '(^TestK8sGatewayIstioAutoMtls$$|^TestAutomtlsIstioEdgeApisGateway$$|^TestIstioEdgeApiGateway$$|^TestIstioRegression$$)'
7275

73-
# Dec 4, 2024: 21 minutes
76+
# 2024-12-04: 21m
77+
# 2025-02-13: 28m3s
7478
- cluster-name: 'cluster-four'
7579
go-test-args: '-v -timeout=30m'
7680
go-test-run-regex: '(^TestK8sGatewayIstio$$|^TestGlooGatewayEdgeGateway$$|^TestGlooctlIstioInjectEdgeApiGateway$$)'
7781

78-
# Dec 4, 2024: 24 minutes
82+
# 2024-12-04: 24m
83+
# 2025-02-13: 35m21s
7984
- cluster-name: 'cluster-five'
8085
go-test-args: '-v -timeout=30m'
8186
go-test-run-regex: '^TestFullEnvoyValidation$$|^TestValidationStrict$$|^TestValidationAlwaysAccept$$|^TestTransformationValidationDisabled$$'
8287

83-
# Dec 4, 2024: 26 minutes
88+
# 2024-12-04: 26m
89+
# 2025-02-13: 33m19s
8490
- cluster-name: 'cluster-six'
8591
go-test-args: '-v -timeout=30m'
8692
go-test-run-regex: '^TestDiscoveryWatchlabels$$|^TestK8sGatewayNoValidation$$|^TestHelm$$|^TestHelmSettings$$|^TestK8sGatewayAws$$|^TestK8sGateway$$/^HTTPRouteServices$$|^TestK8sGateway$$/^TCPRouteServices$$'
8793

88-
# Dec 4, 2024: 16 minutes
94+
# 2024-12-04: 16m
95+
# 2025-02-13: 26m29s
8996
- cluster-name: 'cluster-seven'
9097
go-test-args: '-v -timeout=25m'
91-
go-test-run-regex: '^TestK8sGateway$$/^CRDCategories$$|^TestK8sGateway$$/^Metrics$$|^TestGloomtlsGatewayEdgeGateway$$|^TestGloomtlsGatewayK8sGateway$$|^TestWatchNamespaceSelector$$'
98+
go-test-run-regex: '^TestK8sGateway$$/^CRDCategories$$|^TestK8sGateway$$/^Metrics$$|^TestGloomtlsGatewayEdgeGateway$$|^TestGloomtlsGatewayK8sGateway$$|^TestGlooGatewayEdgeGatewayClearMetrics$$|^TestWatchNamespaceSelector$$'
9299

93100
# In our PR tests, we run the suite of tests using the upper ends of versions that we claim to support
94101
# The versions should mirror: https://docs.solo.io/gloo-edge/latest/reference/support/
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
changelog:
2+
- type: FIX
3+
description: |
4+
Added the ability to control whether status metrics for a resource stop being reported after the resource is deleted.
5+
If a resource was invalid and deleted, a status metric indicating a problem was left behind.
6+
This could lead to confusion and false alarms.
7+
8+
Setting `.Values.gloo.clearStatusMetrics` to `true` will result in metrics
9+
for deleted resources no longer being reported.
10+
This may cause metric scraping to occasionally miss status metrics.
11+
issueLink: https://github.com/kgateway-dev/kgateway/issues/6938
12+
resolvesIssue: false

docs/content/reference/values.txt

+1
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@
515515
|gloo.headerSecretRefNsMatchesUs|bool||Set to true to require that secrets sent in headers via headerSecretRefs come from the same namespace as the destination upstream. Default: false|
516516
|gloo.podDisruptionBudget.minAvailable|string||Corresponds directly with the _minAvailable_ field in the [PodDisruptionBudgetSpec](https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/pod-disruption-budget-v1/#PodDisruptionBudgetSpec). This value is mutually exclusive with _maxUnavailable_.|
517517
|gloo.podDisruptionBudget.maxUnavailable|string||Corresponds directly with the _maxUnavailable_ field in the [PodDisruptionBudgetSpec](https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/pod-disruption-budget-v1/#PodDisruptionBudgetSpec). This value is mutually exclusive with _minAvailable_.|
518+
|gloo.clearStatusMetrics|bool||Set to true to clear status metrics for deleted resources. This may cause metric scraping to infrequently not see status metrics. Default is false.|
518519
|discovery.deployment.image.tag|string|<release_version, ex: 1.2.3>|The image tag for the container.|
519520
|discovery.deployment.image.repository|string|discovery|The image repository (name) for the container.|
520521
|discovery.deployment.image.digest|string||The container image's hash digest (e.g. 'sha256:12345...'), consumed when variant=standard.|

docs/content/static/content/osa_provided.md

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ Name|Version|License
4040
[onsi/gomega](https://github.com/onsi/gomega)|v1.35.0|MIT License
4141
[pkg/browser](https://github.com/pkg/browser)|v0.0.0-20180916011732-0a3d74bf9ce4|BSD 2-clause "Simplified" License
4242
[pkg/errors](https://github.com/pkg/errors)|v0.9.1|BSD 2-clause "Simplified" License
43+
[prometheus/client_model](https://github.com/prometheus/client_model)|v0.6.1|Apache License 2.0
44+
[prometheus/common](https://github.com/prometheus/common)|v0.60.1|Apache License 2.0
4345
[go-ruleguard/dsl](https://github.com/quasilyte/go-ruleguard)|v0.3.22|BSD 3-clause "New" or "Revised" License
4446
[rotisserie/eris](https://github.com/rotisserie/eris)|v0.5.4|MIT License
4547
[saiskee/gettercheck](https://github.com/saiskee/gettercheck)|v0.0.0-20210820204958-38443d06ebe0|MIT License

go.mod

+2-2
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ require (
9898
github.com/google/go-cmp v0.6.0
9999
github.com/google/uuid v1.6.0
100100
github.com/mccutchen/go-httpbin/v2 v2.15.0
101+
github.com/prometheus/client_model v0.6.1
102+
github.com/prometheus/common v0.60.1
101103
github.com/quasilyte/go-ruleguard/dsl v0.3.22
102104
github.com/stoewer/go-strcase v1.3.0
103105
github.com/stretchr/testify v1.9.0
@@ -283,8 +285,6 @@ require (
283285
github.com/planetscale/vtprotobuf v0.6.1-0.20240409071808-615f978279ca // indirect
284286
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
285287
github.com/prometheus/client_golang v1.20.5 // indirect
286-
github.com/prometheus/client_model v0.6.1 // indirect
287-
github.com/prometheus/common v0.60.1 // indirect
288288
github.com/prometheus/procfs v0.15.1 // indirect
289289
github.com/prometheus/statsd_exporter v0.21.0 // indirect
290290
github.com/pseudomuto/protoc-gen-doc v1.5.1 // indirect

install/helm/gloo/generate/values.go

+1
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ type Gloo struct {
313313
DisableLeaderElection *bool `json:"disableLeaderElection,omitempty" desc:"Set to true to disable leader election, and ensure all running replicas are considered the leader. Do not enable this with multiple replicas of Gloo"`
314314
HeaderSecretRefNsMatchesUs *bool `json:"headerSecretRefNsMatchesUs,omitempty" desc:"Set to true to require that secrets sent in headers via headerSecretRefs come from the same namespace as the destination upstream. Default: false"`
315315
PodDisruptionBudget *PodDisruptionBudget `json:"podDisruptionBudget,omitempty"`
316+
ClearStatusMetrics *bool `json:"clearStatusMetrics,omitempty" desc:"Set to true to clear status metrics for deleted resources. This may cause metric scraping to infrequently not see status metrics. Default is false."`
316317
}
317318

318319
type KubeGateway struct {

install/helm/gloo/templates/1-gloo-deployment.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,10 @@ spec:
277277
- name: GLOO_MTLS_SDS_ENABLED
278278
value: "true"
279279
{{- end }}
280+
{{- if .Values.gloo.clearStatusMetrics }}
281+
- name: GLOO_CLEAR_STATUS_METRICS
282+
value: "true"
283+
{{- end }}
280284
{{- if not .Values.global.glooMtls.enabled }}
281285
readinessProbe:
282286
tcpSocket:

pkg/utils/kubeutils/portforward/api_forwarder.go

+5
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ func (f *apiPortForwarder) Address() string {
129129
return net.JoinHostPort(f.properties.localAddress, strconv.Itoa(f.properties.localPort))
130130
}
131131

132+
// LocalPort returns the local port that is being forwarded to the remote port
133+
func (f *apiPortForwarder) LocalPort() int {
134+
return f.properties.localPort
135+
}
136+
132137
func (f *apiPortForwarder) Close() {
133138
close(f.stopCh)
134139
// Closing the stop channel should close anything

pkg/utils/kubeutils/portforward/cli_portforwarder.go

+5
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ func (c *cliPortForwarder) Address() string {
111111
return net.JoinHostPort(c.properties.localAddress, strconv.Itoa(c.properties.localPort))
112112
}
113113

114+
// LocalPort returns the local port that is being forwarded to the remote port
115+
func (c *cliPortForwarder) LocalPort() int {
116+
return c.properties.localPort
117+
}
118+
114119
func (c *cliPortForwarder) Close() {
115120
if c.cmdCancel != nil {
116121
c.cmdCancel()

pkg/utils/kubeutils/portforward/portforwarder.go

+3
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ type PortForwarder interface {
1818
// Address returns the local forwarded address. Only valid while the apiPortForwarder is running.
1919
Address() string
2020

21+
// LocalPort returns the local port that is being forwarded to the remote port.
22+
LocalPort() int
23+
2124
// Close this apiPortForwarder and release any resources.
2225
Close()
2326

pkg/utils/statsutils/metrics/metrics.go

+61-7
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,27 @@ import (
44
"context"
55
"fmt"
66
"strings"
7+
"time"
78

89
errors "github.com/rotisserie/eris"
10+
"github.com/solo-io/gloo/pkg/utils/envutils"
911
"github.com/solo-io/gloo/pkg/utils/statsutils"
1012
gwv1 "github.com/solo-io/gloo/projects/gateway/pkg/api/v1"
1113
gloov1 "github.com/solo-io/gloo/projects/gloo/pkg/api/v1"
1214
"github.com/solo-io/go-utils/contextutils"
1315
"github.com/solo-io/solo-kit/pkg/api/v1/resources"
1416
"github.com/solo-io/solo-kit/pkg/api/v1/resources/core"
1517
"go.opencensus.io/stats"
18+
"go.opencensus.io/stats/view"
1619
"go.opencensus.io/tag"
1720
"k8s.io/apimachinery/pkg/runtime/schema"
1821
"k8s.io/client-go/util/jsonpath"
1922
)
2023

24+
const (
25+
ClearStatusMetricsEnvVar = "GLOO_CLEAR_STATUS_METRICS"
26+
)
27+
2128
type MetricLabels = gloov1.Settings_ObservabilityOptions_MetricLabels
2229

2330
var Names = map[schema.GroupVersionKind]string{
@@ -43,6 +50,7 @@ var descriptions = map[schema.GroupVersionKind]string{
4350
// ConfigStatusMetrics is a collection of metrics, each of which records if the configuration for
4451
// a particular resource type is valid
4552
type ConfigStatusMetrics struct {
53+
opts map[string]*MetricLabels
4654
metrics map[schema.GroupVersionKind]*resourceMetric
4755
}
4856

@@ -60,21 +68,33 @@ func GetDefaultConfigStatusOptions() map[string]*MetricLabels {
6068
// NewConfigStatusMetrics creates and returns a ConfigStatusMetrics from the specified options.
6169
// If the options are invalid, an error is returned.
6270
func NewConfigStatusMetrics(opts map[string]*MetricLabels) (ConfigStatusMetrics, error) {
71+
metrics, err := prepareMetrics(opts)
72+
if err != nil {
73+
return ConfigStatusMetrics{}, err
74+
}
75+
6376
configMetrics := ConfigStatusMetrics{
64-
metrics: make(map[schema.GroupVersionKind]*resourceMetric),
77+
opts: opts,
78+
metrics: metrics,
6579
}
80+
81+
return configMetrics, nil
82+
}
83+
84+
func prepareMetrics(opts map[string]*MetricLabels) (map[schema.GroupVersionKind]*resourceMetric, error) {
85+
metrics := make(map[schema.GroupVersionKind]*resourceMetric)
6686
for gvkString, labels := range opts {
6787
gvk, err := parseGroupVersionKind(gvkString)
6888
if err != nil {
69-
return ConfigStatusMetrics{}, err
89+
return map[schema.GroupVersionKind]*resourceMetric{}, err
7090
}
7191
metric, err := newResourceMetric(gvk, labels.GetLabelToPath())
7292
if err != nil {
73-
return ConfigStatusMetrics{}, err
93+
return map[schema.GroupVersionKind]*resourceMetric{}, err
7494
}
75-
configMetrics.insertMetric(gvk, metric)
95+
metrics[gvk] = metric
7696
}
77-
return configMetrics, nil
97+
return metrics, nil
7898
}
7999

80100
func parseGroupVersionKind(arg string) (schema.GroupVersionKind, error) {
@@ -156,8 +176,42 @@ func (m *ConfigStatusMetrics) SetResourceInvalid(ctx context.Context, resource r
156176
}
157177
}
158178

159-
func (m *ConfigStatusMetrics) insertMetric(gvk schema.GroupVersionKind, metric *resourceMetric) {
160-
m.metrics[gvk] = metric
179+
// ClearMetrics removes all metrics from the ConfigStatusMetrics
180+
func (m *ConfigStatusMetrics) ClearMetrics(ctx context.Context) {
181+
// Our current metrics package uses a channel to unregister views,
182+
// which forces ClearMetrics to sleep after unregistering them.
183+
// We are concerned that the required sleep may cause metrics to flicker.
184+
// So, we are making this behavior opt-in.
185+
// This is a temporary solution until we upgrade to another metrics package.
186+
if !envutils.IsEnvTruthy(ClearStatusMetricsEnvVar) {
187+
return
188+
}
189+
190+
someViewsUnregistered := false
191+
192+
// Iterate through the resource metrics and unregister them
193+
for _, metric := range m.metrics {
194+
v := view.Find(metric.gauge.Name())
195+
if v != nil {
196+
view.Unregister(v)
197+
someViewsUnregistered = true
198+
}
199+
}
200+
201+
// Only sleep when some metrics were unregistered
202+
if someViewsUnregistered {
203+
// Wait for the view to be unregistered (a channel is used)
204+
// This is necessary because the view is unregistered asynchronously.
205+
// We may not need this after we upgrade to a newer metrics package.
206+
time.Sleep(1 * time.Second)
207+
}
208+
209+
// Add fresh metrics
210+
var err error
211+
m.metrics, err = prepareMetrics(m.opts)
212+
if err != nil {
213+
contextutils.LoggerFrom(ctx).Errorf("Error clearing resource metrics: %s", err.Error())
214+
}
161215
}
162216

163217
func getMutators(metric *resourceMetric, resource resources.Resource) ([]tag.Mutator, error) {

pkg/utils/statsutils/metrics/metrics_test.go

+69
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package metrics_test
22

33
import (
44
"context"
5+
"os"
56

67
. "github.com/onsi/ginkgo/v2"
78
. "github.com/onsi/gomega"
@@ -131,4 +132,72 @@ var _ = Describe("ConfigStatusMetrics Test", func() {
131132
Entry("Secret", "Secret.v1.gloo.solo.io", metrics.Names[gloov1.SecretGVK], makeSecret),
132133
Entry("Proxy", "Proxy.v1.gloo.solo.io", metrics.Names[gloov1.ProxyGVK], makeProxy),
133134
)
135+
136+
Describe("ClearMetrics", func() {
137+
previousValue := os.Getenv(metrics.ClearStatusMetricsEnvVar)
138+
139+
AfterEach(func() {
140+
os.Setenv(metrics.ClearStatusMetricsEnvVar, previousValue)
141+
})
142+
143+
It("should clear metrics", func() {
144+
os.Setenv(metrics.ClearStatusMetricsEnvVar, "true")
145+
146+
metricName, ok := metrics.Names[gwv1.VirtualServiceGVK]
147+
Expect(ok).To(BeTrue())
148+
149+
opts := map[string]*metrics.MetricLabels{
150+
"VirtualService.v1.gateway.solo.io": {
151+
LabelToPath: map[string]string{
152+
"name": "{.metadata.name}",
153+
},
154+
},
155+
}
156+
c, err := metrics.NewConfigStatusMetrics(opts)
157+
Expect(err).NotTo(HaveOccurred())
158+
Expect(c).NotTo(BeNil())
159+
160+
// Create a single resource
161+
res := makeVirtualService("clear")
162+
resName := res.GetMetadata().GetName()
163+
164+
c.SetResourceInvalid(context.TODO(), res)
165+
Expect(helpers.ReadMetricByLabel(metricName, "name", resName)).To(Equal(1))
166+
167+
c.ClearMetrics(context.TODO())
168+
_, err = helpers.ReadMetricByLabel(metricName, "name", resName)
169+
Expect(err).To(HaveOccurred())
170+
})
171+
172+
It("should not clear metrics if the environment variable is not set", func() {
173+
os.Unsetenv(metrics.ClearStatusMetricsEnvVar)
174+
175+
metricName, ok := metrics.Names[gwv1.VirtualServiceGVK]
176+
Expect(ok).To(BeTrue())
177+
178+
opts := map[string]*metrics.MetricLabels{
179+
"VirtualService.v1.gateway.solo.io": {
180+
LabelToPath: map[string]string{
181+
"name": "{.metadata.name}",
182+
},
183+
},
184+
}
185+
c, err := metrics.NewConfigStatusMetrics(opts)
186+
Expect(err).NotTo(HaveOccurred())
187+
Expect(c).NotTo(BeNil())
188+
189+
// Create a single resource
190+
res := makeVirtualService("clear")
191+
resName := res.GetMetadata().GetName()
192+
193+
c.SetResourceInvalid(context.TODO(), res)
194+
Expect(helpers.ReadMetricByLabel(metricName, "name", resName)).To(Equal(1))
195+
196+
c.ClearMetrics(context.TODO())
197+
198+
v, err := helpers.ReadMetricByLabel(metricName, "name", resName)
199+
Expect(err).NotTo(HaveOccurred())
200+
Expect(v).To(Equal(1))
201+
})
202+
})
134203
})

pkg/utils/statsutils/statsutils.go

+1-5
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,7 @@ func MeasureOne(ctx context.Context, counter *stats.Int64Measure, tags ...tag.Mu
4646

4747
// Measure records the given value to the given counter
4848
func Measure(ctx context.Context, counter *stats.Int64Measure, val int64, tags ...tag.Mutator) {
49-
if err := stats.RecordWithTags(
50-
ctx,
51-
tags,
52-
counter.M(val),
53-
); err != nil {
49+
if err := stats.RecordWithTags(ctx, tags, counter.M(val)); err != nil {
5450
contextutils.LoggerFrom(ctx).Errorf("setting counter %v: %v", counter.Name(), err)
5551
}
5652
}

0 commit comments

Comments
 (0)