rls: update rls cache metrics to use async gauge framework #8808
base: master
```diff
@@ -79,14 +79,14 @@ var (
 	dataCachePurgeHook = func() {}
 	resetBackoffHook   = func() {}

-	cacheEntriesMetric = estats.RegisterInt64Gauge(estats.MetricDescriptor{
+	cacheEntriesMetric = estats.RegisterInt64AsyncGauge(estats.MetricDescriptor{
 		Name:        "grpc.lb.rls.cache_entries",
 		Description: "EXPERIMENTAL. Number of entries in the RLS cache.",
 		Unit:        "{entry}",
 		Labels:      []string{"grpc.target", "grpc.lb.rls.server_target", "grpc.lb.rls.instance_uuid"},
 		Default:     false,
 	})
-	cacheSizeMetric = estats.RegisterInt64Gauge(estats.MetricDescriptor{
+	cacheSizeMetric = estats.RegisterInt64AsyncGauge(estats.MetricDescriptor{
 		Name:        "grpc.lb.rls.cache_size",
 		Description: "EXPERIMENTAL. The current size of the RLS cache.",
 		Unit:        "By",
```
```diff
@@ -140,7 +140,10 @@ func (rlsBB) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.
 		updateCh: buffer.NewUnbounded(),
 	}
 	lb.logger = internalgrpclog.NewPrefixLogger(logger, fmt.Sprintf("[rls-experimental-lb %p] ", lb))
-	lb.dataCache = newDataCache(maxCacheSize, lb.logger, cc.MetricsRecorder(), opts.Target.String())
+	lb.dataCache = newDataCache(maxCacheSize, lb.logger, opts.Target.String())
+	if metricsRecorder := cc.MetricsRecorder(); metricsRecorder != nil {
+		lb.metricHandler = metricsRecorder.RegisterAsyncReporter(lb, cacheEntriesMetric, cacheSizeMetric)
+	}
 	lb.bg = balancergroup.New(balancergroup.Options{
 		CC:        cc,
 		BuildOpts: opts,
```
```diff
@@ -162,6 +165,9 @@ type rlsBalancer struct {
 	dataCachePurgeHook func()
 	logger             *internalgrpclog.PrefixLogger

+	// metricHandler is the function to deregister the async metric reporter.
+	metricHandler func()
+
 	// If both cacheMu and stateMu need to be acquired, the former must be
 	// acquired first to prevent a deadlock. This order restriction is due to the
 	// fact that in places where we need to acquire both the locks, we always
```

**Contributor** (on the `metricHandler` field): nit: Can we include …
```diff
@@ -488,6 +494,9 @@ func (b *rlsBalancer) Close() {
 	if b.ctrlCh != nil {
 		b.ctrlCh.close()
 	}
+	if b.metricHandler != nil {
+		b.metricHandler()
+	}
 	b.bg.Close()
 	b.stateMu.Unlock()
```

**Contributor** (on the `b.metricHandler != nil` check): We should omit the nil check.
```diff
@@ -702,3 +711,14 @@ func (b *rlsBalancer) releaseChildPolicyReferences(targets []string) {
 	}
 	b.stateMu.Unlock()
 }
+
+// Report reports the metrics data to the provided recorder.
+func (b *rlsBalancer) Report(r estats.AsyncMetricsRecorder) error {
+	b.cacheMu.Lock()
+	defer b.cacheMu.Unlock()
+
+	if b.dataCache == nil {
+		return nil
+	}
+	return b.dataCache.reportMetrics(r)
+}
```

**Contributor** (on the `b.dataCache == nil` check): Same here, we should omit the nil check.

**Contributor** (on lines +717 to +718): In my opinion, we should avoid reporting metrics while holding the mutex. If the metrics collector is slow or hangs (due to slow I/O, for instance), it could potentially block the RLS from functioning. We can fetch the required data while holding the lock, but we should release it before calling the metrics reporter.
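A minimal sketch of that locking pattern, for illustration only: `entryCount` and `currentSize` are hypothetical accessors standing in for whatever `dataCache.reportMetrics` reads today, and metric labels are omitted for brevity.

```go
func (b *rlsBalancer) Report(r estats.AsyncMetricsRecorder) error {
	// Snapshot the values while holding the cache mutex.
	b.cacheMu.Lock()
	entries := b.dataCache.entryCount() // hypothetical accessor
	size := b.dataCache.currentSize()   // hypothetical accessor
	b.cacheMu.Unlock()

	// Record after releasing the lock, so a slow or hung metrics
	// collector cannot block RLS cache operations.
	r.RecordInt64AsyncGauge(cacheEntriesMetric, entries)
	r.RecordInt64AsyncGauge(cacheSizeMetric, size)
	return nil
}
```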
The second file in the diff updates the data cache tests:
```diff
@@ -24,8 +24,8 @@ import (

 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
+	estats "google.golang.org/grpc/experimental/stats"
 	"google.golang.org/grpc/internal/backoff"
-	"google.golang.org/grpc/internal/testutils/stats"
 )

 var (
```
```diff
@@ -120,7 +120,7 @@ func (s) TestLRU_BasicOperations(t *testing.T) {

 func (s) TestDataCache_BasicOperations(t *testing.T) {
 	initCacheEntries()
-	dc := newDataCache(5, nil, &stats.NoopMetricsRecorder{}, "")
+	dc := newDataCache(5, nil, "")
 	for i, k := range cacheKeys {
 		dc.addEntry(k, cacheEntries[i])
 	}
```
```diff
@@ -134,7 +134,7 @@ func (s) TestDataCache_BasicOperations(t *testing.T) {

 func (s) TestDataCache_AddForcesResize(t *testing.T) {
 	initCacheEntries()
-	dc := newDataCache(1, nil, &stats.NoopMetricsRecorder{}, "")
+	dc := newDataCache(1, nil, "")

 	// The first entry in cacheEntries has a minimum expiry time in the future.
 	// This entry would stop the resize operation since we do not evict entries
```
```diff
@@ -163,7 +163,7 @@ func (s) TestDataCache_AddForcesResize(t *testing.T) {

 func (s) TestDataCache_Resize(t *testing.T) {
 	initCacheEntries()
-	dc := newDataCache(5, nil, &stats.NoopMetricsRecorder{}, "")
+	dc := newDataCache(5, nil, "")
 	for i, k := range cacheKeys {
 		dc.addEntry(k, cacheEntries[i])
 	}
```
```diff
@@ -194,7 +194,7 @@ func (s) TestDataCache_Resize(t *testing.T) {

 func (s) TestDataCache_EvictExpiredEntries(t *testing.T) {
 	initCacheEntries()
-	dc := newDataCache(5, nil, &stats.NoopMetricsRecorder{}, "")
+	dc := newDataCache(5, nil, "")
 	for i, k := range cacheKeys {
 		dc.addEntry(k, cacheEntries[i])
 	}
```
```diff
@@ -221,7 +221,7 @@ func (s) TestDataCache_ResetBackoffState(t *testing.T) {
 	}

 	initCacheEntries()
-	dc := newDataCache(5, nil, &stats.NoopMetricsRecorder{}, "")
+	dc := newDataCache(5, nil, "")
 	for i, k := range cacheKeys {
 		dc.addEntry(k, cacheEntries[i])
 	}
```
```diff
@@ -243,6 +243,17 @@ func (s) TestDataCache_ResetBackoffState(t *testing.T) {
 	}
 }

+type testAsyncMetricsRecorder struct {
+	data map[string]int64
+}
+
+func (r *testAsyncMetricsRecorder) RecordInt64AsyncGauge(h *estats.Int64AsyncGaugeHandle, v int64, _ ...string) {
+	if r.data == nil {
+		r.data = make(map[string]int64)
+	}
+	r.data[h.Descriptor().Name] = v
+}
+
 func (s) TestDataCache_Metrics(t *testing.T) {
 	cacheEntriesMetricsTests := []*cacheEntry{
 		{size: 1},
```

**Contributor** (on lines +250 to +255): Can this functionality be added to the existing …

**Contributor** (on lines +251 to +253): We should ensure the object is initialized with a non-nil map during creation (via a constructor or otherwise). This makes it easier to reason about the state and avoids potential …
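A hypothetical sketch of the constructor approach suggested above (the helper name is illustrative only); with the map initialized up front, the nil check inside `RecordInt64AsyncGauge` could be dropped:

```go
func newTestAsyncMetricsRecorder() *testAsyncMetricsRecorder {
	// Initialize the map at creation time so callers never observe a nil map.
	return &testAsyncMetricsRecorder{data: make(map[string]int64)}
}
```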
```diff
@@ -251,8 +262,8 @@ func (s) TestDataCache_Metrics(t *testing.T) {
 		{size: 4},
 		{size: 5},
 	}
-	tmr := stats.NewTestMetricsRecorder()
-	dc := newDataCache(50, nil, tmr, "")
+	tmr := &testAsyncMetricsRecorder{}
+	dc := newDataCache(50, nil, "")

 	dc.updateRLSServerTarget("rls-server-target")
 	for i, k := range cacheKeys {
```
```diff
@@ -261,42 +272,33 @@ func (s) TestDataCache_Metrics(t *testing.T) {

 	const cacheEntriesKey = "grpc.lb.rls.cache_entries"
 	const cacheSizeKey = "grpc.lb.rls.cache_size"
-	// 5 total entries which add up to 15 size, so should record that.
-	if got, _ := tmr.Metric(cacheEntriesKey); got != 5 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheEntriesKey, got, 5)
-	}
-	if got, _ := tmr.Metric(cacheSizeKey); got != 15 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheSizeKey, got, 15)
+	verifyMetrics := func(wantEntries, wantSize int64) {
+		t.Helper()
+		tmr.data = nil
+		dc.reportMetrics(tmr)
+		if got := tmr.data[cacheEntriesKey]; got != wantEntries {
+			t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheEntriesKey, got, wantEntries)
+		}
+		if got := tmr.data[cacheSizeKey]; got != wantSize {
+			t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheSizeKey, got, wantSize)
+		}
 	}

+	// 5 total entries which add up to 15 size.
+	verifyMetrics(5, 15)
+
 	// Resize down the cache to 2 entries (deterministic as based of LRU).
 	dc.resize(9)
-	if got, _ := tmr.Metric(cacheEntriesKey); got != 2 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheEntriesKey, got, 2)
-	}
-	if got, _ := tmr.Metric(cacheSizeKey); got != 9 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheSizeKey, got, 9)
-	}
+	verifyMetrics(2, 9)

 	// Update an entry to have size 6. This should reflect in the size metrics,
 	// which will increase by 1 to 11, while the number of cache entries should
 	// stay same. This write is deterministic and writes to the last one.
 	dc.updateEntrySize(cacheEntriesMetricsTests[4], 6)

-	if got, _ := tmr.Metric(cacheEntriesKey); got != 2 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheEntriesKey, got, 2)
-	}
-	if got, _ := tmr.Metric(cacheSizeKey); got != 10 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheSizeKey, got, 10)
-	}
+	verifyMetrics(2, 10)

 	// Delete this scaled up cache key. This should scale down the cache to 1
 	// entries, and remove 6 size so cache size should be 4.
 	dc.deleteAndCleanup(cacheKeys[4], cacheEntriesMetricsTests[4])
-	if got, _ := tmr.Metric(cacheEntriesKey); got != 1 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheEntriesKey, got, 1)
-	}
-	if got, _ := tmr.Metric(cacheSizeKey); got != 4 {
-		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheSizeKey, got, 4)
-	}
+	verifyMetrics(1, 4)
 }
```

**Contributor** (on the `verifyMetrics` closure): This closure seems like an assertions helper, which is discouraged by the style guide: https://google.github.io/styleguide/go/best-practices#leave-testing-to-the-test-function. It does have useful error messages though. I would still recommend inlining the logic since it's just a few lines anyway.
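For comparison, inlining one of the checks as that comment suggests might look like the sketch below, reusing the values already asserted for the resize step in the diff above:

```go
// Resize down the cache to 2 entries and verify the reported metrics inline.
dc.resize(9)
tmr.data = nil
dc.reportMetrics(tmr)
if got := tmr.data[cacheEntriesKey]; got != 2 {
	t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheEntriesKey, got, 2)
}
if got := tmr.data[cacheSizeKey]; got != 9 {
	t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", cacheSizeKey, got, 9)
}
```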
**Contributor** (on the `cc.MetricsRecorder()` nil check in `Build`): We can remove the nil check here. `cc.MetricsRecorder()` must always return a non-nil metrics recorder. The `ClientConn` implementation provided by gRPC ensures this, and we enforce that LB policies embed a valid `ClientConn` when wrapping. A nil `MetricsRecorder` would indicate a bug in a custom LB policy that is explicitly returning nil.
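Dropping that nil check would reduce the registration in `Build` to a single line, roughly:

```go
// Assumes cc.MetricsRecorder() never returns nil, per the comment above.
lb.metricHandler = cc.MetricsRecorder().RegisterAsyncReporter(lb, cacheEntriesMetric, cacheSizeMetric)
```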