Skip to content

Commit cc8f667

Browse files
committed
admission: per-store work queue metrics
This patch splits store work queue metrics to be per-store. It also does some general cleanup around the `storeGrantCoordinators` code to centralize initialization of requesters and metrics. Fixes #131562. Release note: None
1 parent 70c83cc commit cc8f667

File tree

9 files changed

+326
-253
lines changed

9 files changed

+326
-253
lines changed

pkg/kv/kvserver/stores.go

+9
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2222
"github.com/cockroachdb/cockroach/pkg/util/limit"
2323
"github.com/cockroachdb/cockroach/pkg/util/log"
24+
"github.com/cockroachdb/cockroach/pkg/util/metric"
2425
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
2526
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2627
"github.com/cockroachdb/errors"
@@ -326,3 +327,11 @@ func (ls *Stores) CloseDiskMonitors() {
326327
return nil
327328
})
328329
}
330+
331+
// GetStoreMetricRegistry returns the metric registry of the store with the
// provided store ID, or nil if no store with that ID is found.
332+
func (ls *Stores) GetStoreMetricRegistry(storeID roachpb.StoreID) *metric.Registry {
333+
if s, ok := ls.storeMap.Load(storeID); ok {
334+
return s.Registry()
335+
}
336+
return nil
337+
}

pkg/server/node.go

+17-1
Original file line numberDiff line numberDiff line change
@@ -1357,6 +1357,10 @@ func (n *Node) registerEnginesForDiskStatsMap(
13571357
return pmp, nil
13581358
}
13591359

1360+
func (n *Node) makeStoreRegistryProvider() admission.MetricsRegistryProvider {
1361+
return &storeMetricsRegistryProvider{n: n}
1362+
}
1363+
13601364
type nodePebbleMetricsProvider struct {
13611365
n *Node
13621366
diskStatsMap diskStatsMap
@@ -1382,7 +1386,8 @@ func (pmp *nodePebbleMetricsProvider) GetPebbleMetrics() []admission.StoreMetric
13821386
StoreID: store.StoreID(),
13831387
Metrics: m.Metrics,
13841388
WriteStallCount: m.WriteStallCount,
1385-
DiskStats: diskStats})
1389+
DiskStats: diskStats,
1390+
})
13861391
return nil
13871392
})
13881393
return metrics
@@ -1393,6 +1398,17 @@ func (pmp *nodePebbleMetricsProvider) Close() {
13931398
pmp.diskStatsMap.closeDiskMonitors()
13941399
}
13951400

1401+
type storeMetricsRegistryProvider struct {
1402+
n *Node
1403+
}
1404+
1405+
// GetMetricsRegistry implements admission.MetricsRegistryProvider.
1406+
func (mrp *storeMetricsRegistryProvider) GetMetricsRegistry(
1407+
storeID roachpb.StoreID,
1408+
) *metric.Registry {
1409+
return mrp.n.stores.GetStoreMetricRegistry(storeID)
1410+
}
1411+
13961412
// GetTenantWeights implements kvserver.TenantWeightProvider.
13971413
func (n *Node) GetTenantWeights() kvadmission.TenantWeights {
13981414
weights := kvadmission.TenantWeights{

pkg/server/server.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -1954,6 +1954,10 @@ func (s *topLevelServer) PreStart(ctx context.Context) error {
19541954
return errors.Wrapf(err, "failed to register engines for the disk stats map")
19551955
}
19561956

1957+
// Set up a store metrics registry provider to register AC store-level
1958+
// metrics.
1959+
mrp := s.node.makeStoreRegistryProvider()
1960+
19571961
// Stores have been initialized, so Node can now provide Pebble metrics.
19581962
//
19591963
// Note that all existing stores will be operational before Pebble-level
@@ -1962,7 +1966,7 @@ func (s *topLevelServer) PreStart(ctx context.Context) error {
19621966
// existing stores shouldn’t be able to acquire leases yet. Although, below
19631967
// Raft commands like log application and snapshot application may be able
19641968
// to bypass admission control.
1965-
s.storeGrantCoords.SetPebbleMetricsProvider(ctx, pmp, s.node)
1969+
s.storeGrantCoords.SetPebbleMetricsProvider(ctx, pmp, mrp, s.node)
19661970

19671971
// Once all stores are initialized, check if offline storage recovery
19681972
// was done prior to start and record any actions appropriately.

pkg/ui/workspaces/db-console/src/views/cluster/containers/nodeGraphs/dashboards/overload.tsx

+58-69
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,8 @@ import React from "react";
99
import LineGraph from "src/views/cluster/components/linegraph";
1010
import { Metric, Axis } from "src/views/shared/components/metricQuery";
1111

12-
import {
13-
GraphDashboardProps,
14-
nodeDisplayName,
15-
storeIDsForNode,
16-
} from "./dashboardUtils";
12+
import { GraphDashboardProps, nodeDisplayName } from "./dashboardUtils";
13+
import { storeMetrics } from "./storeUtils";
1714

1815
export default function (props: GraphDashboardProps) {
1916
const {
@@ -66,36 +63,32 @@ export default function (props: GraphDashboardProps) {
6663

6764
<LineGraph
6865
title="Admission IO Tokens Exhausted Duration Per Second"
69-
sources={nodeSources}
66+
sources={storeSources}
7067
tenantSource={tenantSource}
7168
showMetricsInTooltip={true}
7269
tooltip={`Relative time the node had exhausted IO tokens for all IO-bound work per second of wall time, measured in microseconds/second. Increased IO token exhausted duration indicates IO resource exhaustion.`}
7370
>
7471
<Axis label="Duration (micros/sec)">
75-
{nodeIDs.map(nid => (
76-
<>
77-
<Metric
78-
key={nid}
79-
name="cr.node.admission.granter.io_tokens_exhausted_duration.kv"
80-
title={
81-
"Regular (Foreground) " +
82-
nodeDisplayName(nodeDisplayNameByID, nid)
83-
}
84-
sources={[nid]}
85-
nonNegativeRate
86-
/>
87-
<Metric
88-
key={nid}
89-
name="cr.node.admission.granter.elastic_io_tokens_exhausted_duration.kv"
90-
title={
91-
"Elastic (Background) " +
92-
nodeDisplayName(nodeDisplayNameByID, nid)
93-
}
94-
sources={[nid]}
95-
nonNegativeRate
96-
/>
97-
</>
98-
))}
72+
{storeMetrics(
73+
{
74+
name: "cr.store.admission.granter.io_tokens_exhausted_duration.kv",
75+
nonNegativeRate: true,
76+
aggregateMax: true,
77+
},
78+
nodeIDs,
79+
storeIDsByNodeID,
80+
"regular (foreground)",
81+
)}
82+
{storeMetrics(
83+
{
84+
name: "cr.store.admission.granter.elastic_io_tokens_exhausted_duration.kv",
85+
nonNegativeRate: true,
86+
aggregateMax: true,
87+
},
88+
nodeIDs,
89+
storeIDsByNodeID,
90+
"elastic (background)",
91+
)}
9992
</Axis>
10093
</LineGraph>,
10194

@@ -107,16 +100,14 @@ export default function (props: GraphDashboardProps) {
107100
showMetricsInTooltip={true}
108101
>
109102
<Axis label="Score">
110-
{nodeIDs.map(nid => (
111-
<>
112-
<Metric
113-
key={nid}
114-
name="cr.store.admission.io.overload"
115-
title={nodeDisplayName(nodeDisplayNameByID, nid)}
116-
sources={storeIDsForNode(storeIDsByNodeID, nid)}
117-
/>
118-
</>
119-
))}
103+
{storeMetrics(
104+
{
105+
name: "cr.store.admission.io.overload",
106+
aggregateMax: true,
107+
},
108+
nodeIDs,
109+
storeIDsByNodeID,
110+
)}
120111
</Axis>
121112
</LineGraph>,
122113

@@ -178,30 +169,30 @@ export default function (props: GraphDashboardProps) {
178169

179170
<LineGraph
180171
title="Admission Queueing Delay p99 – Store"
181-
sources={nodeSources}
172+
sources={storeSources}
182173
tenantSource={tenantSource}
183174
showMetricsInTooltip={true}
184175
tooltip={`The 99th percentile latency of requests waiting in the Admission Control store queue.`}
185176
>
186177
<Axis units={AxisUnits.Duration} label="Write Delay Duration">
187-
{nodeIDs.map(nid => (
188-
<>
189-
<Metric
190-
key={nid}
191-
name="cr.node.admission.wait_durations.kv-stores-p99"
192-
title={"KV " + nodeDisplayName(nodeDisplayNameByID, nid)}
193-
sources={[nid]}
194-
downsampleMax
195-
/>
196-
<Metric
197-
key={nid}
198-
name="cr.node.admission.wait_durations.elastic-stores-p99"
199-
title={"Elastic " + nodeDisplayName(nodeDisplayNameByID, nid)}
200-
sources={[nid]}
201-
downsampleMax
202-
/>
203-
</>
204-
))}
178+
{storeMetrics(
179+
{
180+
name: "cr.store.admission.wait_durations.kv-stores-p99",
181+
aggregateMax: true,
182+
},
183+
nodeIDs,
184+
storeIDsByNodeID,
185+
"KV",
186+
)}
187+
{storeMetrics(
188+
{
189+
name: "cr.store.admission.wait_durations.elastic-stores-p99",
190+
aggregateMax: true,
191+
},
192+
nodeIDs,
193+
storeIDsByNodeID,
194+
"elastic",
195+
)}
205196
</Axis>
206197
</LineGraph>,
207198

@@ -420,16 +411,14 @@ export default function (props: GraphDashboardProps) {
420411
showMetricsInTooltip={true}
421412
>
422413
<Axis label="Count">
423-
{nodeIDs.map(nid => (
424-
<>
425-
<Metric
426-
key={nid}
427-
name="cr.store.storage.l0-sublevels"
428-
title={nodeDisplayName(nodeDisplayNameByID, nid)}
429-
sources={storeIDsForNode(storeIDsByNodeID, nid)}
430-
/>
431-
</>
432-
))}
414+
{storeMetrics(
415+
{
416+
name: "cr.store.storage.l0-sublevels",
417+
aggregateMax: true,
418+
},
419+
nodeIDs,
420+
storeIDsByNodeID,
421+
)}
433422
</Axis>
434423
</LineGraph>,
435424
];

0 commit comments

Comments
 (0)