Skip to content

Commit 40b86cf

Browse files
committed
Add perfinsights middleware which writes the latency of API calls
to a new native histogram bucket Should help users determine top-line API latency
1 parent 228885b commit 40b86cf

17 files changed

+712
-41
lines changed

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ require (
121121
)
122122

123123
require (
124+
github.com/authzed/ctxkey v0.0.0-20250226155515-d49f99185584
124125
github.com/caio/go-tdigest/v4 v4.0.1
125126
github.com/golangci/golangci-lint/v2 v2.1.5
126127
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -1447,6 +1447,8 @@ github.com/authzed/cel-go v0.20.2 h1:GlmLecGry7Z8HU0k+hmaHHUV05ZHrsFxduXHtIePvck
14471447
github.com/authzed/cel-go v0.20.2/go.mod h1:pJHVFWbqUHV1J+klQoZubdKswlbxcsbojda3mye9kiU=
14481448
github.com/authzed/consistent v0.1.0 h1:tlh1wvKoRbjRhMm2P+X5WQQyR54SRoS4MyjLOg17Mp8=
14491449
github.com/authzed/consistent v0.1.0/go.mod h1:plwHlrN/EJUCwQ+Bca0MhM1KnisPs7HEkZI5giCXrcc=
1450+
github.com/authzed/ctxkey v0.0.0-20250226155515-d49f99185584 h1:mP7EpcUL90EKcf/F+gYiU2pz3cGFEuKPHPfe7MntEXk=
1451+
github.com/authzed/ctxkey v0.0.0-20250226155515-d49f99185584/go.mod h1:wnimjr5RPPouIhZQ3ztDBLMUKKuUroj3U9Jy0PxeaEE=
14501452
github.com/authzed/grpcutil v0.0.0-20240123194739-2ea1e3d2d98b h1:wbh8IK+aMLTCey9sZasO7b6BWLAJnHHvb79fvWCXwxw=
14511453
github.com/authzed/grpcutil v0.0.0-20240123194739-2ea1e3d2d98b/go.mod h1:s3qC7V7XIbiNWERv7Lfljy/Lx25/V1Qlexb0WJuA8uQ=
14521454
github.com/aws/aws-sdk-go v1.44.256/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// Package perfinsights defines middleware that reports the latency of API calls to Prometheus.
2+
//
3+
// Unlike the gRPC middleware, the perf insights middleware records API calls by "shape", versus
4+
// aggregating all calls simply by API kind. The shapes allow for more granular determination of
5+
// the latency of API calls, and can be used to determine if there are any specific
6+
// performance issues with specific API calls.
7+
package perfinsights
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
package perfinsights
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strconv"
7+
"time"
8+
9+
"github.com/ccoveille/go-safecast"
10+
"github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors"
11+
"github.com/prometheus/client_golang/prometheus"
12+
"github.com/prometheus/client_golang/prometheus/promauto"
13+
"github.com/rs/zerolog/log"
14+
"go.opentelemetry.io/otel/trace"
15+
"google.golang.org/grpc"
16+
17+
"github.com/authzed/ctxkey"
18+
"github.com/authzed/grpcutil"
19+
20+
"github.com/authzed/spicedb/pkg/spiceerrors"
21+
)
22+
23+
type APIShapeLabel string
24+
25+
const (
26+
ResourceTypeLabel APIShapeLabel = "resource_type"
27+
ResourceRelationLabel APIShapeLabel = "resource_relation"
28+
SubjectTypeLabel APIShapeLabel = "subject_type"
29+
SubjectRelationLabel APIShapeLabel = "subject_relation"
30+
NameLabel APIShapeLabel = "name"
31+
FilterLabel APIShapeLabel = "filter"
32+
)
33+
34+
var allLabels = []string{
35+
string(ResourceTypeLabel),
36+
string(ResourceRelationLabel),
37+
string(SubjectTypeLabel),
38+
string(SubjectRelationLabel),
39+
string(NameLabel),
40+
string(FilterLabel),
41+
}
42+
43+
// APIShapeLabels is a map of APIShapeLabel to any value.
44+
type APIShapeLabels map[APIShapeLabel]any
45+
46+
// NoLabels returns an empty APIShapeLabels map.
47+
func NoLabels() APIShapeLabels {
48+
return APIShapeLabels{}
49+
}
50+
51+
// APIShapeLatency is the metric that SpiceDB uses to keep track of the latency
52+
// of API calls, by shape. The "shape" of an API call is defined as the portions
53+
// of the API call that are not the specific object IDs, e.g. the resource type,
54+
// the relation, the subject type, etc.
55+
//
56+
// NOTE: This metric is a *native* histogram, which means that instead of predefined
57+
// the buckets, it uses a factor to determine the buckets. This is useful for latency-based
58+
// metrics, as the latency can vary widely and having a fixed set of buckets can lead to
59+
// imprecise results.
60+
//
61+
// To use make use of native histograms, a special flag must be set on Prometheus:
62+
// https://prometheus.io/docs/prometheus/latest/feature_flags/#native-histograms
63+
var APIShapeLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{
64+
Namespace: "spicedb",
65+
Subsystem: "perf_insights",
66+
Name: "api_shape_latency_seconds",
67+
Help: "The latency of API calls, by shape",
68+
Buckets: nil,
69+
NativeHistogramBucketFactor: 1.1,
70+
}, append([]string{"api_kind"}, allLabels...))
71+
72+
// ShapeBuilder is a function that returns a slice of strings representing the shape of the API call.
73+
// This is used to report the shape of the API call to Prometheus.
74+
type ShapeBuilder func() APIShapeLabels
75+
76+
// ObserveShapeLatency observes the latency of the API call with the given shape.
77+
func ObserveShapeLatency(ctx context.Context, methodName string, shape APIShapeLabels, duration time.Duration) {
78+
observeShapeLatency(ctx, APIShapeLatency, methodName, shape, duration)
79+
}
80+
81+
func observeShapeLatency(ctx context.Context, metric *prometheus.HistogramVec, methodName string, shape APIShapeLabels, duration time.Duration) {
82+
labels := buildLabels(methodName, shape)
83+
if len(labels) == 0 {
84+
log.Warn().Str("method", methodName).
85+
Msg("ShapeBuilder returned an empty shape, skipping reporting")
86+
return
87+
}
88+
89+
o := metric.WithLabelValues(labels...)
90+
if exemplarObserver, ok := o.(prometheus.ExemplarObserver); ok {
91+
if spanCtx := trace.SpanContextFromContext(ctx); spanCtx.HasTraceID() {
92+
traceID := prometheus.Labels{"traceID": spanCtx.TraceID().String()}
93+
exemplarObserver.ObserveWithExemplar(duration.Seconds(), traceID)
94+
return
95+
}
96+
}
97+
98+
o.Observe(duration.Seconds())
99+
}
100+
101+
func buildLabels(methodName string, shape APIShapeLabels) []string {
102+
labels := make([]string, len(allLabels)+1)
103+
labels[0] = methodName
104+
105+
for index, label := range allLabels {
106+
shapeValue, ok := shape[APIShapeLabel(label)]
107+
if !ok {
108+
continue
109+
}
110+
111+
switch t := shapeValue.(type) {
112+
case string:
113+
labels[index+1] = t
114+
115+
case int:
116+
labels[index+1] = strconv.Itoa(t)
117+
118+
case uint32:
119+
intValue, err := safecast.ToInt(t)
120+
if err != nil {
121+
log.Warn().Str("method", methodName).
122+
Str("key", label).
123+
Str("type", fmt.Sprintf("%T", t)).
124+
Msg("ShapeBuilder could not convert int, skipping reporting")
125+
return nil
126+
}
127+
labels[index+1] = strconv.Itoa(intValue)
128+
129+
case int64:
130+
intValue, err := safecast.ToInt(t)
131+
if err != nil {
132+
log.Warn().Str("method", methodName).
133+
Str("key", label).
134+
Str("type", fmt.Sprintf("%T", t)).
135+
Msg("ShapeBuilder could not convert int, skipping reporting")
136+
return nil
137+
}
138+
labels[index+1] = strconv.Itoa(intValue)
139+
140+
default:
141+
spiceerrors.DebugAssert(func() bool { return false }, "unsupported type %T", t)
142+
log.Warn().Str("method", methodName).
143+
Str("key", label).
144+
Str("type", fmt.Sprintf("%T", t)).
145+
Msg("ShapeBuilder returned an unsupported type, skipping reporting")
146+
return nil
147+
}
148+
}
149+
150+
return labels
151+
}
152+
153+
type reporter struct {
154+
isEnabled bool
155+
}
156+
157+
func (r *reporter) ServerReporter(ctx context.Context, callMeta interceptors.CallMeta) (interceptors.Reporter, context.Context) {
158+
if !r.isEnabled {
159+
return &interceptors.NoopReporter{}, ctx
160+
}
161+
162+
_, methodName := grpcutil.SplitMethodName(callMeta.FullMethod())
163+
ctx = contextWithHandle(ctx)
164+
return &serverReporter{ctx: ctx, methodName: methodName}, ctx
165+
}
166+
167+
type serverReporter struct {
168+
interceptors.NoopReporter
169+
ctx context.Context
170+
methodName string
171+
}
172+
173+
var ctxKey = ctxkey.NewBoxedWithDefault[ShapeBuilder](nil)
174+
175+
func (r *serverReporter) PostCall(err error, duration time.Duration) {
176+
// Do not report if the call resulted in an error.
177+
if err != nil {
178+
return
179+
}
180+
181+
shapeClosure := ctxKey.Value(r.ctx)
182+
spiceerrors.DebugAssertNotNil(shapeClosure, "ShapeBuilder should not be nil")
183+
184+
// If not testing and nil, simply skip.
185+
if shapeClosure == nil {
186+
log.Warn().Str("method", r.methodName).
187+
Msg("ShapeBuilder is nil, skipping reporting")
188+
return
189+
}
190+
191+
ObserveShapeLatency(r.ctx, r.methodName, shapeClosure(), duration)
192+
}
193+
194+
// UnaryServerInterceptor returns a gRPC server-side interceptor that provides reporting for Unary RPCs.
195+
func UnaryServerInterceptor(isEnabled bool) grpc.UnaryServerInterceptor {
196+
return interceptors.UnaryServerInterceptor(&reporter{isEnabled: isEnabled})
197+
}
198+
199+
// StreamServerInterceptor returns a gRPC server-side interceptor that provides reporting for Streaming RPCs.
200+
func StreamServerInterceptor(isEnabled bool) grpc.StreamServerInterceptor {
201+
return interceptors.StreamServerInterceptor(&reporter{isEnabled: isEnabled})
202+
}
203+
204+
// SetInContext sets the ShapeBuilder in the context.
205+
func SetInContext(ctx context.Context, builder ShapeBuilder) {
206+
ctxKey.Set(ctx, builder)
207+
}
208+
209+
// contextWithHandle returns a context with the ShapeBuilder set.
210+
// Should only be called by the server reporter or a test.
211+
func contextWithHandle(ctx context.Context) context.Context {
212+
return ctxKey.SetBox(ctx)
213+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
package perfinsights
2+
3+
import (
4+
"context"
5+
"strings"
6+
"testing"
7+
"time"
8+
9+
"github.com/prometheus/client_golang/prometheus"
10+
io_prometheus_client "github.com/prometheus/client_model/go"
11+
"github.com/stretchr/testify/require"
12+
)
13+
14+
func TestBuildLabels(t *testing.T) {
15+
tcs := []struct {
16+
name string
17+
shape APIShapeLabels
18+
expectedLabels []string
19+
}{
20+
{
21+
name: "empty shape",
22+
shape: APIShapeLabels{},
23+
expectedLabels: []string{"testMethod", "", "", "", "", "", ""},
24+
},
25+
{
26+
name: "full shape",
27+
shape: APIShapeLabels{
28+
ResourceTypeLabel: "resource_type",
29+
ResourceRelationLabel: "resource_relation",
30+
SubjectTypeLabel: "subject_type",
31+
SubjectRelationLabel: "subject_relation",
32+
NameLabel: "name",
33+
FilterLabel: "filter",
34+
},
35+
expectedLabels: []string{"testMethod", "resource_type", "resource_relation", "subject_type", "subject_relation", "name", "filter"},
36+
},
37+
{
38+
name: "full shape with integers",
39+
shape: APIShapeLabels{
40+
ResourceTypeLabel: "resource_type",
41+
ResourceRelationLabel: "resource_relation",
42+
SubjectTypeLabel: "subject_type",
43+
SubjectRelationLabel: int64(40),
44+
NameLabel: uint32(41),
45+
FilterLabel: 42,
46+
},
47+
expectedLabels: []string{"testMethod", "resource_type", "resource_relation", "subject_type", "40", "41", "42"},
48+
},
49+
}
50+
51+
for _, tc := range tcs {
52+
t.Run(tc.name, func(t *testing.T) {
53+
labels := buildLabels("testMethod", tc.shape)
54+
require.Equal(t, tc.expectedLabels, labels)
55+
})
56+
}
57+
}
58+
59+
func TestObserveShapeLatency(t *testing.T) {
60+
reg := prometheus.NewRegistry()
61+
62+
metric := prometheus.NewHistogramVec(prometheus.HistogramOpts{
63+
Namespace: "spicedb",
64+
Subsystem: "perf_insights",
65+
Name: "api_shape_latency_seconds",
66+
Help: "The latency of API calls, by shape",
67+
Buckets: nil,
68+
NativeHistogramBucketFactor: 1.1,
69+
}, append([]string{"api_kind"}, allLabels...))
70+
require.NoError(t, reg.Register(metric))
71+
72+
// Report some data.
73+
observeShapeLatency(context.Background(), metric, "testMethod", APIShapeLabels{
74+
ResourceTypeLabel: "resource_type",
75+
ResourceRelationLabel: "resource_relation",
76+
SubjectTypeLabel: 42,
77+
}, 100*time.Millisecond)
78+
79+
// Ensure it was added to the metric.
80+
metrics, err := reg.Gather()
81+
require.NoError(t, err)
82+
require.NotEmpty(t, metrics)
83+
84+
for _, metric := range metrics {
85+
if !strings.HasSuffix(metric.GetName(), "api_shape_latency_seconds") {
86+
continue
87+
}
88+
89+
require.Equal(t, io_prometheus_client.MetricType_HISTOGRAM, metric.GetType())
90+
91+
require.Equal(t, uint64(1), metric.GetMetric()[0].Histogram.GetSampleCount())
92+
require.Equal(t, float64(0.1), metric.GetMetric()[0].Histogram.GetSampleSum())
93+
require.Equal(t, "testMethod", metric.GetMetric()[0].Label[0].GetValue())
94+
95+
for _, label := range metric.GetMetric()[0].Label {
96+
if label.GetName() == "api_kind" {
97+
require.Equal(t, "testMethod", label.GetValue())
98+
continue
99+
}
100+
101+
if label.GetName() == "resource_type" {
102+
require.Equal(t, "resource_type", label.GetValue())
103+
continue
104+
}
105+
106+
if label.GetName() == "resource_relation" {
107+
require.Equal(t, "resource_relation", label.GetValue())
108+
continue
109+
}
110+
111+
if label.GetName() == "subject_type" {
112+
require.Equal(t, "42", label.GetValue())
113+
continue
114+
}
115+
116+
require.Empty(t, label.GetValue())
117+
}
118+
}
119+
}

internal/services/server.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,13 @@ func RegisterGrpcServices(
6969
}
7070

7171
if schemaServiceOption == V1SchemaServiceEnabled || schemaServiceOption == V1SchemaServiceAdditiveOnly {
72-
v1.RegisterSchemaServiceServer(srv, v1svc.NewSchemaServer(permSysConfig.CaveatTypeSet, schemaServiceOption == V1SchemaServiceAdditiveOnly, permSysConfig.ExpiringRelationshipsEnabled))
72+
schemaConfig := v1svc.SchemaServerConfig{
73+
CaveatTypeSet: permSysConfig.CaveatTypeSet,
74+
AdditiveOnly: schemaServiceOption == V1SchemaServiceAdditiveOnly,
75+
ExpiringRelsEnabled: permSysConfig.ExpiringRelationshipsEnabled,
76+
PerformanceInsightMetricsEnabled: permSysConfig.PerformanceInsightMetricsEnabled,
77+
}
78+
v1.RegisterSchemaServiceServer(srv, v1svc.NewSchemaServer(schemaConfig))
7379
healthManager.RegisterReportedService(v1.SchemaService_ServiceDesc.ServiceName)
7480
}
7581

0 commit comments

Comments
 (0)