Skip to content

Commit b3427d0

Browse files
willh-db, yuchen-db
authored and committed
Add metrics for tenant attribution and write requests (#300)
1 parent 0ade67f commit b3427d0

File tree

1 file changed

+44
-1
lines changed

1 file changed

+44
-1
lines changed

pkg/receive/handler.go

Lines changed: 44 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -213,6 +213,10 @@ type Handler struct {
213213
writeTimeseriesError *prometheus.HistogramVec
214214
writeE2eLatency *prometheus.HistogramVec
215215

216+
writeRequestsTotal *prometheus.CounterVec
217+
writeRejectedTotal *prometheus.CounterVec
218+
tenantAttributedTotal *prometheus.CounterVec
219+
216220
Limiter *Limiter
217221
}
218222

@@ -318,6 +322,30 @@ func NewHandler(logger log.Logger, o *Options) *Handler {
318322
Buckets: []float64{1, 5, 10, 20, 30, 40, 50, 60, 90, 120, 300, 600, 900, 1200, 1800, 3600},
319323
}, []string{"code", "tenant", "rollup"},
320324
),
325+
writeRequestsTotal: promauto.With(registerer).NewCounterVec(
326+
prometheus.CounterOpts{
327+
Namespace: "thanos",
328+
Subsystem: "receive",
329+
Name: "write_requests_total",
330+
Help: "The total number of write requests by tenant and response code.",
331+
}, []string{"code", "tenant"},
332+
),
333+
writeRejectedTotal: promauto.With(registerer).NewCounterVec(
334+
prometheus.CounterOpts{
335+
Namespace: "thanos",
336+
Subsystem: "receive",
337+
Name: "write_rejected_total",
338+
Help: "The total number of write requests rejected by reason and tenant.",
339+
}, []string{"reason", "tenant"},
340+
),
341+
tenantAttributedTotal: promauto.With(registerer).NewCounterVec(
342+
prometheus.CounterOpts{
343+
Namespace: "thanos",
344+
Subsystem: "receive",
345+
Name: "tenant_attributed_total",
346+
Help: "The total number of time series attributed to each tenant by source.",
347+
}, []string{"tenant", "source"},
348+
),
321349
}
322350

323351
h.forwardRequests.WithLabelValues(labelSuccess)
@@ -546,17 +574,22 @@ func (h *Handler) tenantKeyForDistribution(tenantHTTP string, ts prompb.TimeSeri
546574
lbls := labelpb.ZLabelsToPromLabels(ts.Labels)
547575
attributedTenant := h.options.TenantAttributor.GetTenantFromLabels(lbls)
548576
h.options.TenantAttributor.RecordVerification(attributedTenant, tenantHTTP)
577+
// Track what tenant would be attributed (for monitoring attribution rules)
578+
h.tenantAttributedTotal.WithLabelValues(attributedTenant, "label_rules").Inc()
549579
return tenantHTTP
550580
}
551581

552582
// Non-verify mode: if HTTP header was provided (tenant != default), use it.
553583
if tenantHTTP != h.options.DefaultTenantID {
584+
h.tenantAttributedTotal.WithLabelValues(tenantHTTP, "http_header").Inc()
554585
return tenantHTTP
555586
}
556587

557588
// No HTTP header provided: do attribution from labels.
558589
lbls := labelpb.ZLabelsToPromLabels(ts.Labels)
559-
return h.options.TenantAttributor.GetTenantFromLabels(lbls)
590+
attributedTenant := h.options.TenantAttributor.GetTenantFromLabels(lbls)
591+
h.tenantAttributedTotal.WithLabelValues(attributedTenant, "label_rules").Inc()
592+
return attributedTenant
560593
}
561594

562595
// Legacy behavior: use splitTenantLabelName if configured.
@@ -674,6 +707,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
674707
defer writeGate.Done()
675708
if err != nil {
676709
level.Error(tLogger).Log("err", err, "msg", "internal server error")
710+
h.writeRejectedTotal.WithLabelValues("write_gate", tenantHTTP).Inc()
677711
http.Error(w, err.Error(), http.StatusInternalServerError)
678712
return
679713
}
@@ -685,6 +719,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
685719

686720
// Fail request fully if tenant has exceeded set limit.
687721
if !under {
722+
h.writeRejectedTotal.WithLabelValues("head_series_limit", tenantHTTP).Inc()
688723
http.Error(w, "tenant is above active series limit", http.StatusTooManyRequests)
689724
return
690725
}
@@ -703,6 +738,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
703738

704739
if r.ContentLength >= 0 {
705740
if !requestLimiter.AllowSizeBytes(tenantHTTP, r.ContentLength) {
741+
h.writeRejectedTotal.WithLabelValues("request_size", tenantHTTP).Inc()
706742
http.Error(w, errRequestTooLarge.Error(), http.StatusRequestEntityTooLarge)
707743
return
708744
}
@@ -722,6 +758,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
722758
_, err = io.CopyBuffer(lw, r.Body, *copyBuf)
723759
if err != nil {
724760
if err == errRequestTooLarge {
761+
h.writeRejectedTotal.WithLabelValues("request_size", tenantHTTP).Inc()
725762
http.Error(w, errRequestTooLarge.Error(), http.StatusRequestEntityTooLarge)
726763
return
727764
}
@@ -745,6 +782,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
745782
}
746783

747784
if !requestLimiter.AllowSizeBytes(tenantHTTP, int64(len(*reqBuf))) {
785+
h.writeRejectedTotal.WithLabelValues("request_size", tenantHTTP).Inc()
748786
http.Error(w, errRequestTooLarge.Error(), http.StatusRequestEntityTooLarge)
749787
return
750788
}
@@ -786,6 +824,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
786824
}
787825

788826
if !requestLimiter.AllowSeries(tenantHTTP, int64(len(wreq.Timeseries))) {
827+
h.writeRejectedTotal.WithLabelValues("series_limit", tenantHTTP).Inc()
789828
http.Error(w, "too many timeseries", http.StatusRequestEntityTooLarge)
790829
return
791830
}
@@ -795,6 +834,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
795834
totalSamples += len(timeseries.Samples)
796835
}
797836
if !requestLimiter.AllowSamples(tenantHTTP, int64(totalSamples)) {
837+
h.writeRejectedTotal.WithLabelValues("samples_limit", tenantHTTP).Inc()
798838
http.Error(w, "too many samples", http.StatusRequestEntityTooLarge)
799839
return
800840
}
@@ -837,6 +877,9 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
837877
http.Error(w, err.Error(), responseStatusCode)
838878
}
839879

880+
// Track write requests by tenant and response code
881+
h.writeRequestsTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenantHTTP).Inc()
882+
840883
for tenant, stats := range tenantStats {
841884
h.writeTimeseriesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(stats.timeseries))
842885
h.writeSamplesTotal.WithLabelValues(strconv.Itoa(responseStatusCode), tenant).Observe(float64(stats.totalSamples))

0 commit comments

Comments
 (0)