Skip to content

Commit 255658f

Browse files
feat(metrics): cache write cost accounting in /metrics (#57)
Closes #52 Add 9 Prometheus metrics for Anthropic prompt cache token usage and cache boundary state. RecordCacheUsage accepts the Anthropic usage block directly; RecordCacheBoundary is called by the session boundary manager after each evaluation. Co-authored-by: Ona <no-reply@ona.com>
1 parent 4d44256 commit 255658f

3 files changed

Lines changed: 286 additions & 0 deletions

File tree

README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,8 @@ Distill exposes a Prometheus-compatible `/metrics` endpoint on both `api` and `s
560560

561561
### Metrics
562562

563+
**Pipeline metrics**
564+
563565
| Metric | Type | Description |
564566
|--------|------|-------------|
565567
| `distill_requests_total` | Counter | Total requests by endpoint and status code |
@@ -569,6 +571,27 @@ Distill exposes a Prometheus-compatible `/metrics` endpoint on both `api` and `s
569571
| `distill_active_requests` | Gauge | Currently processing requests |
570572
| `distill_clusters_formed_total` | Counter | Clusters formed during deduplication |
571573

574+
**Cache cost metrics**
575+
576+
Record Anthropic API usage with `metrics.RecordCacheUsage(UsageRecord{...})` after each API call to track prompt cache efficiency:
577+
578+
| Metric | Type | Description |
579+
|--------|------|-------------|
580+
| `distill_cache_creation_tokens_total` | Counter | Tokens written to Anthropic cache (charged at 1.25× input price) |
581+
| `distill_cache_read_tokens_total` | Counter | Tokens read from Anthropic cache (charged at 0.10× input price) |
582+
| `distill_uncached_input_tokens_total` | Counter | Uncached input tokens (charged at 1.00×) |
583+
| `distill_cache_hit_rate` | Gauge | Rolling hit rate: `cache_read / (cache_read + cache_creation + input)` |
584+
| `distill_cache_write_efficiency` | Gauge | Reads/writes ratio — values < 1.0 mean cache writes that expire before being read |
585+
586+
**Cache boundary metrics** (populated by the session boundary manager)
587+
588+
| Metric | Type | Description |
589+
|--------|------|-------------|
590+
| `distill_cache_boundary_position_tokens` | Gauge | Current boundary position in tokens per session |
591+
| `distill_cache_boundary_advances_total` | Counter | Times the boundary moved forward (more content became stable) |
592+
| `distill_cache_boundary_retreats_total` | Counter | Times the boundary retreated (content changed or was evicted) |
593+
| `distill_cache_estimated_savings_tokens_total` | Counter | Estimated tokens saved by prompt caching |
594+
572595
### Prometheus Scrape Config
573596

574597
```yaml
@@ -763,6 +786,7 @@ Distill is evolving from a dedup utility into a context intelligence layer. Here
763786
| **Session Management** | [#31](https://github.com/Siddhant-K-code/distill/issues/31) | Shipped | Stateful context windows with token budgets, hierarchical compression, and importance-based eviction. See [Session Management](#session-management). |
764787
| **PatternDetector cache_control annotations** | [#53](https://github.com/Siddhant-K-code/distill/issues/53) | Shipped | `PatternDetector` emits `CacheAnnotation` per chunk and `AnnotateChunksForCache` produces a `CacheControlPlan` with up to 4 Anthropic-compatible markers. |
765788
| **Session-aware cache boundary manager** | [#51](https://github.com/Siddhant-K-code/distill/issues/51) | Shipped | Auto-advances `cache_control` placement as sessions grow. Stable entries (present ≥ 2 turns unmodified) are included in the cached prefix; boundary retreats when content changes. |
789+
| **Cache write cost accounting** | [#52](https://github.com/Siddhant-K-code/distill/issues/52) | Shipped | 9 new Prometheus metrics covering Anthropic prompt cache token usage, hit rate, write efficiency, and boundary position. Feed API response usage via `RecordCacheUsage`. |
766790
767791
### Code Intelligence
768792

pkg/metrics/metrics.go

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ type Metrics struct {
2020
ActiveRequests prometheus.Gauge
2121
ClustersFormed *prometheus.CounterVec
2222

23+
// Cache cost accounting (issue #52).
24+
// These track Anthropic API usage fields returned in every response.
25+
CacheCreationTokens *prometheus.CounterVec
26+
CacheReadTokens *prometheus.CounterVec
27+
UncachedInputTokens *prometheus.CounterVec
28+
CacheHitRate prometheus.Gauge
29+
CacheWriteEfficiency prometheus.Gauge
30+
31+
// Cache boundary metrics (issue #51).
32+
CacheBoundaryPosition *prometheus.GaugeVec
33+
CacheBoundaryAdvances *prometheus.CounterVec
34+
CacheBoundaryRetreats *prometheus.CounterVec
35+
CacheEstimatedSavings *prometheus.CounterVec
36+
2337
registry *prometheus.Registry
2438
}
2539

@@ -75,6 +89,72 @@ func New() *Metrics {
7589
},
7690
[]string{"endpoint"},
7791
),
92+
93+
// Cache cost accounting.
94+
CacheCreationTokens: prometheus.NewCounterVec(
95+
prometheus.CounterOpts{
96+
Name: "distill_cache_creation_tokens_total",
97+
Help: "Tokens written to Anthropic prompt cache (charged at 1.25x input price).",
98+
},
99+
[]string{"session_id"},
100+
),
101+
CacheReadTokens: prometheus.NewCounterVec(
102+
prometheus.CounterOpts{
103+
Name: "distill_cache_read_tokens_total",
104+
Help: "Tokens read from Anthropic prompt cache (charged at 0.10x input price).",
105+
},
106+
[]string{"session_id"},
107+
),
108+
UncachedInputTokens: prometheus.NewCounterVec(
109+
prometheus.CounterOpts{
110+
Name: "distill_uncached_input_tokens_total",
111+
Help: "Input tokens not served from cache (charged at 1.00x input price).",
112+
},
113+
[]string{"session_id"},
114+
),
115+
CacheHitRate: prometheus.NewGauge(
116+
prometheus.GaugeOpts{
117+
Name: "distill_cache_hit_rate",
118+
Help: "Rolling cache hit rate: cache_read / (cache_read + cache_creation + input).",
119+
},
120+
),
121+
CacheWriteEfficiency: prometheus.NewGauge(
122+
prometheus.GaugeOpts{
123+
Name: "distill_cache_write_efficiency",
124+
Help: "Cache read/write ratio. Values < 1.0 indicate writes that expire before being read.",
125+
},
126+
),
127+
128+
// Cache boundary metrics.
129+
CacheBoundaryPosition: prometheus.NewGaugeVec(
130+
prometheus.GaugeOpts{
131+
Name: "distill_cache_boundary_position_tokens",
132+
Help: "Current cache boundary position in tokens for a session.",
133+
},
134+
[]string{"session_id"},
135+
),
136+
CacheBoundaryAdvances: prometheus.NewCounterVec(
137+
prometheus.CounterOpts{
138+
Name: "distill_cache_boundary_advances_total",
139+
Help: "Number of times the cache boundary advanced (more content became stable).",
140+
},
141+
[]string{"session_id"},
142+
),
143+
CacheBoundaryRetreats: prometheus.NewCounterVec(
144+
prometheus.CounterOpts{
145+
Name: "distill_cache_boundary_retreats_total",
146+
Help: "Number of times the cache boundary retreated (content changed or was evicted).",
147+
},
148+
[]string{"session_id"},
149+
),
150+
CacheEstimatedSavings: prometheus.NewCounterVec(
151+
prometheus.CounterOpts{
152+
Name: "distill_cache_estimated_savings_tokens_total",
153+
Help: "Estimated tokens saved by prompt caching across all sessions.",
154+
},
155+
[]string{"session_id"},
156+
),
157+
78158
registry: reg,
79159
}
80160

@@ -85,6 +165,15 @@ func New() *Metrics {
85165
m.ReductionRatio,
86166
m.ActiveRequests,
87167
m.ClustersFormed,
168+
m.CacheCreationTokens,
169+
m.CacheReadTokens,
170+
m.UncachedInputTokens,
171+
m.CacheHitRate,
172+
m.CacheWriteEfficiency,
173+
m.CacheBoundaryPosition,
174+
m.CacheBoundaryAdvances,
175+
m.CacheBoundaryRetreats,
176+
m.CacheEstimatedSavings,
88177
)
89178

90179
return m
@@ -114,6 +203,64 @@ func (m *Metrics) RecordDedup(endpoint string, inputCount, outputCount, clusterC
114203
}
115204
}
116205

206+
// UsageRecord holds the token counts returned by the Anthropic API in the
207+
// usage block of every response. Pass this to RecordCacheUsage after each
208+
// API call to keep the cache cost metrics up to date.
209+
type UsageRecord struct {
210+
// SessionID is optional; use "" for non-session requests.
211+
SessionID string
212+
213+
InputTokens int
214+
CacheCreationInputTokens int
215+
CacheReadInputTokens int
216+
OutputTokens int
217+
}
218+
219+
// RecordCacheUsage records Anthropic API usage fields and updates the derived
220+
// cache hit rate and write efficiency gauges.
221+
func (m *Metrics) RecordCacheUsage(u UsageRecord) {
222+
sid := u.SessionID
223+
if sid == "" {
224+
sid = "default"
225+
}
226+
227+
if u.CacheCreationInputTokens > 0 {
228+
m.CacheCreationTokens.WithLabelValues(sid).Add(float64(u.CacheCreationInputTokens))
229+
}
230+
if u.CacheReadInputTokens > 0 {
231+
m.CacheReadTokens.WithLabelValues(sid).Add(float64(u.CacheReadInputTokens))
232+
}
233+
if u.InputTokens > 0 {
234+
m.UncachedInputTokens.WithLabelValues(sid).Add(float64(u.InputTokens))
235+
}
236+
237+
// Update derived gauges using the values from this single request.
238+
total := float64(u.InputTokens + u.CacheCreationInputTokens + u.CacheReadInputTokens)
239+
if total > 0 {
240+
hitRate := float64(u.CacheReadInputTokens) / total
241+
m.CacheHitRate.Set(hitRate)
242+
}
243+
244+
if u.CacheCreationInputTokens > 0 {
245+
efficiency := float64(u.CacheReadInputTokens) / float64(u.CacheCreationInputTokens)
246+
m.CacheWriteEfficiency.Set(efficiency)
247+
}
248+
}
249+
250+
// RecordCacheBoundary records a cache boundary evaluation result for a session.
251+
func (m *Metrics) RecordCacheBoundary(sessionID string, boundaryTokens int, advanced, retreated bool) {
252+
if sessionID == "" {
253+
sessionID = "default"
254+
}
255+
m.CacheBoundaryPosition.WithLabelValues(sessionID).Set(float64(boundaryTokens))
256+
if advanced {
257+
m.CacheBoundaryAdvances.WithLabelValues(sessionID).Inc()
258+
}
259+
if retreated {
260+
m.CacheBoundaryRetreats.WithLabelValues(sessionID).Inc()
261+
}
262+
}
263+
117264
// Middleware returns an HTTP middleware that instruments requests.
118265
func (m *Metrics) Middleware(endpoint string, next http.HandlerFunc) http.HandlerFunc {
119266
return func(w http.ResponseWriter, r *http.Request) {

pkg/metrics/metrics_test.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,121 @@ func TestActiveRequests(t *testing.T) {
159159
close(release)
160160
}
161161

162+
func TestRecordCacheUsage(t *testing.T) {
163+
m := New()
164+
165+
m.RecordCacheUsage(UsageRecord{
166+
SessionID: "sess-1",
167+
InputTokens: 100,
168+
CacheCreationInputTokens: 8000,
169+
CacheReadInputTokens: 0,
170+
OutputTokens: 200,
171+
})
172+
173+
creationVal := counterValue(t, m.CacheCreationTokens, "session_id", "sess-1")
174+
if creationVal != 8000 {
175+
t.Errorf("expected 8000 cache creation tokens, got %f", creationVal)
176+
}
177+
178+
uncachedVal := counterValue(t, m.UncachedInputTokens, "session_id", "sess-1")
179+
if uncachedVal != 100 {
180+
t.Errorf("expected 100 uncached input tokens, got %f", uncachedVal)
181+
}
182+
183+
// Second call: now we have cache reads.
184+
m.RecordCacheUsage(UsageRecord{
185+
SessionID: "sess-1",
186+
InputTokens: 0,
187+
CacheCreationInputTokens: 0,
188+
CacheReadInputTokens: 8000,
189+
OutputTokens: 200,
190+
})
191+
192+
readVal := counterValue(t, m.CacheReadTokens, "session_id", "sess-1")
193+
if readVal != 8000 {
194+
t.Errorf("expected 8000 cache read tokens, got %f", readVal)
195+
}
196+
197+
// Hit rate should be 1.0 (all tokens from cache on second call).
198+
var hitRateMetric dto.Metric
199+
if err := m.CacheHitRate.Write(&hitRateMetric); err != nil {
200+
t.Fatalf("read CacheHitRate: %v", err)
201+
}
202+
if hitRateMetric.GetGauge().GetValue() != 1.0 {
203+
t.Errorf("expected hit rate 1.0, got %f", hitRateMetric.GetGauge().GetValue())
204+
}
205+
}
206+
207+
func TestRecordCacheUsage_DefaultSessionID(t *testing.T) {
208+
m := New()
209+
// Should not panic with empty session ID.
210+
m.RecordCacheUsage(UsageRecord{
211+
InputTokens: 50,
212+
CacheCreationInputTokens: 1000,
213+
})
214+
val := counterValue(t, m.CacheCreationTokens, "session_id", "default")
215+
if val != 1000 {
216+
t.Errorf("expected 1000, got %f", val)
217+
}
218+
}
219+
220+
func TestRecordCacheBoundary(t *testing.T) {
221+
m := New()
222+
223+
m.RecordCacheBoundary("sess-1", 8192, true, false)
224+
m.RecordCacheBoundary("sess-1", 16384, true, false)
225+
m.RecordCacheBoundary("sess-1", 8192, false, true)
226+
227+
advVal := counterValue(t, m.CacheBoundaryAdvances, "session_id", "sess-1")
228+
if advVal != 2 {
229+
t.Errorf("expected 2 advances, got %f", advVal)
230+
}
231+
232+
retVal := counterValue(t, m.CacheBoundaryRetreats, "session_id", "sess-1")
233+
if retVal != 1 {
234+
t.Errorf("expected 1 retreat, got %f", retVal)
235+
}
236+
237+
var posMetric dto.Metric
238+
pos, err := m.CacheBoundaryPosition.GetMetricWithLabelValues("sess-1")
239+
if err != nil {
240+
t.Fatalf("get boundary position: %v", err)
241+
}
242+
if err := pos.Write(&posMetric); err != nil {
243+
t.Fatalf("read boundary position: %v", err)
244+
}
245+
if posMetric.GetGauge().GetValue() != 8192 {
246+
t.Errorf("expected boundary 8192, got %f", posMetric.GetGauge().GetValue())
247+
}
248+
}
249+
250+
func TestHandler_CacheMetrics(t *testing.T) {
251+
m := New()
252+
m.RecordCacheUsage(UsageRecord{
253+
SessionID: "sess-1",
254+
CacheCreationInputTokens: 4096,
255+
CacheReadInputTokens: 4096,
256+
})
257+
m.RecordCacheBoundary("sess-1", 8192, true, false)
258+
259+
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
260+
rec := httptest.NewRecorder()
261+
m.Handler().ServeHTTP(rec, req)
262+
263+
body := rec.Body.String()
264+
for _, metric := range []string{
265+
"distill_cache_creation_tokens_total",
266+
"distill_cache_read_tokens_total",
267+
"distill_cache_hit_rate",
268+
"distill_cache_write_efficiency",
269+
"distill_cache_boundary_position_tokens",
270+
} {
271+
if !strings.Contains(body, metric) {
272+
t.Errorf("metrics output missing %s", metric)
273+
}
274+
}
275+
}
276+
162277
// counterValue extracts the value of a counter with the given label pairs.
163278
func counterValue(t *testing.T, cv *prometheus.CounterVec, labelPairs ...string) float64 {
164279
t.Helper()

0 commit comments

Comments
 (0)