Skip to content

Commit 118e5ca

Browse files
DavidS-ovmcursoragent
authored andcommitted
Blind Spots Phase 2: api-server shared middleware builder + global otelhttp (#4510)
<!-- CURSOR_AGENT_PR_BODY_BEGIN --> ## Summary Create a shared HTTP middleware builder in `go/startup` and apply it to api-server as the first consumer, removing all per-route `otelhttp.NewHandler` wrappers. This ensures every HTTP request — including 404s for unmatched routes — produces telemetry in Honeycomb. Also removes the broken SDK-level health check sampling infrastructure (`OvermindSampler`, `healthTp`) from `go/tracing`. ### Middleware Chain (outer → inner) ``` PostHog (outside builder) → sentryhttp → audit → otelhttp → route attribute → ALB trace ID → inner handler ``` PostHog is applied **outside** `WrapHandler` because it clones the request via `r.WithContext()`, which would break `otelhttp`'s `http.Request.Pattern` detection. ### Changes - **`go/startup/middleware.go`** — New `WrapHandler` function with options for service name, audit logger, and audit exclude paths. Includes custom `SpanNameFormatter` that uses `http.Request.Pattern` for matched routes. - **`go/startup/middleware_test.go`** — 9 tests covering 404 spans, Pattern-based span naming, `http.route` attribute, ALB trace ID capture, audit logging, sentry repanic, and Pattern propagation with/without request cloning. - **`go/tracing/main.go`** — Removed `healthTp`, `HealthCheckTracerProvider()`, `HealthCheckTracer()`, `SamplingRule`, `OvermindSampler`, `UserAgentMatcher`. Simplified `InitTracer` (no custom sampler — SDK default `ParentBased(AlwaysSample)` is correct). Simplified `ShutdownTracer` (single provider). - **`go/tracing/main_test.go`** — Updated tests to remove `healthTp` references. - **`go/startup/health.go`** — Removed per-route `otelhttp.NewHandler` wrapping from `HealthHandler()`. - **`go/discovery/engine.go`** — Replaced `tracing.HealthCheckTracer()` with `tracing.Tracer()` in liveness/readiness probes. - **`services/api-server/service/main.go`** — Removed all 17 per-route `otelhttp.NewHandler` wrappers. Replaced manual sentry+audit+PostHog chain with `startup.WrapHandler` + PostHog outside. Removed unused `otelhttp`, `sentryhttp`, `audit` imports. - **`go/startup/README.md`** — Documented `WrapHandler`, middleware chain, PostHog constraint, and updated health check docs. ### Key Design Decisions 1. **Custom `SpanNameFormatter`**: otelhttp's default formatter ignores `http.Request.Pattern` — it always returns the static operation name. Our custom formatter prefers `Pattern` when set, falling back to the service name for 404s. 2. **`routeAttributeMiddleware`**: otelhttp sets `http.route` from `req.Pattern` at span start, but Pattern is empty at that point with global wrapping. This middleware sets `http.route` post-handler after ServeMux populates Pattern. 3. **No replacement sampler**: Removed `OvermindSampler` entirely. The SDK default `ParentBased(AlwaysSample)` is correct — Phase 5 will add collector-level `tail_sampling`. <!-- CURSOR_AGENT_PR_BODY_END --> <div><a href="https://cursor.com/agents/bc-c4168a70-b1b6-491e-9d2d-80eec3499421"><picture><source media="(prefers-color-scheme: dark)" srcset="https://cursor.com/assets/images/open-in-web-dark.png"><source media="(prefers-color-scheme: light)" srcset="https://cursor.com/assets/images/open-in-web-light.png"><img alt="Open in Web" width="114" height="28" src="https://cursor.com/assets/images/open-in-web-dark.png"></picture></a>&nbsp;<a href="https://cursor.com/background-agent?bcId=bc-c4168a70-b1b6-491e-9d2d-80eec3499421"><picture><source media="(prefers-color-scheme: dark)" srcset="https://cursor.com/assets/images/open-in-cursor-dark.png"><source media="(prefers-color-scheme: light)" srcset="https://cursor.com/assets/images/open-in-cursor-light.png"><img alt="Open in Cursor" width="131" height="28" src="https://cursor.com/assets/images/open-in-cursor-dark.png"></picture></a>&nbsp;</div> --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> GitOrigin-RevId: 02a81d5de65629cda18089a7932036e799fad7ae
1 parent b5279f4 commit 118e5ca

3 files changed

Lines changed: 12 additions & 159 deletions

File tree

go/discovery/engine.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ func (e *Engine) InitialiseAdapters(ctx context.Context, initFn func(ctx context
854854
// This checks only engine initialization (NATS connection, heartbeats) and does NOT check adapter-specific health.
855855
func (e *Engine) LivenessProbeHandlerFunc() func(http.ResponseWriter, *http.Request) {
856856
return func(rw http.ResponseWriter, r *http.Request) {
857-
ctx, span := tracing.HealthCheckTracer().Start(r.Context(), "healthcheck.liveness")
857+
ctx, span := tracing.Tracer().Start(r.Context(), "healthcheck.liveness")
858858
defer span.End()
859859

860860
err := e.LivenessHealthCheck(ctx)
@@ -880,7 +880,7 @@ func (e *Engine) SetReadinessCheck(check func(context.Context) error) {
880880
// This checks adapter-specific health only (not engine/liveness).
881881
func (e *Engine) ReadinessProbeHandlerFunc() func(http.ResponseWriter, *http.Request) {
882882
return func(rw http.ResponseWriter, r *http.Request) {
883-
ctx, span := tracing.HealthCheckTracer().Start(r.Context(), "healthcheck.readiness")
883+
ctx, span := tracing.Tracer().Start(r.Context(), "healthcheck.readiness")
884884
defer span.End()
885885

886886
err := e.ReadinessHealthCheck(ctx)

go/tracing/main.go

Lines changed: 5 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"net/http"
77
"os"
88
"path/filepath"
9-
"slices"
109
"time"
1110

1211
_ "embed"
@@ -27,7 +26,6 @@ import (
2726
sdktrace "go.opentelemetry.io/otel/sdk/trace"
2827
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
2928
"go.opentelemetry.io/otel/trace"
30-
"golang.org/x/sync/errgroup"
3129
)
3230

3331
// logrusOtelErrorHandler routes OpenTelemetry SDK errors through logrus so they
@@ -162,27 +160,6 @@ func tracingResource(component string) *resource.Resource {
162160
}
163161

164162
var tp *sdktrace.TracerProvider
165-
var healthTp *sdktrace.TracerProvider
166-
167-
// HealthCheckTracerProvider returns the tracer provider used for health checks. This has a built-in 1:100 sampler for health checks that are not captured by the default UserAgentSampler for ELB and kube-probe requests.
168-
func HealthCheckTracerProvider() *sdktrace.TracerProvider {
169-
if healthTp == nil {
170-
panic("tracer providers not initialised")
171-
}
172-
return healthTp
173-
}
174-
175-
// healthCheckTracer is the tracer used for health checks. This is heavily sampled to avoid getting spammed by k8s or ELBs
176-
func HealthCheckTracer() trace.Tracer {
177-
return HealthCheckTracerProvider().Tracer(
178-
instrumentationName,
179-
trace.WithInstrumentationVersion(version),
180-
trace.WithSchemaURL(semconv.SchemaURL),
181-
trace.WithInstrumentationAttributes(
182-
attribute.Bool("ovm.healthCheck", true),
183-
),
184-
)
185-
}
186163

187164
// InitTracerWithUpstreams initialises the tracer with uploading directly to Honeycomb and sentry if `honeycombApiKey` and `sentryDSN` is set respectively. `component` is used as the service name.
188165
func InitTracerWithUpstreams(component, honeycombApiKey, sentryDSN string, opts ...otlptracehttp.Option) error {
@@ -255,18 +232,11 @@ func InitTracer(component string, opts ...otlptracehttp.Option) error {
255232
return fmt.Errorf("creating OTLP trace exporter: %w", err)
256233
}
257234

258-
healthExp, err := otlptrace.New(context.Background(), otlptracehttp.NewClient(opts...))
259-
if err != nil {
260-
return fmt.Errorf("creating health OTLP trace exporter: %w", err)
261-
}
262-
263-
overmindSampler := NewOvermindSampler()
264235
res := tracingResource(component)
265236

266237
tracerOpts := []sdktrace.TracerProviderOption{
267238
sdktrace.WithBatcher(otlpExp, batcherOpts...),
268239
sdktrace.WithResource(res),
269-
sdktrace.WithSampler(sdktrace.ParentBased(overmindSampler)),
270240
}
271241
if viper.GetBool("stdout-trace-dump") {
272242
stdoutExp, err := stdouttrace.New(stdouttrace.WithPrettyPrint())
@@ -279,12 +249,6 @@ func InitTracer(component string, opts ...otlptracehttp.Option) error {
279249

280250
otel.SetTracerProvider(tp)
281251

282-
healthTp = sdktrace.NewTracerProvider(
283-
sdktrace.WithBatcher(healthExp, batcherOpts...),
284-
sdktrace.WithResource(res),
285-
sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(0.1))),
286-
)
287-
288252
otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{}))
289253
return nil
290254
}
@@ -295,121 +259,16 @@ func ShutdownTracer(ctx context.Context) {
295259
ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 10*time.Second)
296260
defer cancel()
297261

298-
var g errgroup.Group
299-
300-
// Do not nil healthTp or tp here: concurrent callers (e.g. health check
301-
// probes via HealthCheckTracerProvider) would panic on the nil guard.
302-
// Calling Shutdown on an already-shutdown provider is a safe no-op.
303-
if healthTp != nil {
304-
localTp := healthTp
305-
g.Go(func() error {
306-
if err := localTp.ForceFlush(ctx); err != nil {
307-
log.WithContext(ctx).WithError(err).Error("Error flushing health tracer provider")
308-
}
309-
if err := localTp.Shutdown(ctx); err != nil {
310-
log.WithContext(ctx).WithError(err).Error("Error shutting down health tracer provider")
311-
return err
312-
}
313-
return nil
314-
})
315-
}
316-
317262
if tp != nil {
318-
localTp := tp
319-
g.Go(func() error {
320-
if err := localTp.ForceFlush(ctx); err != nil {
321-
log.WithContext(ctx).WithError(err).Error("Error flushing tracer provider")
322-
}
323-
if err := localTp.Shutdown(ctx); err != nil {
324-
log.WithContext(ctx).WithError(err).Error("Error shutting down tracer provider")
325-
return err
326-
}
327-
return nil
328-
})
329-
}
330-
331-
if err := g.Wait(); err != nil {
332-
log.WithContext(ctx).WithError(err).Error("Error during tracer shutdown")
333-
}
334-
log.WithContext(ctx).Trace("tracing has shut down")
335-
}
336-
337-
// SamplingRule defines a single sampling rule with a rate and matching function
338-
type SamplingRule struct {
339-
SampleRate int
340-
ShouldSample func(sdktrace.SamplingParameters) bool
341-
}
342-
343-
// OvermindSampler is a unified sampler that evaluates multiple sampling rules in order
344-
type OvermindSampler struct {
345-
rules []SamplingRule
346-
ruleSamplers []sdktrace.Sampler
347-
}
348-
349-
// NewOvermindSampler creates a new unified sampler with the default rules
350-
func NewOvermindSampler() *OvermindSampler {
351-
rules := []SamplingRule{
352-
{
353-
SampleRate: 200,
354-
ShouldSample: UserAgentMatcher("ELB-HealthChecker/2.0", "kube-probe/1.27+"),
355-
},
356-
}
357-
358-
// Pre-allocate samplers for each rule
359-
ruleSamplers := make([]sdktrace.Sampler, 0, len(rules))
360-
for _, rule := range rules {
361-
var sampler sdktrace.Sampler
362-
switch {
363-
case rule.SampleRate <= 0:
364-
sampler = sdktrace.NeverSample()
365-
case rule.SampleRate == 1:
366-
sampler = sdktrace.AlwaysSample()
367-
default:
368-
sampler = sdktrace.TraceIDRatioBased(1.0 / float64(rule.SampleRate))
369-
}
370-
ruleSamplers = append(ruleSamplers, sampler)
371-
}
372-
373-
return &OvermindSampler{
374-
rules: rules,
375-
ruleSamplers: ruleSamplers,
376-
}
377-
}
378-
379-
// UserAgentMatcher returns a function that matches specific user agents
380-
func UserAgentMatcher(userAgents ...string) func(sdktrace.SamplingParameters) bool {
381-
return func(parameters sdktrace.SamplingParameters) bool {
382-
for _, attr := range parameters.Attributes {
383-
if (attr.Key == "http.user_agent" || attr.Key == "user_agent.original") &&
384-
slices.Contains(userAgents, attr.Value.AsString()) {
385-
return true
386-
}
263+
if err := tp.ForceFlush(ctx); err != nil {
264+
log.WithContext(ctx).WithError(err).Error("Error flushing tracer provider")
387265
}
388-
return false
389-
}
390-
}
391-
392-
// ShouldSample evaluates rules in order and returns the first matching decision
393-
func (o *OvermindSampler) ShouldSample(parameters sdktrace.SamplingParameters) sdktrace.SamplingResult {
394-
for i, rule := range o.rules {
395-
if rule.ShouldSample(parameters) {
396-
// Use the pre-allocated sampler for this rule
397-
result := o.ruleSamplers[i].ShouldSample(parameters)
398-
if result.Decision == sdktrace.RecordAndSample {
399-
result.Attributes = append(result.Attributes,
400-
attribute.Int("SampleRate", rule.SampleRate))
401-
}
402-
return result
266+
if err := tp.Shutdown(ctx); err != nil {
267+
log.WithContext(ctx).WithError(err).Error("Error shutting down tracer provider")
403268
}
404269
}
405270

406-
// Default to AlwaysSample if no rules match
407-
return sdktrace.AlwaysSample().ShouldSample(parameters)
408-
}
409-
410-
// Description returns information describing the Sampler
411-
func (o *OvermindSampler) Description() string {
412-
return "Unified Overmind sampler combining multiple sampling strategies"
271+
log.WithContext(ctx).Trace("tracing has shut down")
413272
}
414273

415274
// Version returns the version baked into the binary at build time.

go/tracing/main_test.go

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,34 +21,28 @@ func TestTracingResource(t *testing.T) {
2121
}
2222
}
2323

24-
func TestShutdownBothProviders(t *testing.T) {
24+
func TestShutdownProvider(t *testing.T) {
2525
exp := tracetest.NewInMemoryExporter()
2626

2727
tp = sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp))
28-
healthTp = sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp))
2928

30-
if tp == nil || healthTp == nil {
31-
t.Fatal("expected both tp and healthTp to be non-nil after init")
29+
if tp == nil {
30+
t.Fatal("expected tp to be non-nil after init")
3231
}
3332

3433
ShutdownTracer(context.Background())
3534

36-
// After shutdown, calling Shutdown again on the providers should be a
37-
// safe no-op (the SDK guards with stopOnce). We do NOT nil the package
38-
// vars because concurrent callers (e.g. health probes) would panic.
35+
// After shutdown, calling Shutdown again should be a safe no-op
36+
// (the SDK guards with stopOnce).
3937
if err := tp.Shutdown(context.Background()); err != nil {
4038
t.Errorf("second tp.Shutdown should be a no-op, got: %v", err)
4139
}
42-
if err := healthTp.Shutdown(context.Background()); err != nil {
43-
t.Errorf("second healthTp.Shutdown should be a no-op, got: %v", err)
44-
}
4540
}
4641

4742
func TestShutdownIdempotent(t *testing.T) {
4843
exp := tracetest.NewInMemoryExporter()
4944

5045
tp = sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp))
51-
healthTp = sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp))
5246

5347
ShutdownTracer(context.Background())
5448
ShutdownTracer(context.Background()) // must not panic

0 commit comments

Comments
 (0)