Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions e2e/pkg/testmatrix/testcases.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ var BaselineRouterContract = []string{
"decision-fallback-behavior",
"plugin-config-variations",
"chat-completions-progressive-stress",
// Session observability
"session-telemetry-metrics",
"session-pricing-chat-completions",
"session-pricing-response-api",
}

// DashboardContract is the canonical E2E contract for the dashboard API surface.
Expand Down
158 changes: 158 additions & 0 deletions e2e/testcases/session_pricing_e2e.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package testcases

import (
"context"
"fmt"
"net/http"
"strings"
"time"

"github.com/vllm-project/semantic-router/e2e/pkg/fixtures"
pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
"k8s.io/client-go/kubernetes"
)

func init() {
pkgtestcases.Register("session-pricing-chat-completions", pkgtestcases.TestCase{
Description: "After a routed chat completion, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing"},
Fn: testSessionPricingChatCompletions,
})
pkgtestcases.Register("session-pricing-response-api", pkgtestcases.TestCase{
Description: "After a routed Response API call, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing", "response-api"},
Fn: testSessionPricingResponseAPI,
})
Comment on lines +15 to +25
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These new test cases are registered, but they are not referenced by any testmatrix group or profile GetTestCases() list (search shows the names only appear in this file). As a result, they likely won’t run in CI unless manually selected. If these are meant to provide durable E2E coverage for session cost telemetry, add them to an appropriate testmatrix group/profile so the harness executes them by default.

Copilot generated this review using guidance from repository custom instructions.
}

// testSessionPricingChatCompletions verifies that after a Chat Completions request the
// llm_session_turn_cost histogram is present in /metrics (pricing must be configured
// for the routed model in router-config.yaml for the observation to appear).
func testSessionPricingChatCompletions(
	ctx context.Context,
	client *kubernetes.Clientset,
	opts pkgtestcases.TestCaseOptions,
) error {
	traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer traffic.Close()

	metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer metricsSession.Close()

	chatClient := fixtures.NewChatCompletionsClient(traffic, 60*time.Second)

	// Route one chat completion through the router so a session turn is recorded.
	resp, err := chatClient.Create(ctx, fixtures.ChatCompletionsRequest{
		Model: "MoM",
		Messages: []fixtures.ChatMessage{
			{Role: "user", Content: "Say hello in one short sentence for pricing telemetry."},
		},
		User: "e2e-pricing-chat-user",
	}, map[string]string{"x-authz-user-id": "e2e-pricing-chat-user"})
	if err != nil {
		return err
	}
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body))
	}

	body, err := fetchMetrics(ctx, metricsSession)
	if err != nil {
		return err
	}

	// The token histograms from PR 1 and the cost histogram descriptor must all be
	// present; descriptors are exposed even when no observation has been recorded.
	for _, metric := range []string{
		"llm_session_turn_prompt_tokens",
		"llm_session_turn_completion_tokens",
		"llm_session_turn_cost",
	} {
		if !strings.Contains(body, metric) {
			return fmt.Errorf("metrics body missing %s", metric)
		}
	}

	if opts.SetDetails != nil {
		opts.SetDetails(map[string]interface{}{
			"chat_status": resp.StatusCode,
		})
	}
	return nil
}

// testSessionPricingResponseAPI verifies that after a Response API request the
// llm_session_turn_cost histogram descriptor is exposed in /metrics.
func testSessionPricingResponseAPI(
	ctx context.Context,
	client *kubernetes.Clientset,
	opts pkgtestcases.TestCaseOptions,
) error {
	traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer traffic.Close()

	metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer metricsSession.Close()

	apiClient := fixtures.NewResponseAPIClient(traffic, 60*time.Second)

	// Route one Response API call through the router so a session turn is recorded.
	_, raw, err := apiClient.Create(ctx, fixtures.ResponseAPIRequest{
		Model: "MoM",
		Input: "Say hello in one short sentence for Response API pricing telemetry.",
	})
	if err != nil {
		return fmt.Errorf("response api create: %w", err)
	}
	if raw.StatusCode != http.StatusOK {
		return fmt.Errorf("response api: expected 200, got %d: %s", raw.StatusCode, string(raw.Body))
	}

	body, err := fetchMetrics(ctx, metricsSession)
	if err != nil {
		return err
	}

	// All three session histograms must appear; descriptors are registered up
	// front, so they are visible even before any observation is made.
	for _, metric := range []string{
		"llm_session_turn_cost",
		"llm_session_turn_prompt_tokens",
		"llm_session_turn_completion_tokens",
	} {
		if !strings.Contains(body, metric) {
			return fmt.Errorf("metrics body missing %s after Response API request", metric)
		}
	}

	if opts.SetDetails != nil {
		opts.SetDetails(map[string]interface{}{
			"response_api_status": raw.StatusCode,
		})
	}
	return nil
}

// fetchMetrics retrieves the Prometheus /metrics text from the router metrics port.
func fetchMetrics(ctx context.Context, metricsSession *fixtures.ServiceSession) (string, error) {
	httpClient := metricsSession.HTTPClient(15 * time.Second)
	resp, err := fixtures.DoGETRequest(ctx, httpClient, metricsSession.URL("/metrics"))
	if err != nil {
		return "", fmt.Errorf("fetch /metrics: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("/metrics: expected 200, got %d", resp.StatusCode)
	}
	return string(resp.Body), nil
}
7 changes: 4 additions & 3 deletions src/semantic-router/pkg/config/model_config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,10 @@ type ProviderProfile struct {
}

// ModelPricing describes per-model token pricing used for session cost telemetry.
// All rates are expressed per one million tokens; the zero value means the model
// has no pricing configured.
type ModelPricing struct {
	// Currency is the pricing currency code (e.g. "USD"); consumers default it
	// to USD when rates are set but the currency is omitted.
	Currency string `yaml:"currency,omitempty"`
	// PromptPer1M is the cost per one million prompt (input) tokens.
	PromptPer1M float64 `yaml:"prompt_per_1m,omitempty"`
	// CompletionPer1M is the cost per one million completion (output) tokens.
	CompletionPer1M float64 `yaml:"completion_per_1m,omitempty"`
	// CachedInputPer1M is the cost per one million cached input tokens.
	CachedInputPer1M float64 `yaml:"cached_input_per_1m,omitempty"`
}

type ModelParams struct {
Expand Down
19 changes: 19 additions & 0 deletions src/semantic-router/pkg/config/pricing_helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package config

// GetFullModelPricing returns the complete ModelPricing entry for the given model,
// including CachedInputPer1M. Returns (p, true) when at least one rate is non-zero
// or Currency is explicitly set (currency-only counts as configured so that free/
// self-hosted models produce cost=0 telemetry). Returns (zero, false) when the model
// has no pricing entry at all. Accepts both short names and provider model IDs.
func (c *RouterConfig) GetFullModelPricing(modelName string) (ModelPricing, bool) {
if modelConfig, ok := c.resolveModelConfig(modelName); ok {
p := modelConfig.Pricing
if p.PromptPer1M != 0 || p.CompletionPer1M != 0 || p.CachedInputPer1M != 0 || p.Currency != "" {
if p.Currency == "" {
p.Currency = "USD"
}
return p, true
}
Comment on lines +3 to +16
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function comment says the second return value is false when "no pricing is configured", but the implementation treats currency alone (with all per-1M rates at 0) as configured and returns (p, true). Please clarify/align the definition of "configured" here (e.g., require at least one non-zero rate, or update the comment to state that currency-only counts as configured).

Copilot uses AI. Check for mistakes.
}
return ModelPricing{}, false
}
9 changes: 5 additions & 4 deletions src/semantic-router/pkg/extproc/processor_res_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability/metrics"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/ratelimit"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/routerreplay"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/sessiontelemetry"
)

type responseUsageMetrics struct {
Expand Down Expand Up @@ -60,7 +61,7 @@ func (r *OpenAIRouter) reportNonStreamingUsage(
}

if totalTokens > 0 {
recordSessionTurn(ctx, usage)
recordSessionTurn(ctx, usage, r.sessionTurnPricing(ctx.RequestModel))
}

if ctx.RequestModel == "" {
Expand Down Expand Up @@ -159,14 +160,14 @@ func extractStreamingUsage(ctx *RequestContext) openai.CompletionUsage {
return usage
}

func recordSessionTurnFromStreamingUsage(ctx *RequestContext, usage openai.CompletionUsage) {
func recordSessionTurnFromStreamingUsage(ctx *RequestContext, usage openai.CompletionUsage, pricing sessiontelemetry.TurnPricing) {
if usage.PromptTokens <= 0 && usage.CompletionTokens <= 0 {
return
}
recordSessionTurn(ctx, responseUsageMetrics{
promptTokens: int(usage.PromptTokens),
completionTokens: int(usage.CompletionTokens),
})
}, pricing)
}

func (r *OpenAIRouter) reportStreamingUsageMetrics(
Expand All @@ -181,7 +182,7 @@ func (r *OpenAIRouter) reportStreamingUsageMetrics(
})
}

recordSessionTurnFromStreamingUsage(ctx, usage)
recordSessionTurnFromStreamingUsage(ctx, usage, r.sessionTurnPricing(ctx.RequestModel))

if ctx.RequestModel == "" || (usage.PromptTokens == 0 && usage.CompletionTokens == 0) {
return
Expand Down
21 changes: 20 additions & 1 deletion src/semantic-router/pkg/extproc/session_telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,25 @@ import (
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/sessiontelemetry"
)

func recordSessionTurn(ctx *RequestContext, usage responseUsageMetrics) {
// sessionTurnPricing looks up the active pricing for model from the router config
// and converts it to the sessiontelemetry value type. A zero TurnPricing is
// returned when no config is loaded or the model has no pricing entry.
func (r *OpenAIRouter) sessionTurnPricing(model string) sessiontelemetry.TurnPricing {
	var none sessiontelemetry.TurnPricing
	if r.Config == nil {
		return none
	}
	pricing, found := r.Config.GetFullModelPricing(model)
	if !found {
		return none
	}
	return sessiontelemetry.TurnPricing{
		Currency:         pricing.Currency,
		PromptPer1M:      pricing.PromptPer1M,
		CompletionPer1M:  pricing.CompletionPer1M,
		CachedInputPer1M: pricing.CachedInputPer1M,
	}
}

func recordSessionTurn(ctx *RequestContext, usage responseUsageMetrics, pricing sessiontelemetry.TurnPricing) {
if ctx == nil || usage.promptTokens+usage.completionTokens <= 0 {
return
}
Expand All @@ -19,6 +37,7 @@ func recordSessionTurn(ctx *RequestContext, usage responseUsageMetrics) {
Domain: domain,
PromptTokens: usage.promptTokens,
CompletionTokens: usage.completionTokens,
Pricing: pricing,
}
if ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest {
if ctx.ResponseAPICtx.ConversationID == "" {
Expand Down
35 changes: 35 additions & 0 deletions src/semantic-router/pkg/observability/metrics/session_cost.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
)

// SessionTurnCost tracks per-turn cost for sessions with pricing configured, labeled by
// model, VSR domain/category, and currency so non-USD deployments are represented correctly.
// The buckets span several orders of magnitude (0.0001 up to 5.0 currency units) to
// resolve both very cheap turns and expensive long completions.
var SessionTurnCost = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "llm_session_turn_cost",
		Help:    "Distribution of per-turn cost attributed to a logical session (model + domain category + currency). Only recorded when pricing is configured.",
		Buckets: []float64{0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0},
	},
	[]string{"model", "domain", "currency"},
)

// RecordSessionTurnCost records the per-turn cost histogram for sessions with pricing
// configured. Callers must only invoke this when pricing is active (cost == 0 for a
// free model is valid; cost == 0 because pricing is absent should not be recorded).
func RecordSessionTurnCost(model, domain, currency string, cost float64) {
if model == "" {
model = consts.UnknownLabel
}
if domain == "" {
domain = consts.UnknownLabel
}
if currency == "" {
currency = "USD"
}
SessionTurnCost.WithLabelValues(model, domain, currency).Observe(cost)
}
Loading
Loading