Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions e2e/pkg/testmatrix/testcases.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ var BaselineRouterContract = []string{
"decision-fallback-behavior",
"plugin-config-variations",
"chat-completions-progressive-stress",
// Session observability
"session-telemetry-metrics",
"session-pricing-chat-completions",
"session-pricing-response-api",
}

// DashboardContract is the canonical E2E contract for the dashboard API surface.
Expand Down
158 changes: 158 additions & 0 deletions e2e/testcases/session_pricing_e2e.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package testcases

import (
"context"
"fmt"
"net/http"
"strings"
"time"

"github.com/vllm-project/semantic-router/e2e/pkg/fixtures"
pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
"k8s.io/client-go/kubernetes"
)

func init() {
pkgtestcases.Register("session-pricing-chat-completions", pkgtestcases.TestCase{
Description: "After a routed chat completion, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing"},
Fn: testSessionPricingChatCompletions,
})
pkgtestcases.Register("session-pricing-response-api", pkgtestcases.TestCase{
Description: "After a routed Response API call, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing", "response-api"},
Fn: testSessionPricingResponseAPI,
})
Comment on lines +15 to +25
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These new test cases are registered, but they are not referenced by any testmatrix group or profile GetTestCases() list (search shows the names only appear in this file). As a result, they likely won’t run in CI unless manually selected. If these are meant to provide durable E2E coverage for session cost telemetry, add them to an appropriate testmatrix group/profile so the harness executes them by default.

Copilot generated this review using guidance from repository custom instructions.
}

// testSessionPricingChatCompletions verifies that after a Chat Completions request the
// llm_session_turn_cost histogram is present in /metrics (pricing must be configured
// for the routed model in router-config.yaml for the observation to appear).
func testSessionPricingChatCompletions(
	ctx context.Context,
	client *kubernetes.Clientset,
	opts pkgtestcases.TestCaseOptions,
) error {
	traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer traffic.Close()

	metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer metricsSession.Close()

	chatClient := fixtures.NewChatCompletionsClient(traffic, 60*time.Second)

	// Route one chat completion through the router so a session turn is recorded.
	resp, err := chatClient.Create(ctx, fixtures.ChatCompletionsRequest{
		Model: "MoM",
		Messages: []fixtures.ChatMessage{
			{Role: "user", Content: "Say hello in one short sentence for pricing telemetry."},
		},
		User: "e2e-pricing-chat-user",
	}, map[string]string{"x-authz-user-id": "e2e-pricing-chat-user"})
	if err != nil {
		return err
	}
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body))
	}

	body, err := fetchMetrics(ctx, metricsSession)
	if err != nil {
		return err
	}

	// The token histograms from PR 1 and the cost histogram descriptor must all be
	// present; descriptors are exposed even when no observation has been recorded.
	for _, metric := range []string{
		"llm_session_turn_prompt_tokens",
		"llm_session_turn_completion_tokens",
		"llm_session_turn_cost",
	} {
		if !strings.Contains(body, metric) {
			return fmt.Errorf("metrics body missing %s", metric)
		}
	}

	if opts.SetDetails != nil {
		opts.SetDetails(map[string]interface{}{
			"chat_status": resp.StatusCode,
		})
	}
	return nil
}

// testSessionPricingResponseAPI verifies that after a Response API request the
// llm_session_turn_cost histogram descriptor is exposed in /metrics.
func testSessionPricingResponseAPI(
	ctx context.Context,
	client *kubernetes.Clientset,
	opts pkgtestcases.TestCaseOptions,
) error {
	traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer traffic.Close()

	metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
	if err != nil {
		return err
	}
	defer metricsSession.Close()

	apiClient := fixtures.NewResponseAPIClient(traffic, 60*time.Second)

	// Route one Response API call through the router so a session turn is recorded.
	_, raw, err := apiClient.Create(ctx, fixtures.ResponseAPIRequest{
		Model: "MoM",
		Input: "Say hello in one short sentence for Response API pricing telemetry.",
	})
	if err != nil {
		return fmt.Errorf("response api create: %w", err)
	}
	if raw.StatusCode != http.StatusOK {
		return fmt.Errorf("response api: expected 200, got %d: %s", raw.StatusCode, string(raw.Body))
	}

	body, err := fetchMetrics(ctx, metricsSession)
	if err != nil {
		return err
	}

	// All three session histograms must appear; descriptors are registered up
	// front, so they are visible even before any observation is made.
	for _, metric := range []string{
		"llm_session_turn_cost",
		"llm_session_turn_prompt_tokens",
		"llm_session_turn_completion_tokens",
	} {
		if !strings.Contains(body, metric) {
			return fmt.Errorf("metrics body missing %s after Response API request", metric)
		}
	}

	if opts.SetDetails != nil {
		opts.SetDetails(map[string]interface{}{
			"response_api_status": raw.StatusCode,
		})
	}
	return nil
}

// fetchMetrics retrieves the Prometheus /metrics text from the router metrics port.
func fetchMetrics(ctx context.Context, metricsSession *fixtures.ServiceSession) (string, error) {
	httpClient := metricsSession.HTTPClient(15 * time.Second)
	resp, err := fixtures.DoGETRequest(ctx, httpClient, metricsSession.URL("/metrics"))
	if err != nil {
		return "", fmt.Errorf("fetch /metrics: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("/metrics: expected 200, got %d", resp.StatusCode)
	}
	return string(resp.Body), nil
}
7 changes: 4 additions & 3 deletions src/semantic-router/pkg/config/model_config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,10 @@ type ProviderProfile struct {
}

// ModelPricing describes per-model token pricing used for session cost telemetry.
// All rates are expressed per one million tokens; the zero value means the model
// has no pricing configured.
type ModelPricing struct {
	// Currency is the pricing currency code (e.g. "USD"); consumers default it
	// to USD when rates are set but the currency is omitted.
	Currency string `yaml:"currency,omitempty"`
	// PromptPer1M is the cost per one million prompt (input) tokens.
	PromptPer1M float64 `yaml:"prompt_per_1m,omitempty"`
	// CompletionPer1M is the cost per one million completion (output) tokens.
	CompletionPer1M float64 `yaml:"completion_per_1m,omitempty"`
	// CachedInputPer1M is the cost per one million cached input tokens.
	CachedInputPer1M float64 `yaml:"cached_input_per_1m,omitempty"`
}

type ModelParams struct {
Expand Down
19 changes: 19 additions & 0 deletions src/semantic-router/pkg/config/pricing_helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package config

// GetFullModelPricing returns the complete ModelPricing entry for the given model,
// including CachedInputPer1M. Returns (p, true) when at least one rate is non-zero
// or Currency is explicitly set (currency-only counts as configured so that free/
// self-hosted models produce cost=0 telemetry). Returns (zero, false) when the model
// has no pricing entry at all. Accepts both short names and provider model IDs.
func (c *RouterConfig) GetFullModelPricing(modelName string) (ModelPricing, bool) {
if modelConfig, ok := c.resolveModelConfig(modelName); ok {
p := modelConfig.Pricing
if p.PromptPer1M != 0 || p.CompletionPer1M != 0 || p.CachedInputPer1M != 0 || p.Currency != "" {
if p.Currency == "" {
p.Currency = "USD"
}
return p, true
}
Comment on lines +3 to +16
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function comment says the second return value is false when "no pricing is configured", but the implementation treats currency alone (with all per-1M rates at 0) as configured and returns (p, true). Please clarify/align the definition of "configured" here (e.g., require at least one non-zero rate, or update the comment to state that currency-only counts as configured).

Copilot uses AI. Check for mistakes.
}
return ModelPricing{}, false
}
9 changes: 5 additions & 4 deletions src/semantic-router/pkg/extproc/processor_res_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability/metrics"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/ratelimit"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/routerreplay"
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/sessiontelemetry"
)

type responseUsageMetrics struct {
Expand Down Expand Up @@ -60,7 +61,7 @@ func (r *OpenAIRouter) reportNonStreamingUsage(
}

if totalTokens > 0 {
recordSessionTurn(ctx, usage)
recordSessionTurn(ctx, usage, r.sessionTurnPricing(ctx.RequestModel))
}

if ctx.RequestModel == "" {
Expand Down Expand Up @@ -159,14 +160,14 @@ func extractStreamingUsage(ctx *RequestContext) openai.CompletionUsage {
return usage
}

func recordSessionTurnFromStreamingUsage(ctx *RequestContext, usage openai.CompletionUsage) {
func recordSessionTurnFromStreamingUsage(ctx *RequestContext, usage openai.CompletionUsage, pricing sessiontelemetry.TurnPricing) {
if usage.PromptTokens <= 0 && usage.CompletionTokens <= 0 {
return
}
recordSessionTurn(ctx, responseUsageMetrics{
promptTokens: int(usage.PromptTokens),
completionTokens: int(usage.CompletionTokens),
})
}, pricing)
}

func (r *OpenAIRouter) reportStreamingUsageMetrics(
Expand All @@ -181,7 +182,7 @@ func (r *OpenAIRouter) reportStreamingUsageMetrics(
})
}

recordSessionTurnFromStreamingUsage(ctx, usage)
recordSessionTurnFromStreamingUsage(ctx, usage, r.sessionTurnPricing(ctx.RequestModel))

if ctx.RequestModel == "" || (usage.PromptTokens == 0 && usage.CompletionTokens == 0) {
return
Expand Down
21 changes: 20 additions & 1 deletion src/semantic-router/pkg/extproc/session_telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,25 @@ import (
"github.com/vllm-project/semantic-router/src/semantic-router/pkg/sessiontelemetry"
)

func recordSessionTurn(ctx *RequestContext, usage responseUsageMetrics) {
// sessionTurnPricing looks up the active pricing for model from the router config
// and converts it to the sessiontelemetry value type. A zero TurnPricing is
// returned when no config is loaded or the model has no pricing entry.
func (r *OpenAIRouter) sessionTurnPricing(model string) sessiontelemetry.TurnPricing {
	var none sessiontelemetry.TurnPricing
	if r.Config == nil {
		return none
	}
	pricing, found := r.Config.GetFullModelPricing(model)
	if !found {
		return none
	}
	return sessiontelemetry.TurnPricing{
		Currency:         pricing.Currency,
		PromptPer1M:      pricing.PromptPer1M,
		CompletionPer1M:  pricing.CompletionPer1M,
		CachedInputPer1M: pricing.CachedInputPer1M,
	}
}

func recordSessionTurn(ctx *RequestContext, usage responseUsageMetrics, pricing sessiontelemetry.TurnPricing) {
if ctx == nil || usage.promptTokens+usage.completionTokens <= 0 {
return
}
Expand All @@ -19,6 +37,7 @@ func recordSessionTurn(ctx *RequestContext, usage responseUsageMetrics) {
Domain: domain,
PromptTokens: usage.promptTokens,
CompletionTokens: usage.completionTokens,
Pricing: pricing,
}
if ctx.ResponseAPICtx != nil && ctx.ResponseAPICtx.IsResponseAPIRequest {
if ctx.ResponseAPICtx.ConversationID == "" {
Expand Down
35 changes: 35 additions & 0 deletions src/semantic-router/pkg/observability/metrics/session_cost.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts"
)

// SessionTurnCost tracks per-turn cost for sessions with pricing configured, labeled by
// model, VSR domain/category, and currency so non-USD deployments are represented correctly.
// The buckets span several orders of magnitude (0.0001 up to 5.0 currency units) to
// resolve both very cheap turns and expensive long completions.
var SessionTurnCost = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "llm_session_turn_cost",
		Help:    "Distribution of per-turn cost attributed to a logical session (model + domain category + currency). Only recorded when pricing is configured.",
		Buckets: []float64{0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0},
	},
	[]string{"model", "domain", "currency"},
)

// RecordSessionTurnCost records the per-turn cost histogram for sessions with pricing
// configured. Callers must only invoke this when pricing is active (cost == 0 for a
// free model is valid; cost == 0 because pricing is absent should not be recorded).
func RecordSessionTurnCost(model, domain, currency string, cost float64) {
if model == "" {
model = consts.UnknownLabel
}
if domain == "" {
domain = consts.UnknownLabel
}
if currency == "" {
currency = "USD"
}
SessionTurnCost.WithLabelValues(model, domain, currency).Observe(cost)
}
Loading
Loading