-
Notifications
You must be signed in to change notification settings - Fork 615
feat(observability): stamp per-turn pricing metadata and cumulative cost onto session log #1740
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,158 @@ | ||
| package testcases | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "net/http" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "github.com/vllm-project/semantic-router/e2e/pkg/fixtures" | ||
| pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" | ||
| "k8s.io/client-go/kubernetes" | ||
| ) | ||
|
|
||
| func init() { | ||
| pkgtestcases.Register("session-pricing-chat-completions", pkgtestcases.TestCase{ | ||
| Description: "After a routed chat completion, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured", | ||
| Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing"}, | ||
| Fn: testSessionPricingChatCompletions, | ||
| }) | ||
| pkgtestcases.Register("session-pricing-response-api", pkgtestcases.TestCase{ | ||
| Description: "After a routed Response API call, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured", | ||
| Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing", "response-api"}, | ||
| Fn: testSessionPricingResponseAPI, | ||
| }) | ||
| } | ||
|
|
||
| // testSessionPricingChatCompletions verifies that after a Chat Completions request the | ||
| // llm_session_turn_cost histogram is present in /metrics (pricing must be configured | ||
| // for the routed model in router-config.yaml for the observation to appear). | ||
| func testSessionPricingChatCompletions( | ||
| ctx context.Context, | ||
| client *kubernetes.Clientset, | ||
| opts pkgtestcases.TestCaseOptions, | ||
| ) error { | ||
| traffic, err := fixtures.OpenServiceSession(ctx, client, opts) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer traffic.Close() | ||
|
|
||
| metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer metricsSession.Close() | ||
|
|
||
| chat := fixtures.NewChatCompletionsClient(traffic, 60*time.Second) | ||
|
|
||
| headers := map[string]string{ | ||
| "x-authz-user-id": "e2e-pricing-chat-user", | ||
| } | ||
| resp, err := chat.Create(ctx, fixtures.ChatCompletionsRequest{ | ||
| Model: "MoM", | ||
| Messages: []fixtures.ChatMessage{ | ||
| {Role: "user", Content: "Say hello in one short sentence for pricing telemetry."}, | ||
| }, | ||
| User: "e2e-pricing-chat-user", | ||
| }, headers) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| if resp.StatusCode != http.StatusOK { | ||
| return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body)) | ||
| } | ||
|
|
||
| body, err := fetchMetrics(ctx, metricsSession) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // Token histograms from PR 1 must still be present. | ||
| if !strings.Contains(body, "llm_session_turn_prompt_tokens") { | ||
| return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens") | ||
| } | ||
| if !strings.Contains(body, "llm_session_turn_completion_tokens") { | ||
| return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens") | ||
| } | ||
| // Cost histogram descriptor must be registered (present even when no observations). | ||
| if !strings.Contains(body, "llm_session_turn_cost") { | ||
| return fmt.Errorf("metrics body missing llm_session_turn_cost") | ||
| } | ||
|
|
||
| if opts.SetDetails != nil { | ||
| opts.SetDetails(map[string]interface{}{ | ||
| "chat_status": resp.StatusCode, | ||
| }) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // testSessionPricingResponseAPI verifies that after a Response API request the | ||
| // llm_session_turn_cost histogram descriptor is exposed in /metrics. | ||
| func testSessionPricingResponseAPI( | ||
| ctx context.Context, | ||
| client *kubernetes.Clientset, | ||
| opts pkgtestcases.TestCaseOptions, | ||
| ) error { | ||
| traffic, err := fixtures.OpenServiceSession(ctx, client, opts) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer traffic.Close() | ||
|
|
||
| metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer metricsSession.Close() | ||
|
|
||
| respAPI := fixtures.NewResponseAPIClient(traffic, 60*time.Second) | ||
|
|
||
| _, raw, err := respAPI.Create(ctx, fixtures.ResponseAPIRequest{ | ||
| Model: "MoM", | ||
| Input: "Say hello in one short sentence for Response API pricing telemetry.", | ||
| }) | ||
| if err != nil { | ||
| return fmt.Errorf("response api create: %w", err) | ||
| } | ||
| if raw.StatusCode != http.StatusOK { | ||
| return fmt.Errorf("response api: expected 200, got %d: %s", raw.StatusCode, string(raw.Body)) | ||
| } | ||
|
|
||
| body, err := fetchMetrics(ctx, metricsSession) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if !strings.Contains(body, "llm_session_turn_cost") { | ||
| return fmt.Errorf("metrics body missing llm_session_turn_cost after Response API request") | ||
| } | ||
| if !strings.Contains(body, "llm_session_turn_prompt_tokens") { | ||
| return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens after Response API request") | ||
| } | ||
| if !strings.Contains(body, "llm_session_turn_completion_tokens") { | ||
| return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens after Response API request") | ||
| } | ||
|
|
||
| if opts.SetDetails != nil { | ||
| opts.SetDetails(map[string]interface{}{ | ||
| "response_api_status": raw.StatusCode, | ||
| }) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // fetchMetrics retrieves the Prometheus /metrics text from the router metrics port. | ||
| func fetchMetrics(ctx context.Context, metricsSession *fixtures.ServiceSession) (string, error) { | ||
| metricsHTTP := metricsSession.HTTPClient(15 * time.Second) | ||
| metricsResp, err := fixtures.DoGETRequest(ctx, metricsHTTP, metricsSession.URL("/metrics")) | ||
| if err != nil { | ||
| return "", fmt.Errorf("fetch /metrics: %w", err) | ||
| } | ||
| if metricsResp.StatusCode != http.StatusOK { | ||
| return "", fmt.Errorf("/metrics: expected 200, got %d", metricsResp.StatusCode) | ||
| } | ||
| return string(metricsResp.Body), nil | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| package config | ||
|
|
||
| // GetFullModelPricing returns the complete ModelPricing entry for the given model, | ||
| // including CachedInputPer1M. Returns (p, true) when at least one rate is non-zero | ||
| // or Currency is explicitly set (currency-only counts as configured so that free/ | ||
| // self-hosted models produce cost=0 telemetry). Returns (zero, false) when the model | ||
| // has no pricing entry at all. Accepts both short names and provider model IDs. | ||
| func (c *RouterConfig) GetFullModelPricing(modelName string) (ModelPricing, bool) { | ||
| if modelConfig, ok := c.resolveModelConfig(modelName); ok { | ||
| p := modelConfig.Pricing | ||
| if p.PromptPer1M != 0 || p.CompletionPer1M != 0 || p.CachedInputPer1M != 0 || p.Currency != "" { | ||
| if p.Currency == "" { | ||
| p.Currency = "USD" | ||
| } | ||
| return p, true | ||
| } | ||
|
Comment on lines
+3
to
+16
|
||
| } | ||
| return ModelPricing{}, false | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| package metrics | ||
|
|
||
| import ( | ||
| "github.com/prometheus/client_golang/prometheus" | ||
| "github.com/prometheus/client_golang/prometheus/promauto" | ||
|
|
||
| "github.com/vllm-project/semantic-router/src/semantic-router/pkg/consts" | ||
| ) | ||
|
|
||
| // SessionTurnCost tracks per-turn cost for sessions with pricing configured, labeled by | ||
| // model, VSR domain/category, and currency so non-USD deployments are represented correctly. | ||
| var SessionTurnCost = promauto.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "llm_session_turn_cost", | ||
| Help: "Distribution of per-turn cost attributed to a logical session (model + domain category + currency). Only recorded when pricing is configured.", | ||
| Buckets: []float64{0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0}, | ||
| }, | ||
| []string{"model", "domain", "currency"}, | ||
| ) | ||
|
|
||
| // RecordSessionTurnCost records the per-turn cost histogram for sessions with pricing | ||
| // configured. Callers must only invoke this when pricing is active (cost == 0 for a | ||
| // free model is valid; cost == 0 because pricing is absent should not be recorded). | ||
| func RecordSessionTurnCost(model, domain, currency string, cost float64) { | ||
| if model == "" { | ||
| model = consts.UnknownLabel | ||
| } | ||
| if domain == "" { | ||
| domain = consts.UnknownLabel | ||
| } | ||
| if currency == "" { | ||
| currency = "USD" | ||
| } | ||
| SessionTurnCost.WithLabelValues(model, domain, currency).Observe(cost) | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These new test cases are registered, but they are not referenced by any testmatrix group or profile `GetTestCases()` list (a search shows the names appear only in this file). As a result, they likely won't run in CI unless manually selected. If these are meant to provide durable E2E coverage for session cost telemetry, add them to an appropriate testmatrix group/profile so the harness executes them by default.