Skip to content

Commit c86432b

Browse files
committed
Add support for responses and conversations API
1 parent 415bbcc commit c86432b

File tree

13 files changed

+1284
-36
lines changed

13 files changed

+1284
-36
lines changed

go.mod

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ require (
2626
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
2727
sigs.k8s.io/controller-runtime v0.22.5
2828
sigs.k8s.io/gateway-api v1.4.1
29-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260128235548-fd30cb97714a
29+
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260203182229-e1cbdd1f76c5
3030
)
3131

3232
require (
@@ -107,13 +107,13 @@ require (
107107
github.com/xlab/treeprint v1.2.0 // indirect
108108
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
109109
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 // indirect
110-
go.opentelemetry.io/otel v1.39.0 // indirect
110+
go.opentelemetry.io/otel v1.40.0 // indirect
111111
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
112112
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
113113
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 // indirect
114-
go.opentelemetry.io/otel/metric v1.39.0 // indirect
114+
go.opentelemetry.io/otel/metric v1.40.0 // indirect
115115
go.opentelemetry.io/otel/sdk v1.39.0 // indirect
116-
go.opentelemetry.io/otel/trace v1.39.0 // indirect
116+
go.opentelemetry.io/otel/trace v1.40.0 // indirect
117117
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
118118
go.uber.org/atomic v1.11.0 // indirect
119119
go.uber.org/multierr v1.11.0 // indirect

go.sum

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -321,22 +321,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
321321
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
322322
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 h1:ssfIgGNANqpVFCndZvcuyKbl0g+UAVcbBcqGkG28H0Y=
323323
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0/go.mod h1:GQ/474YrbE4Jx8gZ4q5I4hrhUzM6UPzyrqJYV2AqPoQ=
324-
go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
325-
go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
324+
go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms=
325+
go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g=
326326
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 h1:f0cb2XPmrqn4XMy9PNliTgRKJgS5WcL/u0/WRYGz4t0=
327327
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0/go.mod h1:vnakAaFckOMiMtOIhFI2MNH4FYrZzXCYxmb1LlhoGz8=
328328
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 h1:in9O8ESIOlwJAEGTkkf34DesGRAc/Pn8qJ7k3r/42LM=
329329
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0/go.mod h1:Rp0EXBm5tfnv0WL+ARyO/PHBEaEAT8UUHQ6AGJcSq6c=
330330
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 h1:8UPA4IbVZxpsD76ihGOQiFml99GPAEZLohDXvqHdi6U=
331331
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0/go.mod h1:MZ1T/+51uIVKlRzGw1Fo46KEWThjlCBZKl2LzY5nv4g=
332-
go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
333-
go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
332+
go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g=
333+
go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc=
334334
go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18=
335335
go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE=
336336
go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8=
337337
go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew=
338-
go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
339-
go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
338+
go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw=
339+
go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA=
340340
go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
341341
go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
342342
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
@@ -446,8 +446,8 @@ sigs.k8s.io/controller-runtime v0.22.5 h1:v3nfSUMowX/2WMp27J9slwGFyAt7IV0YwBxAkr
446446
sigs.k8s.io/controller-runtime v0.22.5/go.mod h1:pc5SoYWnWI6I+cBHYYdZ7B6YHZVY5xNfll88JB+vniI=
447447
sigs.k8s.io/gateway-api v1.4.1 h1:NPxFutNkKNa8UfLd2CMlEuhIPMQgDQ6DXNKG9sHbJU8=
448448
sigs.k8s.io/gateway-api v1.4.1/go.mod h1:AR5RSqciWP98OPckEjOjh2XJhAe2Na4LHyXD2FUY7Qk=
449-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260128235548-fd30cb97714a h1:Ce5CZ0R3c5H475uEuJ92FMgux3j99wDrSsI4ivTBEXQ=
450-
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260128235548-fd30cb97714a/go.mod h1:lvMpB9a+Lk+xBi5Pk6teUG+NqA16WR8nRpmBNFJbflU=
449+
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260203182229-e1cbdd1f76c5 h1:JJzE5RL3y4ep3JCYF0e7djWDW6JvYVrO157kHgd1hF4=
450+
sigs.k8s.io/gateway-api-inference-extension v0.0.0-20260203182229-e1cbdd1f76c5/go.mod h1:5aE9jyjAIlAAfUFT22NDVyk9Ru8i2HSCCHMTXgKWNAo=
451451
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
452452
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
453453
sigs.k8s.io/kustomize/api v0.21.0 h1:I7nry5p8iDJbuRdYS7ez8MUvw7XVNPcIP5GkzzuXIIQ=

pkg/sidecar/proxy/chat_completions.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,5 @@ func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request)
7575
}
7676

7777
s.logger.V(4).Info("SSRF protection: prefill target allowed", "target", prefillHostPort)
78-
s.runConnectorProtocol(w, r, prefillHostPort)
78+
s.runConnectorProtocol(w, r, prefillHostPort, APITypeChatCompletions)
7979
}

pkg/sidecar/proxy/chat_completions_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ func TestServer_chatCompletionsHandler(t *testing.T) {
119119
s.prefillSamplerFn = func(n int) int { return i % n }
120120
// verify the hostPort value
121121
var hostPort string
122-
s.runConnectorProtocol = func(_ http.ResponseWriter, _ *http.Request, selectedHostPort string) { hostPort = selectedHostPort }
122+
s.runConnectorProtocol = func(_ http.ResponseWriter, _ *http.Request, selectedHostPort string, _ APIType) {
123+
hostPort = selectedHostPort
124+
}
123125
var passthrough bool
124126
s.decoderProxy = http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
125127
passthrough = true

pkg/sidecar/proxy/connector_lmcache.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@ import (
2323
"strings"
2424
)
2525

26-
func (s *Server) runLMCacheProtocol(w http.ResponseWriter, r *http.Request, prefillPodHostPort string) {
27-
s.logger.Info("running LMCache protocol")
26+
// runLMCacheProtocol handles the LMCache protocol for all OpenAI API types.
27+
// The apiType parameter determines which token limit fields to use:
28+
// - Chat Completions: max_tokens, max_completion_tokens
29+
// - Responses/Conversations: max_output_tokens
30+
func (s *Server) runLMCacheProtocol(w http.ResponseWriter, r *http.Request, prefillPodHostPort string, apiType APIType) {
31+
s.logger.Info("running LMCache protocol", "apiType", apiType)
2832

2933
// Read and parse request body
3034
defer r.Body.Close() //nolint:all
@@ -44,13 +48,16 @@ func (s *Server) runLMCacheProtocol(w http.ResponseWriter, r *http.Request, pref
4448
return
4549
}
4650

47-
// Create prefiller request. Set max_tokens to 1.
51+
// Create prefiller request. Set token limits to 1 based on API type.
4852

4953
ctx := r.Context()
5054
preq := r.Clone(ctx)
5155

52-
completionRequest[requestFieldMaxTokens] = 1
53-
completionRequest[requestFieldMaxCompletionTokens] = 1
56+
// Set token limits to 1 for prefill based on API type
57+
tokenFields := apiType.TokenLimitFields()
58+
for _, field := range tokenFields {
59+
completionRequest[field] = 1
60+
}
5461

5562
pbody, err := json.Marshal(completionRequest)
5663
if err != nil {

pkg/sidecar/proxy/connector_nixlv2.go

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,12 @@ import (
2525
"github.com/google/uuid"
2626
)
2727

28-
func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefillPodHostPort string) {
29-
s.logger.V(4).Info("running NIXL protocol V2", "url", prefillPodHostPort)
28+
// runNIXLProtocolV2 handles the NIXL v2 protocol for all OpenAI API types.
29+
// The apiType parameter determines which token limit fields to use:
30+
// - Chat Completions: max_tokens, max_completion_tokens
31+
// - Responses/Conversations: max_output_tokens
32+
func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefillPodHostPort string, apiType APIType) {
33+
s.logger.V(4).Info("running NIXL protocol V2", "url", prefillPodHostPort, "apiType", apiType)
3034

3135
// Read request body
3236
defer r.Body.Close() //nolint:all
@@ -64,10 +68,23 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
6468

6569
preq.Header.Add(requestHeaderRequestID, uuidStr)
6670

71+
// Save the original stream settings (not API-type specific) so they can be restored after the prefill request is built
6772
streamValue, streamOk := completionRequest[requestFieldStream]
6873
streamOptionsValue, streamOptionsOk := completionRequest[requestFieldStreamOptions]
69-
maxTokensValue, maxTokensOk := completionRequest[requestFieldMaxTokens]
70-
maxCompletionTokensValue, maxCompletionTokensOk := completionRequest[requestFieldMaxCompletionTokens]
74+
75+
// Save token limit fields based on API type
76+
tokenFields := apiType.TokenLimitFields()
77+
savedTokenValues := make(map[string]struct {
78+
value any
79+
ok bool
80+
})
81+
for _, field := range tokenFields {
82+
value, ok := completionRequest[field]
83+
savedTokenValues[field] = struct {
84+
value any
85+
ok bool
86+
}{value, ok}
87+
}
7188

7289
completionRequest[requestFieldKVTransferParams] = map[string]any{
7390
requestFieldDoRemoteDecode: true,
@@ -80,8 +97,11 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
8097

8198
completionRequest[requestFieldStream] = false
8299
delete(completionRequest, requestFieldStreamOptions)
83-
completionRequest[requestFieldMaxTokens] = 1
84-
completionRequest[requestFieldMaxCompletionTokens] = 1
100+
101+
// Set token limits to 1 for prefill based on API type
102+
for _, field := range tokenFields {
103+
completionRequest[field] = 1
104+
}
85105

86106
pbody, err := json.Marshal(completionRequest)
87107
if err != nil {
@@ -145,14 +165,15 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
145165
if streamOptionsOk {
146166
completionRequest[requestFieldStreamOptions] = streamOptionsValue
147167
}
148-
delete(completionRequest, requestFieldMaxTokens)
149-
if maxTokensOk {
150-
completionRequest[requestFieldMaxTokens] = maxTokensValue
151-
}
152-
delete(completionRequest, requestFieldMaxCompletionTokens)
153-
if maxCompletionTokensOk {
154-
completionRequest[requestFieldMaxCompletionTokens] = maxCompletionTokensValue
168+
169+
// Restore token limit fields based on API type
170+
for _, field := range tokenFields {
171+
delete(completionRequest, field)
172+
if saved := savedTokenValues[field]; saved.ok {
173+
completionRequest[field] = saved.value
174+
}
155175
}
176+
156177
completionRequest[requestFieldKVTransferParams] = pKVTransferParams
157178

158179
dbody, err := json.Marshal(completionRequest)

0 commit comments

Comments
 (0)