Skip to content

Commit 8d4df8c

Browse files
committed
fix max token decrement and streaming logic
Signed-off-by: RishabhSaini <rishabhsaini01@gmail.com>
1 parent 13f14a5 commit 8d4df8c

File tree

1 file changed

+16
-12
lines changed

1 file changed

+16
-12
lines changed

pkg/sidecar/proxy/connector_nixlv2.go

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -86,12 +86,8 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
8686
maxCompletionTokensValue, maxCompletionTokensOk := completionRequest[requestFieldMaxCompletionTokens]
8787

8888
// Determine if client wants streaming
89-
clientWantsStreaming := false
90-
if streamOk {
91-
if streamBool, ok := streamValue.(bool); ok {
92-
clientWantsStreaming = streamBool
93-
}
94-
}
89+
streamBool, streamBoolOk := streamValue.(bool)
90+
clientWantsStreaming := streamOk && streamBoolOk && streamBool
9591

9692
completionRequest[requestFieldKVTransferParams] = map[string]any{
9793
requestFieldDoRemoteDecode: true,
@@ -260,18 +256,26 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
260256
}
261257
delete(completionRequest, requestFieldMaxTokens)
262258
if maxTokensOk {
263-
// Decrement by 1 since prefill already generated 1 token
264-
if val, ok := maxTokensValue.(float64); ok && val > 0 {
265-
completionRequest[requestFieldMaxTokens] = val - 1
259+
if clientWantsStreaming {
260+
// Decrement by 1 since we already sent 1 token to streaming client
261+
if val, ok := maxTokensValue.(float64); ok && val > 0 {
262+
completionRequest[requestFieldMaxTokens] = val - 1
263+
} else {
264+
completionRequest[requestFieldMaxTokens] = maxTokensValue
265+
}
266266
} else {
267267
completionRequest[requestFieldMaxTokens] = maxTokensValue
268268
}
269269
}
270270
delete(completionRequest, requestFieldMaxCompletionTokens)
271271
if maxCompletionTokensOk {
272-
// Decrement by 1 since prefill already generated 1 token
273-
if val, ok := maxCompletionTokensValue.(float64); ok && val > 0 {
274-
completionRequest[requestFieldMaxCompletionTokens] = val - 1
272+
if clientWantsStreaming {
273+
// Decrement by 1 since we already sent 1 token to streaming client
274+
if val, ok := maxCompletionTokensValue.(float64); ok && val > 0 {
275+
completionRequest[requestFieldMaxCompletionTokens] = val - 1
276+
} else {
277+
completionRequest[requestFieldMaxCompletionTokens] = maxCompletionTokensValue
278+
}
275279
} else {
276280
completionRequest[requestFieldMaxCompletionTokens] = maxCompletionTokensValue
277281
}

0 commit comments

Comments
 (0)