@@ -86,12 +86,8 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
8686 maxCompletionTokensValue , maxCompletionTokensOk := completionRequest [requestFieldMaxCompletionTokens ]
8787
8888 // Determine if client wants streaming
89- clientWantsStreaming := false
90- if streamOk {
91- if streamBool , ok := streamValue .(bool ); ok {
92- clientWantsStreaming = streamBool
93- }
94- }
89+ streamBool , streamBoolOk := streamValue .(bool )
90+ clientWantsStreaming := streamOk && streamBoolOk && streamBool
9591
9692 completionRequest [requestFieldKVTransferParams ] = map [string ]any {
9793 requestFieldDoRemoteDecode : true ,
@@ -260,18 +256,26 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
260256 }
261257 delete (completionRequest , requestFieldMaxTokens )
262258 if maxTokensOk {
263- // Decrement by 1 since prefill already generated 1 token
264- if val , ok := maxTokensValue .(float64 ); ok && val > 0 {
265- completionRequest [requestFieldMaxTokens ] = val - 1
259+ if clientWantsStreaming {
260+ // Decrement by 1 since we already sent 1 token to streaming client
261+ if val , ok := maxTokensValue .(float64 ); ok && val > 0 {
262+ completionRequest [requestFieldMaxTokens ] = val - 1
263+ } else {
264+ completionRequest [requestFieldMaxTokens ] = maxTokensValue
265+ }
266266 } else {
267267 completionRequest [requestFieldMaxTokens ] = maxTokensValue
268268 }
269269 }
270270 delete (completionRequest , requestFieldMaxCompletionTokens )
271271 if maxCompletionTokensOk {
272- // Decrement by 1 since prefill already generated 1 token
273- if val , ok := maxCompletionTokensValue .(float64 ); ok && val > 0 {
274- completionRequest [requestFieldMaxCompletionTokens ] = val - 1
272+ if clientWantsStreaming {
273+ // Decrement by 1 since we already sent 1 token to streaming client
274+ if val , ok := maxCompletionTokensValue .(float64 ); ok && val > 0 {
275+ completionRequest [requestFieldMaxCompletionTokens ] = val - 1
276+ } else {
277+ completionRequest [requestFieldMaxCompletionTokens ] = maxCompletionTokensValue
278+ }
275279 } else {
276280 completionRequest [requestFieldMaxCompletionTokens ] = maxCompletionTokensValue
277281 }
0 commit comments