@@ -665,6 +665,102 @@ async def cancel_responses(response_id: str, raw_request: Request):
     return JSONResponse(content=response.model_dump())
 
 
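+# Experimental spnl-backed span-query endpoints; registered only when
+# VLLM_V1_SPANS_ENABLED is set.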
+if envs.VLLM_V1_SPANS_ENABLED:
+    import time
+
+    import spnl
+    from fastapi import Body
+
+    from vllm import SamplingParams
+    from vllm.entrypoints.openai.protocol import (ChatCompletionResponse,
+                                                  ChatCompletionResponseChoice,
+                                                  ChatMessage, UsageInfo)
+    from vllm.inputs import TokensPrompt
+    from vllm.outputs import CompletionOutput
+
+    spnl_state = spnl.init(10)
+    PAD_TOKEN = 27
+    PLUS_TOKEN = (envs.VLLM_V1_SPANS_TOKEN_PLUS
+                  if envs.VLLM_V1_SPANS_TOKEN_PLUS >= 0 else None)
+    CROSS_TOKEN = (envs.VLLM_V1_SPANS_TOKEN_CROSS
+                   if envs.VLLM_V1_SPANS_TOKEN_CROSS >= 0 else None)
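+
+    # Wrap a token-id list (or a batch of token-id lists) as TokensPrompt
+    # inputs for the engine.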
+    def wrap(
+        prompt: list[int] | list[list[int]]
+    ) -> TokensPrompt | list[TokensPrompt]:
+        if isinstance(prompt[0], list):
+            return [TokensPrompt(prompt_token_ids=p) for p in prompt]
+        return TokensPrompt(prompt_token_ids=prompt)
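+
+    # Tokenize the plus/independent spans of a query and run a one-token
+    # generation per span so that their prefixes are preloaded into the
+    # KV cache before the query is executed.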
+    @router.post("/v1/query/prepare")
+    @with_cancellation
+    @load_aware_call
+    async def prepare_query(raw_request: Request,
+                            query: str = Body(...,
+                                              media_type="text/plain")):
+        docs = [
+            wrap(doc) for doc in spnl.tokenize_prepare(
+                spnl_state,
+                query,
+                True,  # we need to preload the prefix of the plus/independent spans
+                PAD_TOKEN,
+                PLUS_TOKEN,
+                raw_request.app.state.vllm_config.cache_config.block_size)
+        ]
+
+        request_id = raw_request.headers.get(
+            "X-Request-Id") or uuid.uuid4().hex
+        client = engine_client(raw_request)
+        generators = [
+            client.generate(doc,
+                            SamplingParams(temperature=0, max_tokens=1),
+                            request_id) for doc in docs
+        ]
+        for generator in generators:
+            async for res in generator:
+                final = res.outputs[0]
+
+        if isinstance(generator, ErrorResponse):
+            return JSONResponse(content=generator.model_dump(),
+                                status_code=generator.error.code)
+        return JSONResponse(content={"success": True})
+
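+    # Tokenize the full spnl query, run it through the engine, and return
+    # an OpenAI-style chat completion response.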
+    @router.post("/v1/query/execute")
+    @with_cancellation
+    @load_aware_call
+    async def execute_query(raw_request: Request,
+                            query: str = Body(...,
+                                              media_type="text/plain")):
+        req = spnl.tokenize_query(
+            spnl_state,
+            query,
+            PAD_TOKEN,
+            CROSS_TOKEN,
+            PLUS_TOKEN,
+            raw_request.app.state.vllm_config.cache_config.block_size)
+
+        request_id = raw_request.headers.get(
+            "X-Request-Id") or uuid.uuid4().hex
+        client = engine_client(raw_request)
+        # Clamp n once so the sampling params and the outputs list agree.
+        n = req.n if req.n > 0 else 1
+        temperature = req.temperature if req.temperature is not None else 0
+        max_tokens = (req.max_tokens
+                      if req.max_tokens is not None and req.max_tokens != 0
+                      else 2048)
+        generator = client.generate(
+            wrap(req.messages),
+            SamplingParams(n=n,
+                           temperature=temperature,
+                           max_tokens=max_tokens), request_id)
+
+        # TODO streaming output...
+        outputs: list[CompletionOutput | None] = [None] * n
+        async for res in generator:
+            for output in res.outputs:
+                outputs[output.index] = output
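+        # Assemble an OpenAI-style chat completion response from the final
+        # output of each sampled choice.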
+        choices = [
+            ChatCompletionResponseChoice(
+                index=index,
+                message=ChatMessage(role="assistant", content=output.text),
+                logprobs=output.logprobs,
+                finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
+            ) for index, output in enumerate(outputs)
+        ]
+        num_prompt_tokens = 0  # TODO
+        num_generated_tokens = 0  # TODO
+        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
+                          completion_tokens=num_generated_tokens,
+                          total_tokens=num_prompt_tokens +
+                          num_generated_tokens)
+        response = ChatCompletionResponse(id=request_id,
+                                          created=int(time.time()),
+                                          model=req.model,
+                                          choices=choices,
+                                          usage=usage)
+
+        if isinstance(generator, ErrorResponse):
+            return JSONResponse(content=generator.model_dump(),
+                                status_code=generator.error.code)
+        return JSONResponse(content=response.model_dump())
+
 @router.post("/v1/chat/completions",
              dependencies=[Depends(validate_json_request)],
              responses={
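
A minimal sketch of how the new endpoints might be exercised from a client, assuming a server started with VLLM_V1_SPANS_ENABLED=1 on localhost:8000; the query string is a placeholder, since its format is defined by the spnl library rather than by this diff:

import requests  # client-side sketch only, not part of this change

base = "http://localhost:8000"  # assumed server address
query = "..."  # an spnl query document (placeholder; format defined by spnl)

# Warm the prefix cache for the query's independent (plus) spans.
requests.post(f"{base}/v1/query/prepare", data=query,
              headers={"Content-Type": "text/plain"}).raise_for_status()

# Execute the query and read back the chat-completion-style response.
resp = requests.post(f"{base}/v1/query/execute", data=query,
                     headers={"Content-Type": "text/plain"})
print(resp.json()["choices"][0]["message"]["content"])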