@@ -86,17 +86,28 @@ async def process_func(
8686 Process a single request.
8787 """
8888
89- # Create fresh SamplingParams to avoid vLLM overflow bug when reusing
90- sampling_params = SamplingParams (max_tokens = self .sampling_params .max_tokens )
89+ # Extract params from kwargs - must pass to constructor, not mutate after,
90+ # because SamplingParams.__post_init__ sets skip_reading_prefix_cache based
91+ # on prompt_logprobs, and mutation after construction skips this.
92+ logprobs = None
93+ prompt_logprobs = None
94+ max_tokens = self .sampling_params .max_tokens
95+ temperature = 1.0
9196 for kwarg in kwargs :
9297 if "logprobs" in kwarg :
93- sampling_params . logprobs = kwarg ["top_logprobs" ]
98+ logprobs = kwarg ["top_logprobs" ]
9499 if "prompt_logprobs" in kwarg :
95- sampling_params . prompt_logprobs = kwarg ["prompt_logprobs" ]
100+ prompt_logprobs = kwarg ["prompt_logprobs" ]
96101 if "max_tokens" in kwarg :
97- sampling_params . max_tokens = kwarg ["max_tokens" ]
102+ max_tokens = kwarg ["max_tokens" ]
98103 if "temperature" in kwarg :
99- sampling_params .temperature = kwarg ["temperature" ]
104+ temperature = kwarg ["temperature" ]
105+ sampling_params = SamplingParams (
106+ max_tokens = max_tokens ,
107+ logprobs = logprobs ,
108+ prompt_logprobs = prompt_logprobs ,
109+ temperature = temperature ,
110+ )
100111 loop = asyncio .get_running_loop ()
101112 prompts = []
102113 statistics = []
0 commit comments