From b2146fc1c663442b3fd02de39d36431371da21c6 Mon Sep 17 00:00:00 2001
From: Chessing234
Date: Thu, 16 Apr 2026 05:26:10 +0530
Subject: [PATCH] Fix VLLMWorker.generate_stream referencing module-level
 engine instead of self

VLLMWorker.__init__ accepts llm_engine: AsyncLLMEngine and uses it to
populate self.tokenizer / self.context_len, but never stores it on self.
The generate_stream method then calls

    results_generator = engine.generate(...)
    ...
    await engine.abort(...)

referring to a module-level engine that only exists when the file is
executed via __main__ (line 290: `engine =
AsyncLLMEngine.from_engine_args(...)`). When VLLMWorker is imported and
instantiated by other code paths, those references raise NameError.

Bind the engine to the instance (self.llm_engine = llm_engine) and route
the two call sites through self.llm_engine. Behavior under the __main__
entry point is preserved because __init__ now stores the same engine
that was previously read from the global.
---
 fastchat/serve/vllm_worker.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 0af680bb5..3614272ef 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -54,6 +54,7 @@ def __init__(
         logger.info(
             f"Loading the model {self.model_names} on worker {worker_id}, worker type: vLLM worker..."
         )
+        self.llm_engine = llm_engine
         self.tokenizer = llm_engine.engine.tokenizer
         # This is to support vllm >= 0.2.7 where TokenizerGroup was introduced
         # and llm_engine.engine.tokenizer was no longer a raw tokenizer
@@ -116,7 +117,7 @@ async def generate_stream(self, params):
             frequency_penalty=frequency_penalty,
             best_of=best_of,
         )
-        results_generator = engine.generate(context, sampling_params, request_id)
+        results_generator = self.llm_engine.generate(context, sampling_params, request_id)
 
         async for request_output in results_generator:
             prompt = request_output.prompt
@@ -135,7 +136,7 @@ async def generate_stream(self, params):
 
         aborted = False
         if request and await request.is_disconnected():
-            await engine.abort(request_id)
+            await self.llm_engine.abort(request_id)
             request_output.finished = True
             aborted = True
         for output in request_output.outputs:
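
Note (not part of the patch): a minimal, self-contained sketch of the
failure mode described above. All names here (Worker, engine,
this_module) are hypothetical stand-ins rather than FastChat code; the
point is only that a bare name inside a method is resolved against the
module's globals at call time, so it works under __main__ but not when
the class is imported elsewhere.

    class Worker:
        def __init__(self, llm_engine):
            # Bug under illustration: llm_engine is consulted here but
            # never stored on self, mirroring the original
            # VLLMWorker.__init__.
            self.tokenizer = getattr(llm_engine, "tokenizer", None)

        def generate(self):
            # "engine" is looked up in the module's globals at call
            # time, not at definition time, so this only succeeds if
            # something else bound the name.
            return engine


    if __name__ == "__main__":
        engine = object()  # bound only under __main__, like line 290
        print(Worker(object()).generate())  # works here

    # In any other module:
    #     from this_module import Worker   # "this_module" is hypothetical
    #     Worker(object()).generate()      # NameError: name 'engine' is not defined

Storing the engine on the instance, as the patch does, makes the
dependency explicit in __init__ and keeps generate_stream usable from
any import path.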