Fix: requests are duplicated by the shared_prefix datagen when multi-turn chat is disabled (kubernetes-sigs#293)

huaxig · web-flow · commit c85e5a4647ee · 2025-11-25T15:46:22.000-08:00
### Background

Requests were duplicated when using the shared_prefix data generator with multi-turn chat disabled. This happened because the load generator created a standalone request queue for each worker and then broadcast each incoming request to every worker queue.

### Fix

The per-worker standalone queues are now requested only when multi-turn chat is enabled (`is_prefered_worker_requested()` returns the value of `enable_multi_turn_chat` instead of always returning `True`), which prevents requests from being duplicated.

### Credit

Thanks to @diamondburned for reporting this bug.
diff --git a/inference_perf/datagen/shared_prefix_datagen.py b/inference_perf/datagen/shared_prefix_datagen.py
@@ -60,7 +60,7 @@ def is_shared_prefix_supported(self) -> bool:
         return True
 
     def is_prefered_worker_requested(self) -> bool:
-        return True
+        return True if self.enable_multi_turn_chat else False
 
     def load_lazy_data(self, data: LazyLoadInferenceAPIData) -> InferenceAPIData:
         i = data.data_index % len(self.prompts)