Skip to content

Commit d0e17de

Browse files
committed
Disaggregation works
1 parent c019f20 commit d0e17de

File tree

5 files changed

+31
-17
lines changed

5 files changed

+31
-17
lines changed

shortfin/python/shortfin_apps/llm/cli.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
8282
default=1,
8383
help="Number of workers to use when running in `offline` mode.",
8484
)
85-
parser.add_argument(
86-
"--disaggregate",
87-
action="store_true",
88-
help="Disaggregate the prefill and decode invocations to separate HIP streams.",
89-
)
90-
85+
9186

9287
def parse_args(argv):
9388
parser = argparse.ArgumentParser()

shortfin/python/shortfin_apps/llm/components/batcher.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def __init__(
6969
self.scheduler = Scheduler(ideal_batch_size=self.ideal_batch_size)
7070
self.cache = DeviceArrayCache(fiber.device(0))
7171
self.program_isolation = program_isolation
72+
self.exec_fiber = exec_fiber
7273

7374
def handle_inference_request(self, request):
7475
"""Handle an inference request."""
@@ -120,7 +121,7 @@ async def board_flights(self):
120121
scheduled = []
121122
for job in to_schedule:
122123
scheduled = scheduled + job
123-
self.board(cache, self.fiber, job)
124+
self.board(cache, self.exec_fiber, job)
124125
logger.debug("Post boarding cache state: %r", cache)
125126

126127
pending = set(pending) - set(scheduled)

shortfin/python/shortfin_apps/llm/components/generate.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -88,23 +88,28 @@ def __init__(
8888
)
8989
self.streamed_tokens_index = 0
9090
self._status_tracker = status_tracker
91-
92-
async def run(self):
93-
exec_req = LlmInferenceExecRequest(
91+
self.exec_req = LlmInferenceExecRequest(
9492
phase=InferencePhase.PREFILL,
9593
input_token_ids=self.input_token_ids,
9694
rid=self.gen_req.rid,
9795
status_tracker=self._status_tracker,
9896
)
99-
exec_req._cache = self.client.prefill_batcher.page_cache
97+
98+
async def run(self):
99+
self.exec_req._cache = self.client.prefill_batcher.page_cache
100100
try:
101101
# Prefill result.
102-
await self.token_selector.prefill(exec_req)
102+
await self.token_selector.prefill(self.exec_req)
103103

104104
# Decode loop.
105-
await self.token_selector.decode(exec_req)
105+
await self.token_selector.decode(self.exec_req)
106106
finally:
107-
exec_req.free_cache_pages()
107+
self.exec_req.completed.set_success()
108+
self.exec_req.free_cache_pages()
109+
110+
async def await_completion(self):
111+
await self.exec_req.completed
112+
return self.index
108113

109114
def results_callback(self, result: int | list[list[int]]):
110115
if is_multi_response(self.decode_config):
@@ -225,6 +230,7 @@ async def run(self):
225230
else:
226231
input_batch = self.tokenize()
227232

233+
pending = []
228234
for index, input_tokens in enumerate(input_batch):
229235
decode_config = copy(self.decode_config)
230236
decode_config.update_from_sampling_params(
@@ -273,11 +279,17 @@ async def run(self):
273279
fiber=fiber,
274280
)
275281
gen_processes.append(gen_process)
282+
pending.append(asyncio.create_task(gen_process.await_completion()))
276283
gen_process.launch()
277284

278-
await asyncio.gather(*gen_processes)
279-
if not self.responder.is_disconnected():
280-
self.generate_response(gen_processes, streaming)
285+
while pending:
286+
done, pending = await asyncio.wait(
287+
pending, return_when=asyncio.FIRST_COMPLETED
288+
)
289+
for task in done:
290+
idx = await task
291+
if not self.responder.is_disconnected():
292+
self.generate_response([gen_processes[idx]], streaming)
281293
finally:
282294
# Remove request from queue when done
283295
self.service.remove_from_queue(self.decode_config.num_beams)

shortfin/python/shortfin_apps/llm/components/messages.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(
3737
self.input_token_ids = input_token_ids
3838
self.prompt_length = len(input_token_ids)
3939
self.done = sf.VoidFuture()
40+
self.completed = sf.VoidFuture()
4041
self.rid = rid
4142
# Unique `instance_id` for token selection strategies that may need
4243
# to differentiate between an original req and a copy of a req.

shortfin/python/shortfin_apps/llm/server.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,11 @@ def add_service_args(parser: argparse.ArgumentParser):
128128
default=1,
129129
help="Number of fibers to use per worker.",
130130
)
131+
parser.add_argument(
132+
"--disaggregate",
133+
action="store_true",
134+
help="Disaggregate the prefill and decode invocations to separate HIP streams.",
135+
)
131136

132137

133138
def parse_args(argv):

0 commit comments

Comments (0)