server upgrades

pythongiant · pythongiant · commit b098c0b49221 · 2026-05-06T21:27:30.000+05:30
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+opencode.json
 # Python
 __pycache__/
 *.py[cod]
@@ -39,4 +40,4 @@ docs/build/
 academic-research-skills/
 
 .copilot/
-KVBOOST_ALGORITHM.md
+KVBOOST_ALGORITHM.md
diff --git a/src/kvboost/server/__main__.py b/src/kvboost/server/__main__.py
@@ -45,7 +45,6 @@
 from __future__ import annotations
 
 import argparse
-import asyncio
 import logging
 import sys
 
@@ -138,11 +137,15 @@ def load_engine(args):
         )
     else:
         from ..engine import InferenceEngine
-        device = args.device
+        from ..compat import default_device
+        device = args.device or default_device()
+        # Load directly onto the target device. Avoid device_map="auto" because
+        # accelerate may offload modules to CPU/disk, after which InferenceEngine's
+        # subsequent model.to(device) call fails ("can't move offloaded modules").
         model = AutoModelForCausalLM.from_pretrained(
             args.model,
             torch_dtype=torch_dtype,
-            device_map=device or "auto",
+            device_map=device,
         )
         engine = InferenceEngine(
             model=model,
@@ -178,15 +181,13 @@ def main():
 
     engine = load_engine(args)
 
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-
     from .engine_worker import EngineWorker
     from .app import build_app
 
+    # Don't pre-create a loop here — uvicorn will create its own. EngineWorker
+    # captures the running loop in start() (the FastAPI startup hook).
     worker = EngineWorker(
         engine=engine,
-        loop=loop,
         max_workers=args.workers,
         batch_window_ms=args.batch_window_ms,
         max_batch_size=args.max_batch_size,
@@ -206,7 +207,6 @@ def main():
         app,
         host=args.host,
         port=args.port,
-        loop="none",  # use our own event loop
         log_level=args.log_level,
     )
 
diff --git a/src/kvboost/server/batch_queue.py b/src/kvboost/server/batch_queue.py
@@ -190,7 +190,7 @@ async def _collector_loop(self) -> None:
         while self._running:
             # Wait for at least one request
             try:
-                first = await asyncio.wait_for(self._queue.get(), timeout=1.0)
+                first = await asyncio.wait_for(self._queue.get(), timeout=10.0)
             except asyncio.TimeoutError:
                 continue
 
diff --git a/src/kvboost/server/engine_worker.py b/src/kvboost/server/engine_worker.py
@@ -49,24 +49,27 @@ class EngineWorker:
     Parameters
     ----------
     engine      : a fully initialised InferenceEngine (or subclass)
-    loop        : the asyncio event loop FastAPI is running on
     max_workers : thread-pool size (default 1 — model is not thread-safe)
     batch_window_ms  : collection window for the BatchQueue
     max_batch_size   : max requests per batch dispatch
     max_queue_size   : queue capacity before 503
+
+    The event loop is captured when ``start()`` is awaited and replaces
+    any ``loop`` passed to the constructor, so the worker always binds
+    to the loop FastAPI/uvicorn is actually running on.
     """
 
     def __init__(
         self,
         engine: InferenceEngine,
-        loop: asyncio.AbstractEventLoop,
+        loop: Optional[asyncio.AbstractEventLoop] = None,
         max_workers: int = 1,
         batch_window_ms: float = 20.0,
         max_batch_size: int = 8,
         max_queue_size: int = 256,
     ) -> None:
         self.engine = engine
-        self.loop = loop
+        self.loop = loop  # may be overridden in start() with the running loop
         self._executor = ThreadPoolExecutor(
             max_workers=max_workers,
             thread_name_prefix="kvboost-worker",
@@ -86,6 +89,10 @@ def __init__(
     # ── Lifecycle ─────────────────────────────────────────────────────────────
 
     async def start(self) -> None:
+        # Bind to the loop that is actually running (uvicorn creates and
+        # runs its own event loop). Doing this lazily avoids cross-loop
+        # Future errors if a stale loop was passed at construction time.
+        self.loop = asyncio.get_running_loop()
         await self.queue.start()
         log.info("EngineWorker started (model=%s)", self._model_name)