Skip to content

Commit b098c0b

Browse files
committed
server upgrades
1 parent 722eb3f commit b098c0b

4 files changed

Lines changed: 21 additions & 13 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
opencode.json
12
# Python
23
__pycache__/
34
*.py[cod]
@@ -39,4 +40,4 @@ docs/build/
3940
academic-research-skills/
4041

4142
.copilot/
42-
KVBOOST_ALGORITHM.md
43+
KVBOOST_ALGORITHM.md

src/kvboost/server/__main__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
from __future__ import annotations
4646

4747
import argparse
48-
import asyncio
4948
import logging
5049
import sys
5150

@@ -138,11 +137,15 @@ def load_engine(args):
138137
)
139138
else:
140139
from ..engine import InferenceEngine
141-
device = args.device
140+
from ..compat import default_device
141+
device = args.device or default_device()
142+
# Load directly onto the target device. Avoid device_map="auto" because
143+
# accelerate may offload modules to CPU/disk, after which InferenceEngine's
144+
# subsequent model.to(device) call fails ("can't move offloaded modules").
142145
model = AutoModelForCausalLM.from_pretrained(
143146
args.model,
144147
torch_dtype=torch_dtype,
145-
device_map=device or "auto",
148+
device_map=device,
146149
)
147150
engine = InferenceEngine(
148151
model=model,
@@ -178,15 +181,13 @@ def main():
178181

179182
engine = load_engine(args)
180183

181-
loop = asyncio.new_event_loop()
182-
asyncio.set_event_loop(loop)
183-
184184
from .engine_worker import EngineWorker
185185
from .app import build_app
186186

187+
# Don't pre-create a loop here — uvicorn will create its own. EngineWorker
188+
# captures the running loop in start() (the FastAPI startup hook).
187189
worker = EngineWorker(
188190
engine=engine,
189-
loop=loop,
190191
max_workers=args.workers,
191192
batch_window_ms=args.batch_window_ms,
192193
max_batch_size=args.max_batch_size,
@@ -206,7 +207,6 @@ def main():
206207
app,
207208
host=args.host,
208209
port=args.port,
209-
loop="none", # use our own event loop
210210
log_level=args.log_level,
211211
)
212212

src/kvboost/server/batch_queue.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ async def _collector_loop(self) -> None:
190190
while self._running:
191191
# Wait for at least one request
192192
try:
193-
first = await asyncio.wait_for(self._queue.get(), timeout=1.0)
193+
first = await asyncio.wait_for(self._queue.get(), timeout=10.0)
194194
except asyncio.TimeoutError:
195195
continue
196196

src/kvboost/server/engine_worker.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,24 +49,27 @@ class EngineWorker:
4949
Parameters
5050
----------
5151
engine : a fully initialised InferenceEngine (or subclass)
52-
loop : the asyncio event loop FastAPI is running on
5352
max_workers : thread-pool size (default 1 — model is not thread-safe)
5453
batch_window_ms : collection window for the BatchQueue
5554
max_batch_size : max requests per batch dispatch
5655
max_queue_size : queue capacity before 503
56+
57+
The event loop is captured automatically when ``start()`` is awaited,
58+
so the worker binds to whichever loop FastAPI/uvicorn is actually
59+
running on; any ``loop`` passed to the constructor is replaced then.
5760
"""
5861

5962
def __init__(
6063
self,
6164
engine: InferenceEngine,
62-
loop: asyncio.AbstractEventLoop,
65+
loop: Optional[asyncio.AbstractEventLoop] = None,
6366
max_workers: int = 1,
6467
batch_window_ms: float = 20.0,
6568
max_batch_size: int = 8,
6669
max_queue_size: int = 256,
6770
) -> None:
6871
self.engine = engine
69-
self.loop = loop
72+
self.loop = loop # may be overridden in start() with the running loop
7073
self._executor = ThreadPoolExecutor(
7174
max_workers=max_workers,
7275
thread_name_prefix="kvboost-worker",
@@ -86,6 +89,10 @@ def __init__(
8689
# ── Lifecycle ─────────────────────────────────────────────────────────────
8790

8891
async def start(self) -> None:
92+
# Bind to the loop this coroutine is actually running on (uvicorn
93+
# creates its own event loop when it starts). Doing this lazily avoids
94+
# cross-loop Future errors if a stale loop was passed at construction time.
95+
self.loop = asyncio.get_running_loop()
8996
await self.queue.start()
9097
log.info("EngineWorker started (model=%s)", self._model_name)
9198

0 commit comments

Comments
 (0)