feat: split api.py from ps.py

specture724 · specture724 · commit e48c44284f2e · 2025-12-11T08:42:56.000Z
diff --git a/checkpoint_engine/api.py b/checkpoint_engine/api.py
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import httpx
+from loguru import logger
+from pydantic import BaseModel
+
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from checkpoint_engine.ps import ParameterServer
+
+
+def request_inference_to_update(
+    url: str,
+    socket_paths: dict[str, str],
+    timeout: float = 300.0,
+    uds: str | None = None,
+) -> None:
+    """Send an inference update request to inference server via HTTP or Unix socket.
+
+    Args:
+        url (str): The HTTP URL or request path (e.g., "http://localhost:19730/inference") to send the request to.
+        socket_paths (dict[str, str]): A dictionary containing device uuid and IPC socket paths for updating weights.
+        timeout (float, optional): Request timeout in seconds. Applied to the HTTP call and
+            also forwarded in the JSON body. Defaults to 300.0.
+        uds (str, optional): Path to a Unix domain socket. If provided, the request
+            will be sent via the Unix socket instead of HTTP. Defaults to None.
+
+    Raises:
+        httpx.HTTPStatusError: If the response contains an HTTP error status.
+        httpx.RequestError: If there was an issue while making the request.
+    """
+    payload = {
+        "method": "update_weights_from_ipc",
+        "args": [socket_paths],
+        "timeout": timeout,
+    }
+    # Use the client as a context manager so its connection pool (and the Unix socket, if
+    # any) is closed even when the request or the status check raises.
+    with httpx.Client(transport=httpx.HTTPTransport(uds=uds)) as client:
+        resp = client.post(url, json=payload, timeout=timeout)
+        resp.raise_for_status()
+
+
+# The request models live at module level on purpose: this module postpones annotation
+# evaluation (``from __future__ import annotations``) and FastAPI resolves the resulting
+# string annotations of a route handler against the module globals only, so a model defined
+# inside ``_init_api`` could not be resolved when the routes are registered.
+class RegisterRequest(BaseModel):
+    # Passed as ``files=`` to ``ParameterServer.register_checkpoint``.
+    files: list[str]
+
+
+class UpdateRequest(BaseModel):
+    # Passed as ``ranks=`` to ``ParameterServer.update``.
+    ranks: list[int] = []
+    # Inference endpoint to notify; when None no request is sent to the inference server.
+    update_url: str | None = None
+    # Indices into the socket path list; when non-empty only those entries are forwarded.
+    inference_group_ranks: list[int] = []
+    # Timeout in seconds for the request to the inference server.
+    timeout: float = 300.0
+    # Optional Unix domain socket used to reach the inference server.
+    uds: str | None = None
+
+
+def _init_api(ps: ParameterServer) -> Any:
+    """Build the FastAPI application that exposes ``ps`` over HTTP.
+
+    Routes (all under ``/v1``): register/unregister a checkpoint, gather metas, update,
+    and a ``healthz`` probe. Handlers return an empty 200 response on success and a 500
+    response whose JSON body is the exception message when the wrapped call raises.
+
+    Args:
+        ps (ParameterServer): The parameter server the routes delegate to.
+
+    Returns:
+        Any: The configured ``fastapi.FastAPI`` instance.
+    """
+    # fastapi stays a function-local import, exactly as before the split.
+    import fastapi
+    from fastapi.responses import JSONResponse, Response
+
+    app = fastapi.FastAPI()
+
+    def wrap_exception(func: Callable[[], None]) -> Response:
+        # Run ``func`` and translate any exception into a logged HTTP 500.
+        try:
+            func()
+        except Exception as e:  # noqa: BLE001
+            logger.exception(f"wrap exception {func} failed")
+            return JSONResponse(content=str(e), status_code=500)
+        return Response(status_code=200)
+
+    # ``response_model=None`` on every route: the ``Response`` return annotation names a
+    # function-local import, so FastAPI must not try to evaluate it as a response model.
+    @app.post("/v1/checkpoints/{checkpoint_name}/files", response_model=None)
+    async def register_files(checkpoint_name: str, req: RegisterRequest) -> Response:
+        return wrap_exception(lambda: ps.register_checkpoint(checkpoint_name, files=req.files))
+
+    @app.delete("/v1/checkpoints/{checkpoint_name}", response_model=None)
+    async def unregister_checkpoint(checkpoint_name: str) -> Response:
+        return wrap_exception(lambda: ps.unregister_checkpoint(checkpoint_name))
+
+    @app.get("/v1/healthz", response_model=None)
+    async def healthz() -> Response:
+        return Response(status_code=200)
+
+    @app.post("/v1/checkpoints/{checkpoint_name}/gather-metas", response_model=None)
+    async def gather_metas(checkpoint_name: str) -> Response:
+        return wrap_exception(lambda: ps.gather_metas(checkpoint_name))
+
+    @app.post("/v1/checkpoints/{checkpoint_name}/update", response_model=None)
+    async def update(checkpoint_name: str, req: UpdateRequest) -> Response:
+        # Callback given to ps.update: notify the inference server, if one is configured.
+        def update_func(socket_paths: list[tuple[str, str]]):
+            if req.update_url is None:
+                return
+            if req.inference_group_ranks:
+                socket_paths = [socket_paths[i] for i in req.inference_group_ranks]
+            request_inference_to_update(
+                req.update_url, dict(socket_paths), timeout=req.timeout, uds=req.uds
+            )
+
+        return wrap_exception(lambda: ps.update(checkpoint_name, update_func, ranks=req.ranks))
+
+    return app
diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py
@@ -5,16 +5,15 @@
 from collections import defaultdict
 from collections.abc import Callable
 from datetime import timedelta
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
-import httpx
 import torch
 import torch.distributed as dist
 import zmq
 from loguru import logger
-from pydantic import BaseModel
 from torch.multiprocessing.reductions import reduce_tensor
 
+from checkpoint_engine.api import _init_api, request_inference_to_update  # noqa: F401
 from checkpoint_engine.data_types import (
     BucketRange,
     DataToGather,
@@ -59,37 +58,6 @@ def _get_physical_gpu_id(device_manager: DeviceManager, device_index: int | None
         raise ValueError(f"fail to get physical gpu id {device_index}") from e
 
 
-def request_inference_to_update(
-    url: str,
-    socket_paths: dict[str, str],
-    timeout: float = 300.0,
-    uds: str | None = None,
-):
-    """Send an inference update request to inference server via HTTP or Unix socket.
-
-    Args:
-        url (str): The HTTP URL or request path (e.g., "http://localhost:19730/inference") to send the request to.
-        socket_paths (dict[str, str]): A dictionary containing device uuid and IPC socket paths for updating weights.
-        timeout (float, optional): Request timeout in seconds. Defaults to 300.0.
-        uds (str, optional): Path to a Unix domain socket. If provided, the request
-            will be sent via the Unix socket instead of HTTP. Defaults to None.
-
-    Raises:
-        httpx.HTTPStatusError: If the response contains an HTTP error status.
-        httpx.RequestError: If there was an issue while making the request.
-    """
-    resp = httpx.Client(transport=httpx.HTTPTransport(uds=uds)).post(
-        url,
-        json={
-            "method": "update_weights_from_ipc",
-            "args": [socket_paths],
-            "timeout": timeout,
-        },
-        timeout=timeout,
-    )
-    resp.raise_for_status()
-
-
 def _gen_h2d_buckets(
     global_metas: dict[int, MemoryBufferMetaList],
     bucket_size: int,
@@ -856,63 +824,6 @@ def _update_per_bucket(
             self.device_manager.device_module.empty_cache()
 
 
-def _init_api(ps: ParameterServer) -> Any:
-    import fastapi
-    from fastapi import Request
-    from fastapi.responses import JSONResponse, Response
-
-    app = fastapi.FastAPI()
-
-    class RegisterRequest(BaseModel):
-        files: list[str]
-
-    class UpdateRequest(BaseModel):
-        ranks: list[int] = []
-        update_url: str | None = None
-        inference_group_ranks: list[int] = []
-        timeout: float = 300.0
-        uds: str | None = None
-
-    def wrap_exception(func: Callable[[], None]) -> Response:
-        try:
-            func()
-        except Exception as e:  # noqa: BLE001
-            logger.exception(f"wrap exception {func} failed")
-            return JSONResponse(content=str(e), status_code=500)
-        return Response(status_code=200)
-
-    @app.post("/v1/checkpoints/{checkpoint_name}/files")
-    async def register_files(checkpoint_name: str, req: RegisterRequest, raw: Request) -> Response:
-        return wrap_exception(lambda: ps.register_checkpoint(checkpoint_name, files=req.files))
-
-    @app.delete("/v1/checkpoints/{checkpoint_name}")
-    async def unregister_checkpoint(checkpoint_name: str) -> Response:
-        return wrap_exception(lambda: ps.unregister_checkpoint(checkpoint_name))
-
-    @app.get("/v1/healthz")
-    async def healthz() -> Response:
-        return Response(status_code=200)
-
-    @app.post("/v1/checkpoints/{checkpoint_name}/gather-metas")
-    async def gather_metas(checkpoint_name: str) -> Response:
-        return wrap_exception(lambda: ps.gather_metas(checkpoint_name))
-
-    @app.post("/v1/checkpoints/{checkpoint_name}/update")
-    async def update(checkpoint_name: str, req: UpdateRequest) -> Response:
-        def update_func(socket_paths: list[tuple[str, str]]):
-            if req.update_url is None:
-                return
-            if req.inference_group_ranks:
-                socket_paths = [socket_paths[i] for i in req.inference_group_ranks]
-            request_inference_to_update(
-                req.update_url, dict(socket_paths), timeout=req.timeout, uds=req.uds
-            )
-
-        return wrap_exception(lambda: ps.update(checkpoint_name, update_func, ranks=req.ranks))
-
-    return app
-
-
 @logger.catch(reraise=True)
 def run_from_cli():
     import uvicorn