refactor(BA-5528): split chat cache token, switch entry to BaseModel, fix module placement

jopemachine · jopemachine · commit de5be3d6cde9 · 2026-04-27T18:15:03.000+09:00
- DeploymentChatCacheEntry no longer carries api_key. Tokens now live
  in a separate top-level 'tokens' map in the same JSON cache file,
  surfaced via DeploymentChatCache.get_token / set_token / clear_token.
  remove() and chat-config clear/show treat a deployment's entry and
  token as one logical record.
- Convert DeploymentChatCacheEntry from a dataclass to a Pydantic
  BaseModel (frozen) so model_dump / model_validate replace the manual
  to_dict / from_dict helpers.
- Rename _resolve_endpoint to _ensure_endpoint_entry in chat.py — the
  function returns the full cache entry (endpoint + default_model +
  last_synced_at), not just the URL, so the previous name was misleading.
- Move the SDK chat client out of domains_v2/. It neither inherits from
  BaseDomainClient nor calls typed_request (it talks directly to the
  inference container, not the manager), so it does not belong with the
  REST domain clients. New location: client/v2/deployment_chat.py.
- Drop the InferenceChat* rename and keep DeploymentChat* to align with
  the './bai deployment chat' CLI command name.
diff --git a/src/ai/backend/client/cli/v2/deployment/chat.py b/src/ai/backend/client/cli/v2/deployment/chat.py
@@ -70,9 +70,9 @@ def chat(
     import json
     import sys
 
-    from ai.backend.client.v2.domains_v2.inference_chat import (
-        InferenceChatAuthError,
-        InferenceChatClient,
+    from ai.backend.client.v2.deployment_chat import (
+        DeploymentChatAuthError,
+        DeploymentChatClient,
     )
 
     config = load_v2_config()
@@ -87,7 +87,7 @@ def chat(
     extra_body: dict[str, Any] = params
     entry = cache.get(deployment_id)
 
-    async def _resolve_endpoint() -> DeploymentChatCacheEntry:
+    async def _ensure_endpoint_entry() -> DeploymentChatCacheEntry:
         if entry is not None and entry.endpoint_url:
             return entry
         registry = await create_v2_registry(config)
@@ -103,7 +103,6 @@ async def _resolve_endpoint() -> DeploymentChatCacheEntry:
             )
         new_entry = DeploymentChatCacheEntry(
             endpoint_url=endpoint_url,
-            api_key=entry.api_key if entry is not None else None,
             default_model=entry.default_model if entry is not None else None,
             last_synced_at=datetime.now(UTC),
         )
@@ -114,8 +113,8 @@ async def _resolve_endpoint() -> DeploymentChatCacheEntry:
     async def _run() -> None:
         from ai.backend.client.exceptions import BackendAPIError
 
-        resolved = await _resolve_endpoint()
-        request_model = model or resolved.default_model
+        endpoint_entry = await _ensure_endpoint_entry()
+        request_model = model or endpoint_entry.default_model
         if request_model is None:
             raise click.ClickException(
                 f"No --model given and no default_model cached for deployment {deployment_id}.\n"
@@ -129,23 +128,18 @@ async def _run() -> None:
             "model": request_model,
             "messages": [{"role": "user", "content": content}],
         }
-        async with InferenceChatClient(
+        api_key = cache.get_token(deployment_id)
+        async with DeploymentChatClient(
             skip_ssl_verification=config.skip_ssl_verification,
         ) as client:
             try:
                 response = await client.chat_completion(
-                    resolved.endpoint_url,
-                    resolved.api_key,
+                    endpoint_entry.endpoint_url,
+                    api_key,
                     body,
                 )
-            except InferenceChatAuthError as e:
-                invalidated = DeploymentChatCacheEntry(
-                    endpoint_url=resolved.endpoint_url,
-                    api_key=None,
-                    default_model=resolved.default_model,
-                    last_synced_at=datetime.now(UTC),
-                )
-                cache.upsert(deployment_id, invalidated)
+            except DeploymentChatAuthError as e:
+                cache.clear_token(deployment_id)
                 save_chat_cache(cache)
                 raise click.ClickException(
                     f"The inference endpoint rejected the configured API key for "
diff --git a/src/ai/backend/client/cli/v2/deployment/chat_config.py b/src/ai/backend/client/cli/v2/deployment/chat_config.py
@@ -94,7 +94,7 @@ def set_(
     elif api_key is not None:
         resolved_key = api_key
     else:
-        resolved_key = existing.api_key if existing is not None else None
+        resolved_key = cache.get_token(deployment_id)
 
     async def _run() -> None:
         registry = await create_v2_registry(config)
@@ -123,11 +123,14 @@ async def _run() -> None:
             deployment_id,
             DeploymentChatCacheEntry(
                 endpoint_url=endpoint_url,
-                api_key=resolved_key,
                 default_model=served_model,
                 last_synced_at=datetime.now(UTC),
             ),
         )
+        if resolved_key is None:
+            cache.clear_token(deployment_id)
+        else:
+            cache.set_token(deployment_id, resolved_key)
         save_chat_cache(cache)
         click.echo(f"Updated chat cache entry for deployment {deployment_id}.")
         if served_model:
@@ -150,15 +153,15 @@ async def _discover_model(
     does not contain any model entries.
     """
     from ai.backend.client.exceptions import BackendAPIError, BackendClientError
-    from ai.backend.client.v2.domains_v2.inference_chat import (
-        InferenceChatAuthError,
-        InferenceChatClient,
+    from ai.backend.client.v2.deployment_chat import (
+        DeploymentChatAuthError,
+        DeploymentChatClient,
     )
 
-    async with InferenceChatClient(skip_ssl_verification=skip_ssl_verification) as client:
+    async with DeploymentChatClient(skip_ssl_verification=skip_ssl_verification) as client:
         try:
             payload = await client.list_models(endpoint_url, api_key)
-        except (InferenceChatAuthError, BackendAPIError, BackendClientError):
+        except (DeploymentChatAuthError, BackendAPIError, BackendClientError):
             return fallback
     data = payload.get("data") if isinstance(payload, dict) else None
     if not isinstance(data, list):
@@ -180,23 +183,25 @@ def show(deployment_id: UUID | None) -> None:
 
     if deployment_id is not None:
         entry = cache.get(deployment_id)
-        if entry is None:
+        token = cache.get_token(deployment_id)
+        if entry is None and token is None:
             raise click.ClickException(f"No chat cache entry for deployment {deployment_id}.")
-        _print_entry(deployment_id, entry)
+        _print_entry(deployment_id, entry, token)
         return
 
-    if not cache.entries:
+    dep_ids = set(cache.entries) | set(cache.tokens)
+    if not dep_ids:
         click.echo("No chat cache entries.")
         return
-    for dep_id, entry in cache.entries.items():
-        _print_entry(dep_id, entry)
+    for dep_id in dep_ids:
+        _print_entry(dep_id, cache.get(dep_id), cache.get_token(dep_id))
         click.echo("")
 
 
 @chat_config.command(name="clear")
 @click.argument("deployment_id", type=click.UUID)
 def clear(deployment_id: UUID) -> None:
-    """Remove the chat cache entry for a deployment."""
+    """Remove the chat cache entry and stored token for a deployment."""
     try:
         cache = load_chat_cache()
     except IncompatibleChatCacheError as e:
@@ -208,12 +213,16 @@ def clear(deployment_id: UUID) -> None:
         click.echo(f"No chat cache entry for deployment {deployment_id}.")
 
 
-def _print_entry(deployment_id: UUID, entry: DeploymentChatCacheEntry) -> None:
+def _print_entry(
+    deployment_id: UUID,
+    entry: DeploymentChatCacheEntry | None,
+    token: str | None,
+) -> None:
     click.echo(f"deployment_id : {deployment_id}")
-    click.echo(f"endpoint_url  : {entry.endpoint_url}")
-    click.echo(f"api_key       : {mask_token(entry.api_key)}")
-    click.echo(f"default_model : {entry.default_model or '-'}")
-    click.echo(f"last_synced_at: {entry.last_synced_at.isoformat()}")
+    click.echo(f"endpoint_url  : {entry.endpoint_url if entry else '-'}")
+    click.echo(f"api_key       : {mask_token(token)}")
+    click.echo(f"default_model : {(entry.default_model if entry else None) or '-'}")
+    click.echo(f"last_synced_at: {entry.last_synced_at.isoformat() if entry else '-'}")
 
 
 __all__ = ("chat_config",)
diff --git a/src/ai/backend/client/cli/v2/deployment_chat_cache.py b/src/ai/backend/client/cli/v2/deployment_chat_cache.py
@@ -1,11 +1,13 @@
 """Local cache for ``./bai deployment chat`` per-deployment settings.
 
-Stores ``endpoint_url`` (resolved from the manager) and the inference API
-key the user registered for each deployment so that follow-up ``chat``
-invocations do not need to re-query the manager nor re-prompt for the key.
-
-Persisted as a single JSON file at ``~/.backend.ai/deployment_chat.json``
-with ``0600`` file permissions because the API key is stored in plaintext.
+Persists the manager-resolved ``endpoint_url`` and the served model name
+discovered from the inference endpoint, plus a separate map of API keys
+the user registered through ``./bai deployment chat-config set``. The
+endpoint entry is auto-managed (refetched when missing); the token is
+user-supplied and never auto-discovered.
+
+Stored as a single JSON file at ``~/.backend.ai/deployment_chat.json``
+with ``0600`` permissions because the API keys are kept in plaintext.
 """
 
 from __future__ import annotations
@@ -15,56 +17,40 @@
 import stat
 import tempfile
 from dataclasses import dataclass, field
-from datetime import UTC, datetime
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 from uuid import UUID
 
+from pydantic import BaseModel, ConfigDict, ValidationError
+
 from ai.backend.client.cli.v2.helpers import CONFIG_DIR
 
 CHAT_CACHE_FILE = CONFIG_DIR / "deployment_chat.json"
 CHAT_CACHE_SCHEMA_VERSION = 1
 
 
-@dataclass(frozen=True)
-class DeploymentChatCacheEntry:
-    """One deployment's chat configuration."""
+class DeploymentChatCacheEntry(BaseModel):
+    """One deployment's auto-managed endpoint metadata."""
+
+    model_config = ConfigDict(frozen=True)
 
     endpoint_url: str
-    api_key: str | None
-    default_model: str | None
+    default_model: str | None = None
     last_synced_at: datetime
 
-    def to_dict(self) -> dict[str, Any]:
-        return {
-            "endpoint_url": self.endpoint_url,
-            "api_key": self.api_key,
-            "default_model": self.default_model,
-            "last_synced_at": self.last_synced_at.isoformat(),
-        }
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> DeploymentChatCacheEntry:
-        synced_raw = data.get("last_synced_at")
-        if isinstance(synced_raw, str):
-            synced = datetime.fromisoformat(synced_raw)
-        else:
-            synced = datetime.now(UTC)
-        return cls(
-            endpoint_url=str(data["endpoint_url"]),
-            api_key=(str(data["api_key"]) if data.get("api_key") is not None else None),
-            default_model=(
-                str(data["default_model"]) if data.get("default_model") is not None else None
-            ),
-            last_synced_at=synced,
-        )
-
 
 @dataclass
 class DeploymentChatCache:
-    """In-memory representation of the chat cache file."""
+    """In-memory representation of the chat cache file.
+
+    ``entries`` is the auto-managed endpoint cache; ``tokens`` is the
+    user-managed API-key store. They are kept in the same file under
+    distinct top-level keys.
+    """
 
     entries: dict[UUID, DeploymentChatCacheEntry] = field(default_factory=dict)
+    tokens: dict[UUID, str] = field(default_factory=dict)
 
     def get(self, deployment_id: UUID) -> DeploymentChatCacheEntry | None:
         return self.entries.get(deployment_id)
@@ -73,12 +59,26 @@ def upsert(self, deployment_id: UUID, entry: DeploymentChatCacheEntry) -> None:
         self.entries[deployment_id] = entry
 
     def remove(self, deployment_id: UUID) -> bool:
-        return self.entries.pop(deployment_id, None) is not None
+        had_entry = self.entries.pop(deployment_id, None) is not None
+        had_token = self.tokens.pop(deployment_id, None) is not None
+        return had_entry or had_token
+
+    def get_token(self, deployment_id: UUID) -> str | None:
+        return self.tokens.get(deployment_id)
+
+    def set_token(self, deployment_id: UUID, token: str) -> None:
+        self.tokens[deployment_id] = token
+
+    def clear_token(self, deployment_id: UUID) -> bool:
+        return self.tokens.pop(deployment_id, None) is not None
 
     def to_dict(self) -> dict[str, Any]:
         return {
             "schema_version": CHAT_CACHE_SCHEMA_VERSION,
-            "deployments": {str(dep_id): entry.to_dict() for dep_id, entry in self.entries.items()},
+            "deployments": {
+                str(dep_id): entry.model_dump(mode="json") for dep_id, entry in self.entries.items()
+            },
+            "tokens": {str(dep_id): token for dep_id, token in self.tokens.items()},
         }
 
 
@@ -109,8 +109,8 @@ def load_chat_cache(path: Path = CHAT_CACHE_FILE) -> DeploymentChatCache:
             f"deployment_chat.json schema version {schema} is newer than supported "
             f"{CHAT_CACHE_SCHEMA_VERSION}; please upgrade the client."
         )
-    deployments_raw = raw.get("deployments") or {}
     entries: dict[UUID, DeploymentChatCacheEntry] = {}
+    deployments_raw = raw.get("deployments") or {}
     if isinstance(deployments_raw, dict):
         for key, value in deployments_raw.items():
             try:
@@ -120,10 +120,20 @@ def load_chat_cache(path: Path = CHAT_CACHE_FILE) -> DeploymentChatCache:
             if not isinstance(value, dict):
                 continue
             try:
-                entries[dep_id] = DeploymentChatCacheEntry.from_dict(value)
-            except (KeyError, ValueError, TypeError):
+                entries[dep_id] = DeploymentChatCacheEntry.model_validate(value)
+            except ValidationError:
+                continue
+    tokens: dict[UUID, str] = {}
+    tokens_raw = raw.get("tokens") or {}
+    if isinstance(tokens_raw, dict):
+        for key, value in tokens_raw.items():
+            try:
+                dep_id = UUID(str(key))
+            except ValueError:
                 continue
-    return DeploymentChatCache(entries=entries)
+            if isinstance(value, str):
+                tokens[dep_id] = value
+    return DeploymentChatCache(entries=entries, tokens=tokens)
 
 
 def save_chat_cache(cache: DeploymentChatCache, path: Path = CHAT_CACHE_FILE) -> None:
diff --git a/src/ai/backend/client/v2/deployment_chat.py b/src/ai/backend/client/v2/deployment_chat.py
@@ -24,11 +24,11 @@
 DEFAULT_MODELS_PATH = "/v1/models"
 
 
-class InferenceChatAuthError(BackendAPIError):
+class DeploymentChatAuthError(BackendAPIError):
     """Raised when the inference endpoint rejects the configured API key."""
 
 
-class InferenceChatClient:
+class DeploymentChatClient:
     """Direct HTTP client for OpenAI-compatible inference endpoints."""
 
     _session: aiohttp.ClientSession
@@ -145,7 +145,7 @@ def _raise_for_status(resp: aiohttp.ClientResponse, payload: object) -> None:
             return
         data = payload if isinstance(payload, dict) else {"detail": payload}
         if resp.status in (401, 403):
-            raise InferenceChatAuthError(resp.status, resp.reason or "Unauthorized", data)
+            raise DeploymentChatAuthError(resp.status, resp.reason or "Unauthorized", data)
         raise BackendAPIError(resp.status, resp.reason or "HTTP error", data)
 
     @staticmethod
diff --git a/tests/unit/client/cli/test_deployment_chat_cache.py b/tests/unit/client/cli/test_deployment_chat_cache.py
diff --git a/tests/unit/client/v2/test_deployment_chat_client.py b/tests/unit/client/v2/test_deployment_chat_client.py