lablup
diff --git a/‎changes/11344.feature.md‎
Lines changed: 1 addition & 1 deletion b/‎changes/11344.feature.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/ai/backend/client/cli/v2/deployment/chat.py‎
Lines changed: 104 additions & 63 deletions b/‎src/ai/backend/client/cli/v2/deployment/chat.py‎
Lines changed: 104 additions & 63 deletions
@@ -1 +1 @@
-Add `./bai deployment chat` and `./bai deployment chat-config` v2 CLI commands for one-shot OpenAI-compatible chat with deployed vLLM models, including a local cache (`~/.backend.ai/deployment_chat.json`, `0600`) of per-deployment endpoint URLs and API keys.
+Add `./bai deployment chat` for one-shot OpenAI-compatible chat against deployed inference services.
@@ -3,10 +3,9 @@
 from __future__ import annotations
 
 import asyncio
-import sys
 from collections.abc import Awaitable, Callable
 from datetime import UTC, datetime
-from typing import TYPE_CHECKING, Any, NoReturn
+from typing import Any
 from uuid import UUID
 
 import click
@@ -18,15 +17,7 @@
     load_chat_cache,
     save_chat_cache,
 )
-from ai.backend.client.cli.v2.helpers import create_v2_registry, load_v2_config, print_result
-
-if TYPE_CHECKING:
-    from ai.backend.client.v2.v2_registry import V2ClientRegistry
-
-
-def _abort(message: str) -> NoReturn:
-    click.echo(message, err=True)
-    sys.exit(1)
+from ai.backend.client.cli.v2.helpers import create_v2_registry, load_v2_config
 
 
 def _run_async(coro_fn: Callable[[], Awaitable[None]]) -> None:
@@ -40,19 +31,7 @@ def _run_async(coro_fn: Callable[[], Awaitable[None]]) -> None:
         msg = data.get("msg", "") if isinstance(data, dict) else ""
         status = e.args[0] if e.args else "?"
         detail = title or msg or str(e)
-        click.echo(f"Error ({status}): {detail}", err=True)
-        sys.exit(1)
-
-
-async def _resolve_endpoint_url(registry: V2ClientRegistry, deployment_id: UUID) -> str:
-    deployment = await registry.deployment.get(deployment_id)
-    endpoint_url = deployment.network_access.endpoint_url
-    if not endpoint_url:
-        raise click.ClickException(
-            f"Deployment {deployment_id} has no endpoint_url yet "
-            "(it may still be provisioning). Wait until the deployment is ACTIVE."
-        )
-    return endpoint_url
+        raise click.ClickException(f"{status}: {detail}") from e
 
 
 @click.command(name="chat")
@@ -62,106 +41,168 @@ async def _resolve_endpoint_url(registry: V2ClientRegistry, deployment_id: UUID)
     "--model",
     default=None,
     type=str,
-    help="Model name to send (defaults to cached default_model or 'default').",
+    help="Model name to send (defaults to cached default_model).",
 )
 @click.option(
     "--temperature",
     default=None,
-    type=float,
+    type=click.FloatRange(min=0.0, max=2.0),
     help="Sampling temperature.",
 )
 @click.option(
-    "--max-tokens",
+    "--top-p",
+    default=None,
+    type=click.FloatRange(min=0.0, max=1.0),
+    help="Nucleus sampling probability mass.",
+)
+@click.option(
+    "--frequency-penalty",
+    default=None,
+    type=click.FloatRange(min=-2.0, max=2.0),
+    help="Penalty for token frequency.",
+)
+@click.option(
+    "--presence-penalty",
+    default=None,
+    type=click.FloatRange(min=-2.0, max=2.0),
+    help="Penalty for token presence.",
+)
+@click.option(
+    "--seed",
     default=None,
     type=int,
+    help="Random seed for deterministic sampling.",
+)
+@click.option(
+    "--stop",
+    multiple=True,
+    type=str,
+    help="Stop sequence (repeatable).",
+)
+@click.option(
+    "--max-tokens",
+    default=None,
+    type=click.IntRange(min=1),
     help="Maximum number of tokens to generate.",
 )
 def chat(
     deployment_id: UUID,
     content: str,
     model: str | None,
     temperature: float | None,
+    top_p: float | None,
+    frequency_penalty: float | None,
+    presence_penalty: float | None,
+    seed: int | None,
+    stop: tuple[str, ...],
     max_tokens: int | None,
 ) -> None:
-    """Send a one-shot chat completion request to a deployed vLLM model."""
-    from ai.backend.client.exceptions import BackendAPIError
-    from ai.backend.client.v2.chat_dto import ChatCompletionRequest, ChatMessage
-    from ai.backend.client.v2.domains_v2.deployment_chat import (
-        DeploymentChatAuthError,
-        DeploymentChatClient,
+    """Send a one-shot chat completion request to a deployed model."""
+    import json
+    import sys
+
+    from ai.backend.client.v2.domains_v2.inference_chat import (
+        InferenceChatAuthError,
+        InferenceChatClient,
     )
 
     config = load_v2_config()
 
     try:
         cache = load_chat_cache()
     except IncompatibleChatCacheError as e:
-        _abort(str(e))
+        raise click.ClickException(str(e)) from e
 
     entry = cache.get(deployment_id)
 
-    async def _ensure_endpoint() -> DeploymentChatCacheEntry:
+    async def _resolve_endpoint() -> DeploymentChatCacheEntry:
         if entry is not None and entry.endpoint_url:
             return entry
         registry = await create_v2_registry(config)
         try:
-            endpoint_url = await _resolve_endpoint_url(registry, deployment_id)
+            deployment = await registry.deployment.get(deployment_id)
         finally:
             await registry.close()
+        endpoint_url = deployment.network_access.endpoint_url
+        if not endpoint_url:
+            raise click.ClickException(
+                f"Deployment {deployment_id} has no endpoint_url yet "
+                "(it may still be provisioning). Wait until the deployment is READY."
+            )
         new_entry = DeploymentChatCacheEntry(
             endpoint_url=endpoint_url,
-            vllm_api_key=entry.vllm_api_key if entry is not None else None,
+            api_key=entry.api_key if entry is not None else None,
             default_model=entry.default_model if entry is not None else None,
             last_synced_at=datetime.now(UTC),
         )
         cache.upsert(deployment_id, new_entry)
         save_chat_cache(cache)
         return new_entry
 
+    def _build_request_body(model_name: str) -> dict[str, Any]:
+        body: dict[str, Any] = {
+            "model": model_name,
+            "messages": [{"role": "user", "content": content}],
+        }
+        if temperature is not None:
+            body["temperature"] = temperature
+        if top_p is not None:
+            body["top_p"] = top_p
+        if frequency_penalty is not None:
+            body["frequency_penalty"] = frequency_penalty
+        if presence_penalty is not None:
+            body["presence_penalty"] = presence_penalty
+        if seed is not None:
+            body["seed"] = seed
+        if stop:
+            body["stop"] = list(stop)
+        if max_tokens is not None:
+            body["max_tokens"] = max_tokens
+        return body
+
     async def _run() -> None:
-        resolved = await _ensure_endpoint()
-        if resolved.vllm_api_key is None:
-            _abort(
-                f"No vLLM API key registered for deployment {deployment_id}.\n"
-                "Register one with:\n"
-                f"  ./bai deployment chat-config set {deployment_id} --token <vllm_api_key>"
+        from ai.backend.client.exceptions import BackendAPIError
+
+        resolved = await _resolve_endpoint()
+        request_model = model or resolved.default_model
+        if request_model is None:
+            raise click.ClickException(
+                f"No --model given and no default_model cached for deployment {deployment_id}.\n"
+                "Set one with:\n"
+                f"  ./bai deployment chat-config set {deployment_id} --token <api_key>\n"
+                "(this auto-discovers the served model from the inference endpoint)."
             )
 
-        request_model = model or resolved.default_model or "default"
-        chat_request = ChatCompletionRequest(
-            model=request_model,
-            messages=[ChatMessage(role="user", content=content)],
-            temperature=temperature,
-            max_tokens=max_tokens,
-        )
-
-        async with DeploymentChatClient(
+        body = _build_request_body(request_model)
+        async with InferenceChatClient(
             skip_ssl_verification=config.skip_ssl_verification,
-        ) as chat_client:
+        ) as client:
             try:
-                response = await chat_client.chat_completion(
+                response = await client.chat_completion(
                     resolved.endpoint_url,
-                    resolved.vllm_api_key,
-                    chat_request,
+                    resolved.api_key,
+                    body,
                 )
-            except DeploymentChatAuthError:
+            except InferenceChatAuthError as e:
                 invalidated = DeploymentChatCacheEntry(
                     endpoint_url=resolved.endpoint_url,
-                    vllm_api_key=None,
+                    api_key=None,
                     default_model=resolved.default_model,
                     last_synced_at=datetime.now(UTC),
                 )
                 cache.upsert(deployment_id, invalidated)
                 save_chat_cache(cache)
-                _abort(
+                raise click.ClickException(
                     f"The inference endpoint rejected the configured API key for "
                     f"deployment {deployment_id}. The cached key has been cleared.\n"
                     "Register a new one with:\n"
-                    f"  ./bai deployment chat-config set {deployment_id} --token <vllm_api_key>"
-                )
+                    f"  ./bai deployment chat-config set {deployment_id} --token <api_key>"
+                ) from e
             except BackendAPIError as e:
-                _abort(f"Inference endpoint error ({e.status} {e.reason}): {e.data}")
-        print_result(response)
+                raise click.ClickException(
+                    f"Inference endpoint error ({e.status} {e.reason}): {e.data}"
+                ) from e
+        sys.stdout.write(json.dumps(response, indent=2, ensure_ascii=False, default=str) + "\n")
 
     _run_async(_run)
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-Add `./bai deployment chat` and `./bai deployment chat-config` v2 CLI commands for one-shot OpenAI-compatible chat with deployed vLLM models, including a local cache (`~/.backend.ai/deployment_chat.json`, `0600`) of per-deployment endpoint URLs and API keys.
	`1`	+Add `./bai deployment chat` for one-shot OpenAI-compatible chat against deployed inference services.