Skip to content

Commit 4350a10

Browse files
Your Name and claude committed
fix: model validation, retrieve endpoint, status/cache UX (v0.3.12)
- Return 404 for unknown model names on chat/completions/messages endpoints
- Add GET /v1/models/{model_id} per OpenAI spec
- Status endpoint: "stopped" → "idle" to avoid server-state confusion
- Cache stats: clear message for text-only models
- README: --model flag → <model> positional arg

Found via 30-round new-user simulation testing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6d80003 commit 4350a10

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ Vision, audio (STT/TTS), video understanding, and text embeddings — all throug
435435

436436
| Flag | Description | Default |
437437
|------|-------------|---------|
438-
| `--model` | HuggingFace model name or local path | *(required)* |
438+
| `<model>` | HuggingFace model name, local path, or alias (positional arg) | *(required)* |
439439
| `--host` | Host to bind to | `0.0.0.0` |
440440
| `--port` | Port to bind to | `8000` |
441441
| `--max-tokens` | Default max tokens for generation | `32768` |

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rapid-mlx"
7-
version = "0.3.11"
7+
version = "0.3.12"
88
description = "Rapid-MLX — AI inference for Apple Silicon. Drop-in OpenAI API, 2-4x faster than Ollama."
99
readme = "README.md"
1010
license = {text = "Apache-2.0"}

vllm_mlx/server.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -948,7 +948,7 @@ async def status():
948948
stats = _engine.get_stats()
949949

950950
return {
951-
"status": "running" if stats.get("running") else "stopped",
951+
"status": "generating" if stats.get("running") else "idle",
952952
"model": _model_name,
953953
"uptime_s": round(stats.get("uptime_seconds", 0), 1),
954954
"steps_executed": stats.get("steps_executed", 0),
@@ -985,7 +985,11 @@ async def cache_stats():
985985
"pil_image_cache": get_pil_cache_stats(),
986986
}
987987
except ImportError:
988-
return {"error": "Cache stats not available (mlx_vlm not loaded)"}
988+
return {
989+
"message": "Vision cache stats not available (text-only model loaded). "
990+
"Prompt cache is managed internally by the engine.",
991+
"model_type": "llm",
992+
}
989993

990994

991995
@app.delete("/v1/cache")
@@ -1019,6 +1023,14 @@ async def list_models() -> ModelsResponse:
10191023
return ModelsResponse(data=models)
10201024

10211025

1026+
@app.get("/v1/models/{model_id}", dependencies=[Depends(verify_api_key)])
1027+
async def retrieve_model(model_id: str) -> ModelInfo:
1028+
"""Retrieve a specific model by ID (OpenAI-compatible)."""
1029+
if model_id in (_model_name, _model_alias):
1030+
return ModelInfo(id=model_id)
1031+
raise HTTPException(status_code=404, detail=f"Model '{model_id}' not found")
1032+
1033+
10221034
# =============================================================================
10231035
# Embeddings Endpoint
10241036
# =============================================================================
@@ -1570,6 +1582,13 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
15701582
"""Create a text completion."""
15711583
engine = get_engine()
15721584

1585+
# Validate model name matches loaded model
1586+
if request.model and request.model not in (_model_name, _model_alias):
1587+
raise HTTPException(
1588+
status_code=404,
1589+
detail=f"Model '{request.model}' not found. Available: {', '.join(filter(None, [_model_alias, _model_name]))}",
1590+
)
1591+
15731592
# Handle single prompt or list of prompts
15741593
prompts = request.prompt if isinstance(request.prompt, list) else [request.prompt]
15751594

@@ -1701,6 +1720,13 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
17011720
detail="messages must not be empty",
17021721
)
17031722

1723+
# Validate model name matches loaded model
1724+
if request.model and request.model not in (_model_name, _model_alias):
1725+
raise HTTPException(
1726+
status_code=404,
1727+
detail=f"Model '{request.model}' not found. Available: {', '.join(filter(None, [_model_alias, _model_name]))}",
1728+
)
1729+
17041730
# Validate message roles
17051731
_valid_roles = {"system", "user", "assistant", "tool", "developer"}
17061732
for msg in request.messages:
@@ -2242,6 +2268,16 @@ async def create_anthropic_message(
22422268
)
22432269
logger.debug(f"[REQUEST] last user message preview: {last_user_preview!r}")
22442270

2271+
# Validate model name matches loaded model
2272+
if anthropic_request.model and anthropic_request.model not in (
2273+
_model_name,
2274+
_model_alias,
2275+
):
2276+
raise HTTPException(
2277+
status_code=404,
2278+
detail=f"Model '{anthropic_request.model}' not found. Available: {', '.join(filter(None, [_model_alias, _model_name]))}",
2279+
)
2280+
22452281
# Convert Anthropic request -> OpenAI request
22462282
openai_request = anthropic_to_openai(anthropic_request)
22472283

0 commit comments

Comments
 (0)