Skip to content

Commit 4350a10

Browse files
Your Name and claude committed
fix: model validation, retrieve endpoint, status/cache UX (v0.3.12)
- Return 404 for unknown model names on chat/completions/messages endpoints
- Add GET /v1/models/{model_id} per OpenAI spec
- Status endpoint: "stopped" → "idle" to avoid server-state confusion
- Cache stats: clear message for text-only models
- README: --model flag → <model> positional arg

Found via 30-round new-user simulation testing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6d80003 commit 4350a10

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ Vision, audio (STT/TTS), video understanding, and text embeddings — all throug
435435

436436
| Flag | Description | Default |
437437
|------|-------------|---------|
438-
| `--model` | HuggingFace model name or local path | *(required)* |
438+
| `<model>` | HuggingFace model name, local path, or alias (positional arg) | *(required)* |
439439
| `--host` | Host to bind to | `0.0.0.0` |
440440
| `--port` | Port to bind to | `8000` |
441441
| `--max-tokens` | Default max tokens for generation | `32768` |

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rapid-mlx"
7-
version = "0.3.11"
7+
version = "0.3.12"
88
description = "Rapid-MLX — AI inference for Apple Silicon. Drop-in OpenAI API, 2-4x faster than Ollama."
99
readme = "README.md"
1010
license = {text = "Apache-2.0"}

vllm_mlx/server.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -948,7 +948,7 @@ async def status():
948948
stats = _engine.get_stats()
949949

950950
return {
951-
"status": "running" if stats.get("running") else "stopped",
951+
"status": "generating" if stats.get("running") else "idle",
952952
"model": _model_name,
953953
"uptime_s": round(stats.get("uptime_seconds", 0), 1),
954954
"steps_executed": stats.get("steps_executed", 0),
@@ -985,7 +985,11 @@ async def cache_stats():
985985
"pil_image_cache": get_pil_cache_stats(),
986986
}
987987
except ImportError:
988-
return {"error": "Cache stats not available (mlx_vlm not loaded)"}
988+
return {
989+
"message": "Vision cache stats not available (text-only model loaded). "
990+
"Prompt cache is managed internally by the engine.",
991+
"model_type": "llm",
992+
}
989993

990994

991995
@app.delete("/v1/cache")
@@ -1019,6 +1023,14 @@ async def list_models() -> ModelsResponse:
10191023
return ModelsResponse(data=models)
10201024

10211025

1026+
@app.get("/v1/models/{model_id}", dependencies=[Depends(verify_api_key)])
1027+
async def retrieve_model(model_id: str) -> ModelInfo:
1028+
"""Retrieve a specific model by ID (OpenAI-compatible)."""
1029+
if model_id in (_model_name, _model_alias):
1030+
return ModelInfo(id=model_id)
1031+
raise HTTPException(status_code=404, detail=f"Model '{model_id}' not found")
1032+
1033+
10221034
# =============================================================================
10231035
# Embeddings Endpoint
10241036
# =============================================================================
@@ -1570,6 +1582,13 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
15701582
"""Create a text completion."""
15711583
engine = get_engine()
15721584

1585+
# Validate model name matches loaded model
1586+
if request.model and request.model not in (_model_name, _model_alias):
1587+
raise HTTPException(
1588+
status_code=404,
1589+
detail=f"Model '{request.model}' not found. Available: {', '.join(filter(None, [_model_alias, _model_name]))}",
1590+
)
1591+
15731592
# Handle single prompt or list of prompts
15741593
prompts = request.prompt if isinstance(request.prompt, list) else [request.prompt]
15751594

@@ -1701,6 +1720,13 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
17011720
detail="messages must not be empty",
17021721
)
17031722

1723+
# Validate model name matches loaded model
1724+
if request.model and request.model not in (_model_name, _model_alias):
1725+
raise HTTPException(
1726+
status_code=404,
1727+
detail=f"Model '{request.model}' not found. Available: {', '.join(filter(None, [_model_alias, _model_name]))}",
1728+
)
1729+
17041730
# Validate message roles
17051731
_valid_roles = {"system", "user", "assistant", "tool", "developer"}
17061732
for msg in request.messages:
@@ -2242,6 +2268,16 @@ async def create_anthropic_message(
22422268
)
22432269
logger.debug(f"[REQUEST] last user message preview: {last_user_preview!r}")
22442270

2271+
# Validate model name matches loaded model
2272+
if anthropic_request.model and anthropic_request.model not in (
2273+
_model_name,
2274+
_model_alias,
2275+
):
2276+
raise HTTPException(
2277+
status_code=404,
2278+
detail=f"Model '{anthropic_request.model}' not found. Available: {', '.join(filter(None, [_model_alias, _model_name]))}",
2279+
)
2280+
22452281
# Convert Anthropic request -> OpenAI request
22462282
openai_request = anthropic_to_openai(anthropic_request)
22472283

0 commit comments

Comments
 (0)