Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions docs/my-website/docs/providers/rapid_mlx.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Rapid-MLX

Rapid-MLX is an OpenAI-compatible inference server optimized for Apple Silicon (MLX). 2-4x faster than Ollama, with full tool calling, reasoning separation, and prompt caching.

| Property | Details |
|---|---|
| Description | Local LLM inference server for Apple Silicon. [Docs](https://github.com/raullenchai/Rapid-MLX) |
| Provider Route on LiteLLM | `rapid_mlx/` |
| Provider Doc | [Rapid-MLX ↗](https://github.com/raullenchai/Rapid-MLX) |
| Supported Endpoints | `/chat/completions` |

## Quick Start

### Install and start Rapid-MLX

```bash
brew tap raullenchai/rapid-mlx
brew install rapid-mlx
rapid-mlx serve qwen3.5-9b
```

Or install via pip:

```bash
pip install rapid-mlx
rapid-mlx serve qwen3.5-9b
```

## Usage - litellm.completion (calling OpenAI compatible endpoint)

<Tabs>

<TabItem value="sdk" label="SDK">

```python
import litellm

response = litellm.completion(
model="rapid_mlx/default",
messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```

</TabItem>

<TabItem value="proxy" label="PROXY">

1. Add to config.yaml

```yaml
model_list:
- model_name: my-model
litellm_params:
model: rapid_mlx/default
api_base: http://localhost:8000/v1
```

2. Start the proxy

```bash
$ litellm --config /path/to/config.yaml
```

3. Send a request

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "my-model",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```

</TabItem>

</Tabs>

## Environment Variables

| Variable | Description | Default |
|----------|-------------|---------|
| `RAPID_MLX_API_KEY` | API key (optional, Rapid-MLX does not require auth by default) | `not-needed` |
| `RAPID_MLX_API_BASE` | Server URL | `http://localhost:8000/v1` |

## Supported Models

Any MLX model served by Rapid-MLX works. Use the model name as loaded by the server. Common choices:

- `rapid_mlx/default` - Whatever model is currently loaded
- `rapid_mlx/qwen3.5-9b` - Best small model for general use
- `rapid_mlx/qwen3.5-35b` - Smart and fast
- `rapid_mlx/qwen3.5-122b` - Frontier-level MoE model

## Features

- **Streaming** - Full SSE streaming support
- **Tool calling** - 17 parser formats (Qwen, Hermes, MiniMax, GLM, etc.)
- **Reasoning separation** - Native support for thinking models (Qwen3, DeepSeek-R1)
- **Prompt caching** - KV cache reuse and DeltaNet state snapshots for fast TTFT
- **Multi-Token Prediction** - Speculative decoding for supported models
1 change: 1 addition & 0 deletions docs/my-website/sidebars.js
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,7 @@ const sidebars = {
"providers/predibase",
"providers/pydantic_ai_agent",
"providers/ragflow",
"providers/rapid_mlx",
"providers/recraft",
"providers/replicate",
{
Expand Down
5 changes: 5 additions & 0 deletions litellm/llms/openai_like/providers.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,5 +101,10 @@
"param_mappings": {
"max_completion_tokens": "max_tokens"
}
},
"rapid_mlx": {
"base_url": "http://localhost:8000/v1",
"api_key_env": "RAPID_MLX_API_KEY",
"default_api_key": "not-needed"
}
Comment on lines +105 to 109
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Missing api_base_envRAPID_MLX_API_BASE env var will never be read

The rapid_mlx entry is missing the "api_base_env" field. Without it, provider.api_base_env is None in dynamic_config.py, so the following branch is never reached:

# dynamic_config.py L68-69
if not resolved_base and provider.api_base_env:
    resolved_base = get_secret_str(provider.api_base_env)

This means the RAPID_MLX_API_BASE environment variable documented in the docs and verified by test_rapid_mlx_custom_api_base will be silently ignored — the provider always falls through to the hardcoded "http://localhost:8000/v1" default. Compare with the publicai entry directly above this block, which includes "api_base_env": "PUBLICAI_API_BASE".

Suggested change
"rapid_mlx": {
"base_url": "http://localhost:8000/v1",
"api_key_env": "RAPID_MLX_API_KEY",
"default_api_key": "not-needed"
}
"rapid_mlx": {
"base_url": "http://localhost:8000/v1",
"api_key_env": "RAPID_MLX_API_KEY",
"api_base_env": "RAPID_MLX_API_BASE",
"default_api_key": "not-needed"
}

}
1 change: 1 addition & 0 deletions litellm/types/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3259,6 +3259,7 @@ class LlmProviders(str, Enum):
LITELLM_AGENT = "litellm_agent"
CURSOR = "cursor"
BEDROCK_MANTLE = "bedrock_mantle"
RAPID_MLX = "rapid_mlx"


# Create a set of all provider values for quick lookup
Expand Down
18 changes: 18 additions & 0 deletions provider_endpoints_support.json
Original file line number Diff line number Diff line change
Expand Up @@ -2569,6 +2569,24 @@
"batches": false,
"rerank": false
}
},
"rapid_mlx": {
"display_name": "Rapid-MLX (`rapid_mlx`)",
"url": "https://docs.litellm.ai/docs/providers/rapid_mlx",
"endpoints": {
"chat_completions": true,
"messages": true,
"responses": true,
"embeddings": false,
"image_generations": false,
"audio_transcriptions": false,
"audio_speech": false,
"moderations": false,
"batches": false,
"rerank": false,
"a2a": true,
"interactions": true
}
Comment on lines +2586 to +2589
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Unsupported a2a and interactions capabilities enabled

"a2a": true and "interactions": true are set for rapid_mlx, but neither the Rapid-MLX documentation, the PR description, nor the rapid_mlx.md provider page mentions support for the A2A (Agent-to-Agent) protocol or the Interactions endpoint.

Contrast with the immediately preceding charity_engine entry (lines 2557–2571), which is also an OpenAI-compatible provider and omits both fields entirely. Unless Rapid-MLX explicitly implements these protocols, advertising them here will route requests to a server that doesn't support them, leading to runtime errors for users who rely on them.

These fields should be removed (or set to false) until A2A/Interactions support is confirmed and documented:

Suggested change
"rerank": false,
"a2a": true,
"interactions": true
}
"batches": false,
"rerank": false

}
},
"endpoints": {
Expand Down
Empty file.
70 changes: 70 additions & 0 deletions tests/test_litellm/llms/rapid_mlx/test_rapid_mlx_completion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from unittest.mock import patch

import litellm


def test_rapid_mlx_provider_routing():
    """Test that rapid_mlx/ prefix routes correctly as an OpenAI-compatible provider."""
    expected_provider = "rapid_mlx"
    expected_model = "default"
    request_messages = [{"role": "user", "content": "Hello!"}]

    with patch(
        "litellm.main.openai_chat_completions.completion"
    ) as mock_completion:
        mock_completion.return_value = {}

        _ = litellm.completion(
            model=f"{expected_provider}/{expected_model}",
            messages=request_messages,
            max_tokens=100,
        )

        mock_completion.assert_called_once()
        kwargs = mock_completion.call_args.kwargs
        # Provider prefix is stripped from the model and passed separately.
        assert kwargs.get("custom_llm_provider") == expected_provider
        assert kwargs.get("model") == expected_model
        assert kwargs.get("messages") == request_messages
        # Defaults from the openai_like provider registry.
        assert kwargs.get("api_base") == "http://localhost:8000/v1"
        assert kwargs.get("api_key") == "not-needed"


def test_rapid_mlx_custom_api_base():
    """Test that RAPID_MLX_API_BASE environment variable is respected."""
    custom_base = "http://192.168.1.100:8000/v1"

    with patch.dict("os.environ", {"RAPID_MLX_API_BASE": custom_base}):
        with patch(
            "litellm.main.openai_chat_completions.completion"
        ) as mock_completion:
            mock_completion.return_value = {}

            _ = litellm.completion(
                model="rapid_mlx/qwen3.5-9b",
                messages=[{"role": "user", "content": "test"}],
            )

            mock_completion.assert_called_once()
            # The env-configured base must override the registry default.
            assert mock_completion.call_args.kwargs.get("api_base") == custom_base


def test_rapid_mlx_custom_api_key():
    """Test that RAPID_MLX_API_KEY environment variable is respected."""
    secret = "my-secret-key"

    with patch.dict("os.environ", {"RAPID_MLX_API_KEY": secret}):
        with patch(
            "litellm.main.openai_chat_completions.completion"
        ) as mock_completion:
            mock_completion.return_value = {}

            _ = litellm.completion(
                model="rapid_mlx/default",
                messages=[{"role": "user", "content": "test"}],
            )

            mock_completion.assert_called_once()
            # The env-configured key must override the "not-needed" default.
            assert mock_completion.call_args.kwargs.get("api_key") == secret
Loading