feat(provider): add Rapid-MLX as named provider #24325
@@ -0,0 +1,111 @@

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Rapid-MLX

Rapid-MLX is an OpenAI-compatible inference server optimized for Apple Silicon (MLX). It is 2-4x faster than Ollama, with full tool calling, reasoning separation, and prompt caching.

| Property | Details |
|---|---|
| Description | Local LLM inference server for Apple Silicon. [Docs](https://github.com/raullenchai/Rapid-MLX) |
| Provider Route on LiteLLM | `rapid_mlx/` |
| Provider Doc | [Rapid-MLX ↗](https://github.com/raullenchai/Rapid-MLX) |
| Supported Endpoints | `/chat/completions` |
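A model string like `rapid_mlx/qwen3.5-9b` is split on the first `/`: the prefix selects the provider route and the remainder is forwarded as the model name. A minimal illustration of that convention (a sketch only, not LiteLLM's actual routing code):

```python
def split_provider_model(model: str) -> tuple[str, str]:
    """Split a LiteLLM-style model string into (provider, model_name)."""
    provider, _, model_name = model.partition("/")
    return provider, model_name

# e.g. split_provider_model("rapid_mlx/default") -> ("rapid_mlx", "default")
```
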
## Quick Start

### Install and start Rapid-MLX

```bash
brew tap raullenchai/rapid-mlx
brew install rapid-mlx
rapid-mlx serve qwen3.5-9b
```

Or install via pip:

```bash
pip install vllm-mlx
rapid-mlx serve qwen3.5-9b
```

## Usage - litellm.completion (calling an OpenAI-compatible endpoint)

<Tabs>

<TabItem value="sdk" label="SDK">

```python
import litellm

response = litellm.completion(
    model="rapid_mlx/default",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```

</TabItem>

<TabItem value="proxy" label="PROXY">

1. Add to config.yaml

```yaml
model_list:
  - model_name: my-model
    litellm_params:
      model: rapid_mlx/default
      api_base: http://localhost:8000/v1
```

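The same `model_list` can expose several Rapid-MLX models side by side; a sketch using the model names mentioned later in this doc (adjust to whatever your server has loaded):

```yaml
model_list:
  - model_name: qwen-small
    litellm_params:
      model: rapid_mlx/qwen3.5-9b
      api_base: http://localhost:8000/v1
  - model_name: qwen-large
    litellm_params:
      model: rapid_mlx/qwen3.5-35b
      api_base: http://localhost:8000/v1
```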
2. Start the proxy

```bash
litellm --config /path/to/config.yaml
```

3. Send a request

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
  --header 'Authorization: Bearer sk-1234' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "my-model",
    "messages": [
      {
        "role": "user",
        "content": "what llm are you"
      }
    ]
  }'
```

</TabItem>

</Tabs>

## Environment Variables

| Variable | Description | Default |
|----------|-------------|---------|
| `RAPID_MLX_API_KEY` | API key (optional; Rapid-MLX does not require auth by default) | `not-needed` |
| `RAPID_MLX_API_BASE` | Server URL | `http://localhost:8000/v1` |
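The lookup order is the usual one for provider settings: the environment variable wins, otherwise the documented default applies. Sketched in Python (a simplified illustration, not LiteLLM's internal code):

```python
import os


def rapid_mlx_settings() -> tuple[str, str]:
    """Resolve (api_base, api_key): env vars win, else documented defaults."""
    api_base = os.getenv("RAPID_MLX_API_BASE", "http://localhost:8000/v1")
    api_key = os.getenv("RAPID_MLX_API_KEY", "not-needed")
    return api_base, api_key
```
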
## Supported Models

Any MLX model served by Rapid-MLX works. Use the model name as loaded by the server. Common choices:

- `rapid_mlx/default` - Whatever model is currently loaded
- `rapid_mlx/qwen3.5-9b` - Best small model for general use
- `rapid_mlx/qwen3.5-35b` - Smart and fast
- `rapid_mlx/qwen3.5-122b` - Frontier-level MoE model

## Features

- **Streaming** - Full SSE streaming support
- **Tool calling** - 17 parser formats (Qwen, Hermes, MiniMax, GLM, etc.)
- **Reasoning separation** - Native support for thinking models (Qwen3, DeepSeek-R1)
- **Prompt caching** - KV cache reuse and DeltaNet state snapshots for fast TTFT
- **Multi-Token Prediction** - Speculative decoding for supported models
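Streamed responses arrive as OpenAI-style SSE chunks whose `delta.content` fragments the client concatenates. A minimal sketch of that assembly (the chunk dicts below are illustrative, not captured server output):

```python
# Illustrative chunks in the OpenAI streaming shape: each delta carries a fragment.
chunks = [
    {"choices": [{"delta": {"content": "Hel"}}]},
    {"choices": [{"delta": {"content": "lo!"}}]},
    {"choices": [{"delta": {}}]},  # final chunk often has an empty delta
]

# Concatenate the content fragments, skipping deltas without content.
text = "".join(
    chunk["choices"][0]["delta"].get("content", "") for chunk in chunks
)
print(text)  # Hello!
```

With `litellm.completion(model="rapid_mlx/default", messages=..., stream=True)` you iterate the response the same way, reading `chunk.choices[0].delta.content` from each chunk.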
@@ -2569,6 +2569,24 @@
| "batches": false, | ||||||||||||||
| "rerank": false | ||||||||||||||
| } | ||||||||||||||
| }, | ||||||||||||||
| "rapid_mlx": { | ||||||||||||||
| "display_name": "Rapid-MLX (`rapid_mlx`)", | ||||||||||||||
| "url": "https://docs.litellm.ai/docs/providers/rapid_mlx", | ||||||||||||||
| "endpoints": { | ||||||||||||||
| "chat_completions": true, | ||||||||||||||
| "messages": true, | ||||||||||||||
| "responses": true, | ||||||||||||||
| "embeddings": false, | ||||||||||||||
| "image_generations": false, | ||||||||||||||
| "audio_transcriptions": false, | ||||||||||||||
| "audio_speech": false, | ||||||||||||||
| "moderations": false, | ||||||||||||||
| "batches": false, | ||||||||||||||
| "rerank": false, | ||||||||||||||
| "a2a": true, | ||||||||||||||
| "interactions": true | ||||||||||||||
| } | ||||||||||||||
|
> **Review comment on lines +2586 to +2589 (Contributor):** Contrast with the immediately preceding entries: these fields should be removed (or set to `false`).
```json
    }
  },
  "endpoints": {
```
@@ -0,0 +1,70 @@
```python
from unittest.mock import patch

import litellm


def test_rapid_mlx_provider_routing():
    """Test that rapid_mlx/ prefix routes correctly as an OpenAI-compatible provider."""
    with patch(
        "litellm.main.openai_chat_completions.completion"
    ) as mock_completion:
        mock_completion.return_value = {}

        provider = "rapid_mlx"
        model_name = "default"
        model = f"{provider}/{model_name}"
        messages = [{"role": "user", "content": "Hello!"}]

        _ = litellm.completion(
            model=model,
            messages=messages,
            max_tokens=100,
        )

        mock_completion.assert_called_once()
        _, call_kwargs = mock_completion.call_args
        assert call_kwargs.get("custom_llm_provider") == provider
        assert call_kwargs.get("model") == model_name
        assert call_kwargs.get("messages") == messages
        assert call_kwargs.get("api_base") == "http://localhost:8000/v1"
        assert call_kwargs.get("api_key") == "not-needed"


def test_rapid_mlx_custom_api_base():
    """Test that RAPID_MLX_API_BASE environment variable is respected."""
    with patch(
        "litellm.main.openai_chat_completions.completion"
    ) as mock_completion, patch.dict(
        "os.environ",
        {"RAPID_MLX_API_BASE": "http://192.168.1.100:8000/v1"},
    ):
        mock_completion.return_value = {}

        _ = litellm.completion(
            model="rapid_mlx/qwen3.5-9b",
            messages=[{"role": "user", "content": "test"}],
        )

        mock_completion.assert_called_once()
        _, call_kwargs = mock_completion.call_args
        assert call_kwargs.get("api_base") == "http://192.168.1.100:8000/v1"


def test_rapid_mlx_custom_api_key():
    """Test that RAPID_MLX_API_KEY environment variable is respected."""
    with patch(
        "litellm.main.openai_chat_completions.completion"
    ) as mock_completion, patch.dict(
        "os.environ",
        {"RAPID_MLX_API_KEY": "my-secret-key"},
    ):
        mock_completion.return_value = {}

        _ = litellm.completion(
            model="rapid_mlx/default",
            messages=[{"role": "user", "content": "test"}],
        )

        mock_completion.assert_called_once()
        _, call_kwargs = mock_completion.call_args
        assert call_kwargs.get("api_key") == "my-secret-key"
```
> **Missing `api_base_env`: the `RAPID_MLX_API_BASE` env var will never be read.** The `rapid_mlx` entry is missing the `"api_base_env"` field. Without it, `provider.api_base_env` is `None` in `dynamic_config.py`, so the branch that reads the env var is never reached. This means the `RAPID_MLX_API_BASE` environment variable documented in the docs and verified by `test_rapid_mlx_custom_api_base` will be silently ignored — the provider always falls through to the hardcoded `"http://localhost:8000/v1"` default. Compare with the `publicai` entry directly above this block, which includes `"api_base_env": "PUBLICAI_API_BASE"`.
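A plausible fix, mirroring the `publicai` pattern the reviewer cites (a sketch only, with the endpoint fields abbreviated; not the merged change):

```json
"rapid_mlx": {
  "display_name": "Rapid-MLX (`rapid_mlx`)",
  "url": "https://docs.litellm.ai/docs/providers/rapid_mlx",
  "api_base_env": "RAPID_MLX_API_BASE",
  "endpoints": {
    "chat_completions": true
  }
}
```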