diff --git a/docs/my-website/docs/providers/rapid_mlx.md b/docs/my-website/docs/providers/rapid_mlx.md new file mode 100644 index 00000000000..469de7d9bcb --- /dev/null +++ b/docs/my-website/docs/providers/rapid_mlx.md @@ -0,0 +1,111 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Rapid-MLX + +Rapid-MLX is an OpenAI-compatible inference server optimized for Apple Silicon (MLX). 2-4x faster than Ollama, with full tool calling, reasoning separation, and prompt caching. + +| Property | Details | +|---|---| +| Description | Local LLM inference server for Apple Silicon. [Docs](https://github.com/raullenchai/Rapid-MLX) | +| Provider Route on LiteLLM | `rapid_mlx/` | +| Provider Doc | [Rapid-MLX ↗](https://github.com/raullenchai/Rapid-MLX) | +| Supported Endpoints | `/chat/completions` | + +## Quick Start + +### Install and start Rapid-MLX + +```bash +brew tap raullenchai/rapid-mlx +brew install rapid-mlx +rapid-mlx serve qwen3.5-9b +``` + +Or install via pip: + +```bash +pip install rapid-mlx +rapid-mlx serve qwen3.5-9b +``` + +## Usage - litellm.completion (calling OpenAI compatible endpoint) + + + + + +```python +import litellm + +response = litellm.completion( + model="rapid_mlx/default", + messages=[{"role": "user", "content": "Hello!"}], +) +print(response.choices[0].message.content) +``` + + + + + +1. Add to config.yaml + +```yaml +model_list: + - model_name: my-model + litellm_params: + model: rapid_mlx/default + api_base: http://localhost:8000/v1 +``` + +2. Start the proxy + +```bash +$ litellm --config /path/to/config.yaml +``` + +3. 
Send a request + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "my-model", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] +}' +``` + + + + + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `RAPID_MLX_API_KEY` | API key (optional, Rapid-MLX does not require auth by default) | `not-needed` | +| `RAPID_MLX_API_BASE` | Server URL | `http://localhost:8000/v1` | + +## Supported Models + +Any MLX model served by Rapid-MLX works. Use the model name as loaded by the server. Common choices: + +- `rapid_mlx/default` - Whatever model is currently loaded +- `rapid_mlx/qwen3.5-9b` - Best small model for general use +- `rapid_mlx/qwen3.5-35b` - Smart and fast +- `rapid_mlx/qwen3.5-122b` - Frontier-level MoE model + +## Features + +- **Streaming** - Full SSE streaming support +- **Tool calling** - 17 parser formats (Qwen, Hermes, MiniMax, GLM, etc.) 
+- **Reasoning separation** - Native support for thinking models (Qwen3, DeepSeek-R1) +- **Prompt caching** - KV cache reuse and DeltaNet state snapshots for fast TTFT +- **Multi-Token Prediction** - Speculative decoding for supported models diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 4c0471fb8f4..288b712c7ac 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -971,6 +971,7 @@ const sidebars = { "providers/predibase", "providers/pydantic_ai_agent", "providers/ragflow", + "providers/rapid_mlx", "providers/recraft", "providers/replicate", { diff --git a/litellm/llms/openai_like/providers.json b/litellm/llms/openai_like/providers.json index 275c352b39e..2700361a595 100644 --- a/litellm/llms/openai_like/providers.json +++ b/litellm/llms/openai_like/providers.json @@ -101,5 +101,11 @@ "param_mappings": { "max_completion_tokens": "max_tokens" } + }, + "rapid_mlx": { + "base_url": "http://localhost:8000/v1", + "api_key_env": "RAPID_MLX_API_KEY", + "default_api_key": "not-needed", + "api_base_env": "RAPID_MLX_API_BASE" } } diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 38425c7ac4a..76b4561e0e3 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -3259,6 +3259,7 @@ class LlmProviders(str, Enum): LITELLM_AGENT = "litellm_agent" CURSOR = "cursor" BEDROCK_MANTLE = "bedrock_mantle" + RAPID_MLX = "rapid_mlx" # Create a set of all provider values for quick lookup diff --git a/provider_endpoints_support.json b/provider_endpoints_support.json index 2f3302bb574..17ef4404883 100644 --- a/provider_endpoints_support.json +++ b/provider_endpoints_support.json @@ -471,9 +471,7 @@ "audio_speech": false, "moderations": false, "batches": false, - "rerank": false, - "a2a": false, - "interactions": false + "rerank": false } }, "chutes": { @@ -2554,9 +2552,9 @@ "rerank": false } }, - "charity_engine": { - "display_name": "Charity Engine (`charity_engine`)", - "url": 
"https://docs.litellm.ai/docs/providers/charity_engine", + "rapid_mlx": { + "display_name": "Rapid-MLX (`rapid_mlx`)", + "url": "https://docs.litellm.ai/docs/providers/rapid_mlx", "endpoints": { "chat_completions": true, "messages": true, diff --git a/tests/test_litellm/llms/rapid_mlx/__init__.py b/tests/test_litellm/llms/rapid_mlx/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/test_litellm/llms/rapid_mlx/test_rapid_mlx_completion.py b/tests/test_litellm/llms/rapid_mlx/test_rapid_mlx_completion.py new file mode 100644 index 00000000000..9044a0e36cf --- /dev/null +++ b/tests/test_litellm/llms/rapid_mlx/test_rapid_mlx_completion.py @@ -0,0 +1,70 @@ +from unittest.mock import patch + +import litellm + + +def test_rapid_mlx_provider_routing(): + """Test that rapid_mlx/ prefix routes correctly as an OpenAI-compatible provider.""" + with patch( + "litellm.main.openai_chat_completions.completion" + ) as mock_completion: + mock_completion.return_value = {} + + provider = "rapid_mlx" + model_name = "default" + model = f"{provider}/{model_name}" + messages = [{"role": "user", "content": "Hello!"}] + + _ = litellm.completion( + model=model, + messages=messages, + max_tokens=100, + ) + + mock_completion.assert_called_once() + _, call_kwargs = mock_completion.call_args + assert call_kwargs.get("custom_llm_provider") == provider + assert call_kwargs.get("model") == model_name + assert call_kwargs.get("messages") == messages + assert call_kwargs.get("api_base") == "http://localhost:8000/v1" + assert call_kwargs.get("api_key") == "not-needed" + + +def test_rapid_mlx_custom_api_base(): + """Test that RAPID_MLX_API_BASE environment variable is respected.""" + with patch( + "litellm.main.openai_chat_completions.completion" + ) as mock_completion, patch.dict( + "os.environ", + {"RAPID_MLX_API_BASE": "http://192.168.1.100:8000/v1"}, + ): + mock_completion.return_value = {} + + _ = litellm.completion( + model="rapid_mlx/qwen3.5-9b", + 
messages=[{"role": "user", "content": "test"}], + ) + + mock_completion.assert_called_once() + _, call_kwargs = mock_completion.call_args + assert call_kwargs.get("api_base") == "http://192.168.1.100:8000/v1" + + +def test_rapid_mlx_custom_api_key(): + """Test that RAPID_MLX_API_KEY environment variable is respected.""" + with patch( + "litellm.main.openai_chat_completions.completion" + ) as mock_completion, patch.dict( + "os.environ", + {"RAPID_MLX_API_KEY": "my-secret-key"}, + ): + mock_completion.return_value = {} + + _ = litellm.completion( + model="rapid_mlx/default", + messages=[{"role": "user", "content": "test"}], + ) + + mock_completion.assert_called_once() + _, call_kwargs = mock_completion.call_args + assert call_kwargs.get("api_key") == "my-secret-key"