diff --git a/README_CN.md b/README_CN.md index aa3da9f89cc..b5511868e04 100644 --- a/README_CN.md +++ b/README_CN.md @@ -86,6 +86,7 @@ FastDeploy 支持在**英伟达(NVIDIA)GPU**、**昆仑芯(Kunlunxin)XPU - [前缀缓存](./docs/zh/features/prefix_caching.md) - [分块预填充](./docs/zh/features/chunked_prefill.md) - [负载均衡调度Router](./docs/zh/online_serving/router.md) +- [全局Cache池化](./docs/zh/features/global_cache_pooling.md) ## 致谢 diff --git a/README_EN.md b/README_EN.md index 927d6f039b3..4d918455d5f 100644 --- a/README_EN.md +++ b/README_EN.md @@ -84,6 +84,7 @@ Learn how to download models, enable using the torch format, and more: - [Prefix Caching](./docs/features/prefix_caching.md) - [Chunked Prefill](./docs/features/chunked_prefill.md) - [Load-Balancing Scheduling Router](./docs/online_serving/router.md) +- [Global Cache Pooling](./docs/features/global_cache_pooling.md) ## Acknowledgement diff --git a/docs/features/global_cache_pooling.md b/docs/features/global_cache_pooling.md new file mode 100644 index 00000000000..02e66b345a3 --- /dev/null +++ b/docs/features/global_cache_pooling.md @@ -0,0 +1,368 @@ +[中文文档](../../zh/features/global_cache_pooling.md) | English + +# Global Cache Pooling + +This document describes how to use MooncakeStore as the KV Cache storage backend for FastDeploy, enabling **Global Cache Pooling** across multiple inference instances. + +## Overview + +### What is Global Cache Pooling? + +Global Cache Pooling allows multiple FastDeploy instances to share KV Cache through a distributed storage layer. This enables: + +- **Cross-instance cache reuse**: KV Cache computed by one instance can be reused by another +- **PD Disaggregation optimization**: Prefill and Decode instances can share cache seamlessly +- **Reduced computation**: Avoid redundant prefix computation across requests + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Mooncake Master Server │ +│ (Metadata & Coordination Service) │ +└────────────────────────────┬────────────────────────────────────┘ + │ + ┌───────────────────┼───────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ FastDeploy │ │ FastDeploy │ │ FastDeploy │ +│ Instance P │ │ Instance D │ │ Instance X │ +│ (Prefill) │ │ (Decode) │ │ (Standalone) │ +└────────┬────────┘ └────────┬────────┘ └────────┬────────┘ + │ │ │ + └───────────────────┼───────────────────┘ + │ + ┌────────▼────────┐ + │ MooncakeStore │ + │ (Shared KV │ + │ Cache Pool) │ + └─────────────────┘ +``` + +## Example Scripts + +Ready-to-use example scripts are available in [examples/cache_storage/](../../../examples/cache_storage/). + +| Script | Scenario | Description | +|--------|----------|-------------| +| `run.sh` | Multi-Instance | Two standalone instances sharing cache | +| `run_03b_pd_storage.sh` | PD Disaggregation | P+D instances with global cache pooling | + +## Prerequisites + +### Hardware Requirements + +- NVIDIA GPU with CUDA support +- RDMA network (recommended for production) or TCP + +### Software Requirements + +- Python 3.8+ +- CUDA 11.8+ +- FastDeploy (see installation below) + +## Installation + +Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) for FastDeploy installation. + +```bash +# Option 1: Install from PyPI +pip install fastdeploy-gpu + +# Option 2: Build from source +bash build.sh +pip install ./dist/fastdeploy*.whl +``` + +MooncakeStore is automatically installed when you install FastDeploy. + +## Configuration + +We support two ways to configure MooncakeStore: via configuration file `mooncake_config.json` or via environment variables. + +### Mooncake Configuration File + +Create a `mooncake_config.json` file: + +```json +{ + "metadata_server": "http://0.0.0.0:15002/metadata", + "master_server_addr": "0.0.0.0:15001", + "global_segment_size": 1000000000, + "local_buffer_size": 134217728, + "protocol": "rdma", + "rdma_devices": "" +} +``` + +Set the `MOONCAKE_CONFIG_PATH` environment variable to enable the configuration: + +```bash +export MOONCAKE_CONFIG_PATH=path/to/mooncake_config.json +``` + +Configuration parameters: + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `metadata_server` | HTTP metadata server URL | Required | +| `master_server_addr` | Master server address | Required | +| `global_segment_size` | Memory space each TP process shares to global shared memory (bytes) | 1GB | +| `local_buffer_size` | Local buffer size for data transfer (bytes) | 128MB | +| `protocol` | Transfer protocol: `rdma` or `tcp` | `rdma` | +| `rdma_devices` | RDMA device names (comma-separated) | Auto-detect | + +### Environment Variables + +Mooncake can also be configured via environment variables: + +| Variable | Description | +|----------|-------------| +| `MOONCAKE_MASTER_SERVER_ADDR` | Master server address (e.g., `10.0.0.1:15001`) | +| `MOONCAKE_METADATA_SERVER` | Metadata server URL | +| `MOONCAKE_GLOBAL_SEGMENT_SIZE` | Memory space each TP process shares to global shared memory (bytes) | +| `MOONCAKE_LOCAL_BUFFER_SIZE` | Local buffer size (bytes) | +| `MOONCAKE_PROTOCOL` | Transfer protocol (`rdma` or `tcp`) | +| `MOONCAKE_RDMA_DEVICES` | RDMA device names | + +## Usage Scenarios + +### Scenario 1: Multi-Instance Cache Sharing + +Run multiple FastDeploy instances sharing a global KV Cache pool. + +**Step 1: Start Mooncake Master** + +```bash +mooncake_master \ + --port=15001 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=15002 \ + --metrics_port=15003 +``` + +**Step 2: Start FastDeploy Instances** + +Instance 0: +```bash +export MOONCAKE_CONFIG_PATH="./mooncake_config.json" +export CUDA_VISIBLE_DEVICES=0 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52700 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --kvcache-storage-backend mooncake +``` + +Instance 1: +```bash +export MOONCAKE_CONFIG_PATH="./mooncake_config.json" +export CUDA_VISIBLE_DEVICES=1 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52800 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --kvcache-storage-backend mooncake +``` + +**Step 3: Test Cache Reuse** + +Send the same prompt to both instances. The second instance should reuse the KV Cache computed by the first instance. + +```bash +# Request to Instance 0 +curl -X POST "http://0.0.0.0:52700/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Hello, world!"}], "max_tokens": 50}' + +# Request to Instance 1 (should hit cached KV) +curl -X POST "http://0.0.0.0:52800/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Hello, world!"}], "max_tokens": 50}' +``` + +### Scenario 2: PD Disaggregation with Global Cache + +This scenario combines **PD Disaggregation** with **Global Cache Pooling**, enabling: + +- Prefill instances to read Decode instances' output cache +- Optimal multi-turn conversation performance + +**Architecture:** + +``` + ┌──────────────────────────────────────────┐ + │ Router │ + │ (Load Balancer) │ + └─────────────────┬────────────────────────┘ + │ + ┌───────────────┴───────────────┐ + │ │ + ▼ ▼ + ┌─────────────┐ ┌─────────────┐ + │ Prefill │ │ Decode │ + │ Instance │◄───────────────►│ Instance │ + │ │ KV Transfer │ │ + └──────┬──────┘ └──────┬──────┘ + │ │ + └───────────────┬───────────────┘ + │ + ┌────────▼────────┐ + │ MooncakeStore │ + │ (Global Cache) │ + └─────────────────┘ +``` + +**Step 1: Start Mooncake Master** + +```bash +mooncake_master \ + --port=15001 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=15002 +``` + +**Step 2: Start Router** + +```bash +python -m fastdeploy.router.launch \ + --port 52700 \ + --splitwise +``` + +**Step 3: Start Prefill Instance** + +```bash +export MOONCAKE_MASTER_SERVER_ADDR="127.0.0.1:15001" +export MOONCAKE_METADATA_SERVER="http://127.0.0.1:15002/metadata" +export MOONCAKE_PROTOCOL="rdma" +export CUDA_VISIBLE_DEVICES=0 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52400 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --splitwise-role prefill \ + --cache-transfer-protocol rdma \ + --router "0.0.0.0:52700" \ + --kvcache-storage-backend mooncake +``` + +**Step 4: Start Decode Instance** + +```bash +export MOONCAKE_MASTER_SERVER_ADDR="127.0.0.1:15001" +export MOONCAKE_METADATA_SERVER="http://127.0.0.1:15002/metadata" +export MOONCAKE_PROTOCOL="rdma" +export CUDA_VISIBLE_DEVICES=1 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52500 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --splitwise-role decode \ + --cache-transfer-protocol rdma \ + --router "0.0.0.0:52700" \ + --enable-output-caching \ + --kvcache-storage-backend mooncake +``` + +**Step 5: Send Requests via Router** + +```bash +curl -X POST "http://0.0.0.0:52700/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 50}' +``` + +## FastDeploy Parameters for Mooncake + +| Parameter | Description | +|-----------|-------------| +| `--kvcache-storage-backend mooncake` | Enable Mooncake as KV Cache storage backend | +| `--enable-output-caching` | Enable output token caching (Decode instance recommended) | +| `--cache-transfer-protocol rdma` | Use RDMA for KV transfer between P and D | +| `--splitwise-role prefill/decode` | Set instance role in PD disaggregation | +| `--router` | Router address for PD disaggregation | + +## Verification + +### Check Cache Hit + +To verify cache hit in logs: + +```bash +# For multi-instance scenario +grep -E "storage_cache_token_num" log_*/api_server.log + +# For PD disaggregation scenario +grep -E "storage_cache_token_num" log_prefill/api_server.log +``` + +If `storage_cache_token_num > 0`, the instance successfully read cached KV blocks from the global pool. + +### Monitor Mooncake Master + +```bash +# Check master status +curl http://localhost:15002/metadata + +# Check metrics (if metrics_port is configured) +curl http://localhost:15003/metrics +``` + +## Troubleshooting + +### Common Issues + +**1. Port Already in Use** + +```bash +# Check port usage +ss -ltn | grep 15001 + +# Kill existing process +kill -9 $(lsof -t -i:15001) +``` + +**2. RDMA Connection Failed** + +- Verify RDMA devices: `ibv_devices` +- Check RDMA network: `ibv_devinfo` +- Fallback to TCP: Set `MOONCAKE_PROTOCOL=tcp` + +**3. Cache Not Being Shared** + +- Verify all instances connect to the same Mooncake master +- Check metadata server URL is consistent +- Verify `global_segment_size` is large enough + +**4. Permission Denied on /dev/shm** + +```bash +# Clean up stale shared memory files +find /dev/shm -type f -print0 | xargs -0 rm -f +``` + +### Debug Mode + +Enable debug logging: + +```bash +export FD_DEBUG=1 +``` + +## More Resources + +- [Mooncake Official Documentation](https://github.com/kvcache-ai/Mooncake) +- [Mooncake Troubleshooting Guide](https://github.com/kvcache-ai/Mooncake/blob/main/docs/source/troubleshooting/troubleshooting.md) +- [FastDeploy Documentation](https://paddlepaddle.github.io/FastDeploy/) diff --git a/docs/zh/features/global_cache_pooling.md b/docs/zh/features/global_cache_pooling.md new file mode 100644 index 00000000000..292e764ac80 --- /dev/null +++ b/docs/zh/features/global_cache_pooling.md @@ -0,0 +1,367 @@ +[English](../../features/global_cache_pooling.md) | 中文文档 + +# 全局缓存池化 + +本文档介绍如何将 MooncakeStore 作为 FastDeploy 的 KV Cache 存储后端,实现多推理实例间的**全局缓存池化**。 + +## 概述 + +### 什么是全局缓存池化? + +全局缓存池化允许多个 FastDeploy 实例通过分布式存储层共享 KV Cache,具有以下优势: + +- **跨实例缓存复用**:一个实例计算的 KV Cache 可被其他实例复用 +- **PD 分离架构优化**:Prefill 和 Decode 实例可无缝共享缓存 +- **减少重复计算**:避免跨请求的重复前缀计算 + +### 架构图 + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Mooncake Master 服务 │ +│ (元数据与协调服务) │ +└────────────────────────────┬────────────────────────────────────┘ + │ + ┌───────────────────┼───────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ FastDeploy │ │ FastDeploy │ │ FastDeploy │ +│ Instance P │ │ Instance D │ │ Instance X │ +│ (Prefill) │ │ (Decode) │ │ (Standalone) │ +└────────┬────────┘ └────────┬────────┘ └────────┬────────┘ + │ │ │ + └───────────────────┼───────────────────┘ + │ + ┌────────▼────────┐ + │ MooncakeStore │ + │ (共享 KV │ + │ Cache 池) │ + └─────────────────┘ +``` + +## 示例脚本 + +开箱即用的示例脚本位于 [examples/cache_storage/](../../../examples/cache_storage/)。 + +| 脚本 | 场景 | 说明 | +|------|------|------| +| `run.sh` | 多实例缓存共享 | 两个独立实例共享缓存 | +| `run_03b_pd_storage.sh` | PD 分离 | P+D 实例配合全局缓存池 | + +## 环境要求 + +### 硬件要求 + +- 支持 CUDA 的 NVIDIA GPU +- RDMA 网络(生产环境推荐)或 TCP 网络 + +### 软件要求 + +- Python 3.8+ +- CUDA 11.8+ +- FastDeploy(见下方安装说明) + +## 安装步骤 + +参考 [NVIDIA CUDA GPU 安装指南](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) 安装 FastDeploy。 + +```bash +# 方式一:从 PyPI 安装 +pip install fastdeploy-gpu + +# 方式二:从源码编译 +bash build.sh +pip install ./dist/fastdeploy*.whl +``` + +安装FastDeploy后自动安装了MooncakeStore。 + +## 配置说明 + +我们支持两种方式配置MooncakeStore,一是通过配置文件`mooncake_config.json`,二是通过环境变量进行配置。 + +### Mooncake 配置文件 + +创建 `mooncake_config.json` 配置文件: + +```json +{ + "metadata_server": "http://0.0.0.0:15002/metadata", + "master_server_addr": "0.0.0.0:15001", + "global_segment_size": 1000000000, + "local_buffer_size": 134217728, + "protocol": "rdma", + "rdma_devices": "" +} +``` + +设置MOONCAKE_CONFIG_PATH环境变量后,配置文件生效: +```bash +export MOONCAKE_CONFIG_PATH=path/to/mooncake_config.json +``` + +配置参数说明: + +| 参数 | 说明 | 默认值 | +|------|------|--------| +| `metadata_server` | HTTP 元数据服务地址 | 必填 | +| `master_server_addr` | Master 服务地址 | 必填 | +| `global_segment_size` | 每个TP进程给全局共享内存共享的内存空间(字节) | 1GB | +| `local_buffer_size` | 数据传输本地缓冲区大小(字节) | 128MB | +| `protocol` | 传输协议:`rdma` 或 `tcp` | `rdma` | +| `rdma_devices` | RDMA 设备名称(逗号分隔) | 自动检测 | + +### 环境变量配置 + +Mooncake 也支持通过环境变量进行配置: + +| 环境变量 | 说明 | +|----------|------| +| `MOONCAKE_MASTER_SERVER_ADDR` | Master 服务地址(如 `10.0.0.1:15001`) | +| `MOONCAKE_METADATA_SERVER` | 元数据服务 URL | +| `MOONCAKE_GLOBAL_SEGMENT_SIZE` | 每个TP进程给全局共享内存共享的内存空间(字节) | +| `MOONCAKE_LOCAL_BUFFER_SIZE` | 本地缓冲区大小(字节) | +| `MOONCAKE_PROTOCOL` | 传输协议(`rdma` 或 `tcp`) | +| `MOONCAKE_RDMA_DEVICES` | RDMA 设备名称 | + +## 使用场景 + +### 场景一:多实例缓存共享 + +运行多个 FastDeploy 实例,共享全局 KV Cache 池。 + +**步骤 1:启动 Mooncake Master** + +```bash +mooncake_master \ + --port=15001 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=15002 \ + --metrics_port=15003 +``` + +**步骤 2:启动 FastDeploy 实例** + +实例 0: +```bash +export MOONCAKE_CONFIG_PATH="./mooncake_config.json" +export CUDA_VISIBLE_DEVICES=0 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52700 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --kvcache-storage-backend mooncake +``` + +实例 1: +```bash +export MOONCAKE_CONFIG_PATH="./mooncake_config.json" +export CUDA_VISIBLE_DEVICES=1 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52800 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --kvcache-storage-backend mooncake +``` + +**步骤 3:测试缓存复用** + +向两个实例发送相同的 prompt,第二个实例应能复用第一个实例计算的 KV Cache。 + +```bash +# 请求实例 0 +curl -X POST "http://0.0.0.0:52700/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Hello, world!"}], "max_tokens": 50}' + +# 请求实例 1(应命中缓存) +curl -X POST "http://0.0.0.0:52800/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Hello, world!"}], "max_tokens": 50}' +``` + +### 场景二:PD 分离 + 全局缓存池 + +此场景将 **PD 分离架构** 与 **全局缓存池化** 结合,实现: + +- Prefill 实例可读取 Decode 实例的输出缓存 +- 优化多轮对话性能 + +**架构图:** + +``` + ┌──────────────────────────────────────────┐ + │ Router │ + │ (负载均衡器) │ + └─────────────────┬────────────────────────┘ + │ + ┌───────────────┴───────────────┐ + │ │ + ▼ ▼ + ┌─────────────┐ ┌─────────────┐ + │ Prefill │ │ Decode │ + │ Instance │◄───────────────►│ Instance │ + │ │ KV Transfer │ │ + └──────┬──────┘ └──────┬──────┘ + │ │ + └───────────────┬───────────────┘ + │ + ┌────────▼────────┐ + │ MooncakeStore │ + │ (全局缓存池) │ + └─────────────────┘ +``` + +**步骤 1:启动 Mooncake Master** + +```bash +mooncake_master \ + --port=15001 \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=15002 +``` + +**步骤 2:启动 Router** + +```bash +python -m fastdeploy.router.launch \ + --port 52700 \ + --splitwise +``` + +**步骤 3:启动 Prefill 实例** + +```bash +export MOONCAKE_MASTER_SERVER_ADDR="127.0.0.1:15001" +export MOONCAKE_METADATA_SERVER="http://127.0.0.1:15002/metadata" +export MOONCAKE_PROTOCOL="rdma" +export CUDA_VISIBLE_DEVICES=0 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52400 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --splitwise-role prefill \ + --cache-transfer-protocol rdma \ + --router "0.0.0.0:52700" \ + --kvcache-storage-backend mooncake +``` + +**步骤 4:启动 Decode 实例** + +```bash +export MOONCAKE_MASTER_SERVER_ADDR="127.0.0.1:15001" +export MOONCAKE_METADATA_SERVER="http://127.0.0.1:15002/metadata" +export MOONCAKE_PROTOCOL="rdma" +export CUDA_VISIBLE_DEVICES=1 + +python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port 52500 \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --splitwise-role decode \ + --cache-transfer-protocol rdma \ + --router "0.0.0.0:52700" \ + --enable-output-caching \ + --kvcache-storage-backend mooncake +``` + +**步骤 5:通过 Router 发送请求** + +```bash +curl -X POST "http://0.0.0.0:52700/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 50}' +``` + +## FastDeploy Mooncake 相关参数 + +| 参数 | 说明 | +|------|------| +| `--kvcache-storage-backend mooncake` | 启用 Mooncake 作为 KV Cache 存储后端 | +| `--enable-output-caching` | 启用输出 token 缓存(推荐 Decode 实例开启) | +| `--cache-transfer-protocol rdma` | P 和 D 之间使用 RDMA 进行 KV 传输 | +| `--splitwise-role prefill/decode` | 设置实例在 PD 分离中的角色 | +| `--router` | PD 分离场景下的 Router 地址 | + +## 验证方法 + +### 检查缓存命中 + +通过日志验证缓存命中情况: + +```bash +# 多实例场景 +grep -E "storage_cache_token_num" log_*/api_server.log + +# PD 分离场景 +grep -E "storage_cache_token_num" log_prefill/api_server.log +``` + +如果 `storage_cache_token_num > 0`,表示实例成功从全局池读取了缓存的 KV 块。 + +### 监控 Mooncake Master + +```bash +# 检查 master 状态 +curl http://localhost:15002/metadata + +# 检查指标(如配置了 metrics_port) +curl http://localhost:15003/metrics +``` + +## 故障排查 + +### 常见问题 + +**1. 端口被占用** + +```bash +# 检查端口使用情况 +ss -ltn | grep 15001 + +# 终止占用进程 +kill -9 $(lsof -t -i:15001) +``` + +**2. RDMA 连接失败** + +- 检查 RDMA 设备:`ibv_devices` +- 检查 RDMA 网络:`ibv_devinfo` +- 降级使用 TCP:设置 `MOONCAKE_PROTOCOL=tcp` + +**3. 缓存未共享** + +- 确认所有实例连接到同一个 Mooncake master +- 检查元数据服务 URL 是否一致 +- 确认 `global_segment_size` 足够大 + +**4. /dev/shm 权限不足** + +```bash +# 清理残留的共享内存文件 +find /dev/shm -type f -print0 | xargs -0 rm -f +``` + +### 调试模式 + +开启调试日志: + +```bash +export FD_DEBUG=1 +``` + +## 更多资源 + +- [Mooncake 官方文档](https://github.com/kvcache-ai/Mooncake) +- [Mooncake 故障排查指南](https://github.com/kvcache-ai/Mooncake/blob/main/docs/source/troubleshooting/troubleshooting.md) +- [FastDeploy 文档](https://paddlepaddle.github.io/FastDeploy/) diff --git a/examples/cache_storage/README.md b/examples/cache_storage/README.md index 5b3f7f9c99b..becefbf58e8 100644 --- a/examples/cache_storage/README.md +++ b/examples/cache_storage/README.md @@ -1,57 +1,31 @@ -# MooncakeStore for FastDeploy +# Global Cache Pooling Examples -This document describes how to use MooncakeStore as the backend of FastDeploy. +This directory contains example scripts for Global Cache Pooling with MooncakeStore. -## Preparation +## Documentation -### Install FastDeploy +- [English Documentation](../../docs/features/global_cache_pooling.md) +- [中文文档](../../docs/zh/features/global_cache_pooling.md) -Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) for Fastdeploy installation. - -### Install MooncakeStore +## Quick Start ```bash -pip install mooncake-transfer-engine -``` - -## Run Examples - -The example script is provided in `run.sh`. You can run it directly: -``` +# Multi-instance scenario bash run.sh -``` -In the example script, we will start a Mooncake master server and two FastDeploy server. - -Launch Mooncake master server: -```bash -mooncake_master \ - --port=15001 \ - --enable_http_metadata_server=true \ - --http_metadata_server_host=0.0.0.0 \ - --http_metadata_server_port=15002 \ - --metrics_port=15003 \ +# PD disaggregation scenario +bash run_03b_pd_storage.sh ``` -More parameter can be found in the [official guide](https://github.com/kvcache-ai/Mooncake/blob/main/docs/source/python-api-reference/transfer-engine.md). - -Launch the Fastdeploy with Mooncake enabled. +## Scripts -```bash -export MOONCAKE_CONFIG_PATH="./mooncake_config.json" - -python -m fastdeploy.entrypoints.openai.api_server \ - --model ${MODEL_NAME} \ - --port ${PORT} \ - --metrics-port $((PORT + 1)) \ - --engine-worker-queue-port $((PORT + 2)) \ - --cache-queue-port $((PORT + 3)) \ - --max-model-len 32768 \ - --max-num-seqs 32 \ - --kvcache-storage-backend mooncake -``` +| Script | Scenario | Description | +|--------|----------|-------------| +| `run.sh` | Multi-Instance | Two standalone instances sharing cache | +| `run_03b_pd_storage.sh` | PD Disaggregation | P+D instances with global cache pooling | -## Troubleshooting +## Files -For more details, please refer to: -https://github.com/kvcache-ai/Mooncake/blob/main/docs/source/troubleshooting/troubleshooting.md +- `mooncake_config.json` - Mooncake configuration file +- `utils.sh` - Utility functions for scripts +- `stop.sh` - Stop all running services diff --git a/examples/cache_storage/mooncake_config.json b/examples/cache_storage/mooncake_config.json index 560f3bda9a0..225d05f3cdc 100644 --- a/examples/cache_storage/mooncake_config.json +++ b/examples/cache_storage/mooncake_config.json @@ -1,7 +1,6 @@ { - "local_hostname":"localhost", "metadata_server":"http://0.0.0.0:15002/metadata", - "global_segment_size":8589934592, + "global_segment_size":1000000000, "local_buffer_size":134217728, "protocol":"rdma", "rdma_devices": "", diff --git a/examples/cache_storage/run.sh b/examples/cache_storage/run.sh index 67424b367bc..5447367225b 100644 --- a/examples/cache_storage/run.sh +++ b/examples/cache_storage/run.sh @@ -2,6 +2,7 @@ set -e export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" +# export MODEL_NAME="/work/models/PaddlePaddle/ERNIE-4.5-0.3B-Paddle" export MOONCAKE_CONFIG_PATH=./mooncake_config.json export FD_DEBUG=1 @@ -15,8 +16,8 @@ S0_PORT=52700 S1_PORT=52800 ports=( - $S0_PORT $((S0_PORT + 1)) $((S0_PORT + 2)) $((S0_PORT + 3)) - $S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3)) + $S0_PORT + $S1_PORT ) check_ports "${ports[@]}" || { echo "❌ Some ports are in use. Please release them." @@ -41,9 +42,6 @@ echo "server 0 port: ${S0_PORT}" nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port ${S0_PORT} \ - --metrics-port $((S0_PORT + 1)) \ - --engine-worker-queue-port $((S0_PORT + 2)) \ - --cache-queue-port $((S0_PORT + 3)) \ --max-model-len 32768 \ --max-num-seqs 32 \ --kvcache-storage-backend mooncake \ @@ -58,9 +56,6 @@ echo "server 1 port: ${S1_PORT}" nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port ${S1_PORT} \ - --metrics-port $((S1_PORT + 1)) \ - --engine-worker-queue-port $((S1_PORT + 2)) \ - --cache-queue-port $((S1_PORT + 3)) \ --max-model-len 32768 \ --max-num-seqs 32 \ --kvcache-storage-backend mooncake \ diff --git a/examples/cache_storage/run_03b_pd_storage.sh b/examples/cache_storage/run_03b_pd_storage.sh new file mode 100644 index 00000000000..5577a0ebf27 --- /dev/null +++ b/examples/cache_storage/run_03b_pd_storage.sh @@ -0,0 +1,219 @@ +#!/bin/bash +set -e + +# ============================================================================= +# PD Disaggregation + Global Cache Pooling Test Script +# Reference: start_v1_tp1.sh (PD Disaggregation) + run.sh (Mooncake Cache Pooling) +# Note: Modify CUDA_VISIBLE_DEVICES environment variables for PD instances +# ============================================================================= + +# ======================== Environment Variables Configuration ======================== +export MODEL_NAME="/work/models/PaddlePaddle/ERNIE-4.5-0.3B-Paddle" +export FD_DEBUG=1 + +# Mooncake Configuration (using environment variables) +master_ip="127.0.0.1" +master_port=15001 +metadata_port=15002 + +export MOONCAKE_MASTER_SERVER_ADDR="${master_ip}:${master_port}" +export MOONCAKE_METADATA_SERVER="http://${master_ip}:${metadata_port}/metadata" +export MOONCAKE_GLOBAL_SEGMENT_SIZE="50000000000" +# export MOONCAKE_PROTOCOL="tcp" +export MOONCAKE_PROTOCOL="rdma" +# export MOONCAKE_RDMA_DEVICES="mlx5_0" + +# ======================== Port Configuration ======================== +P_PORT=52400 +D_PORT=52500 +ROUTER_PORT=52700 +LOG_DATE=$(date +%Y%m%d_%H%M%S) + +# ======================== Cleanup and Preparation ======================== +unset http_proxy && unset https_proxy +rm -rf log_* + +source ./utils.sh + +# Check ports +ports=($P_PORT $D_PORT $ROUTER_PORT $master_port $metadata_port) +check_ports "${ports[@]}" || { + echo "❌ Some ports are in use. Please release them." + exit 1 +} + +# ======================== Start Mooncake Master ======================== +echo "=== Starting Mooncake Master ===" +export FD_LOG_DIR="log_master" +mkdir -p ${FD_LOG_DIR} + +nohup mooncake_master \ + --port=${master_port} \ + --enable_http_metadata_server=true \ + --http_metadata_server_host=0.0.0.0 \ + --http_metadata_server_port=${metadata_port} \ + 2>&1 > ${FD_LOG_DIR}/nohup & + +sleep 2 # Wait for Mooncake Master to start + +# ======================== Start Router ======================== +echo "=== Starting Router ===" +export FD_LOG_DIR="log_router" +mkdir -p ${FD_LOG_DIR} +echo "Router log: ${FD_LOG_DIR}, port: ${ROUTER_PORT}" + +nohup python -m fastdeploy.router.launch \ + --port ${ROUTER_PORT} \ + --splitwise \ + 2>&1 > ${FD_LOG_DIR}/nohup & + +sleep 2 # Wait for Router to start + +# ======================== Start P Instance (Prefill) ======================== +echo "=== Starting Prefill Instance ===" +export CUDA_VISIBLE_DEVICES=3 +export FD_LOG_DIR="log_prefill" +mkdir -p ${FD_LOG_DIR} +echo "Prefill log: ${FD_LOG_DIR}, port: ${P_PORT}, GPU: ${CUDA_VISIBLE_DEVICES}" + +nohup python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port ${P_PORT} \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --splitwise-role prefill \ + --cache-transfer-protocol rdma \ + --router "0.0.0.0:${ROUTER_PORT}" \ + --kvcache-storage-backend mooncake \ + 2>&1 > ${FD_LOG_DIR}/nohup & + + +# ======================== Start D Instance (Decode) ======================== +echo "=== Starting Decode Instance ===" +export CUDA_VISIBLE_DEVICES=7 +export FD_LOG_DIR="log_decode" +mkdir -p ${FD_LOG_DIR} +echo "Decode log: ${FD_LOG_DIR}, port: ${D_PORT}, GPU: ${CUDA_VISIBLE_DEVICES}" + +nohup python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port ${D_PORT} \ + --max-model-len 32768 \ + --max-num-seqs 32 \ + --splitwise-role decode \ + --cache-transfer-protocol rdma \ + --router "0.0.0.0:${ROUTER_PORT}" \ + --enable-output-caching \ + --kvcache-storage-backend mooncake \ + 2>&1 > ${FD_LOG_DIR}/nohup & + + +# ======================== Wait for Services to be Ready ======================== +echo "=== Waiting for services to be ready ===" +wait_for_health ${P_PORT} +wait_for_health ${D_PORT} + +# Wait for services to register to Router +sleep 10 +echo "✅ All services are ready!" + +# ======================== Send Test Requests ======================== +# Test scenario: Multi-turn conversation, verify that output cache written by D instance can be read by P instance +# +# Flow: +# 1. Request 1: Send first round question, D instance generates answer and writes to global cache (prompt + output) +# 2. Request 2: Send second round conversation (first round Q&A + follow-up), P instance should hit global cache for first round's complete KV cache +# +echo "" +echo "=== Multi-turn Conversation Test for Global Cache Pooling ===" + +# First round question +msg1="深圳是中国经济实力最强的城市之一。近年来,深圳 GDP 持续稳步增长,2023 年突破 3.4 万亿元人民币,2024 年接近 3.7 万亿元,长期位居全国城市前列。深圳经济以第二产业和第三产业为主,高端制造业、电子信息产业和现代服务业发达,形成了以科技创新为核心的产业结构。依托华为、腾讯、大疆等龙头企业,深圳在数字经济、人工智能、新能源等领域具有显著优势。同时,深圳进出口总额常年位居全国城市第一,是中国对外开放和高质量发展的重要引擎。深圳2024年 GDP 是多少?" + +echo "" +echo ">>> Request 1: First round question" +echo " Purpose: D instance generates output and writes to global cache (prompt + output)" +echo "" + +# Send first round request and get response +response1=$(curl -s -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"messages\": [ + {\"role\": \"user\", \"content\": \"${msg1}\"} + ], + \"max_tokens\": 200, + \"min_tokens\": 130, + \"stream\": false, + \"top_p\": 0 + }") + +echo "Response 1:" +echo "${response1}" | python3 -m json.tool 2>/dev/null || echo "${response1}" + +# Extract first round response content +assistant_reply=$(echo "${response1}" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data['choices'][0]['message']['content'])" 2>/dev/null || echo "") + +if [ -z "${assistant_reply}" ]; then + echo "❌ Failed to get response from Request 1" + exit 1 +fi + +# JSON escape assistant_reply to prevent newlines, quotes, and other special characters from breaking JSON format +assistant_reply_escaped=$(python3 -c "import json,sys; print(json.dumps(sys.stdin.read().strip()))" <<< "${assistant_reply}") + +echo "" +echo "Assistant reply extracted: ${assistant_reply}..." + +# Wait for D instance to write output cache to global storage +echo "" +echo ">>> Waiting for D instance to write output cache to global storage..." +sleep 5 + +# Second round follow-up question +msg2="那深圳2023年的GDP是多少?和2024年相比增长了多少?" + +echo "" +echo ">>> Request 2: Second round (multi-turn conversation)" +echo " Purpose: P instance should hit global cache including D's output from Request 1" +echo " Check log_prefill/nohup for 'storage_match' to verify cache hit" +echo "" + +# Send second round request (including complete multi-turn conversation history) +response2=$(curl -s -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"messages\": [ + {\"role\": \"user\", \"content\": \"${msg1}\"}, + {\"role\": \"assistant\", \"content\": ${assistant_reply_escaped}}, + {\"role\": \"user\", \"content\": \"${msg2}\"} + ], + \"max_tokens\": 100, + \"stream\": false, + \"top_p\": 0 + }") + +echo "Response 2:" +echo "${response2}" | python3 -m json.tool 2>/dev/null || echo "${response2}" + +# Extract second round response content and display +assistant_reply2=$(echo "${response2}" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data['choices'][0]['message']['content'])" 2>/dev/null || echo "") +echo "" +echo "Assistant reply 2: ${assistant_reply2}" + +echo "" +echo "" +echo "=== Test completed ===" +echo "" +echo "Verification Steps:" +echo "1. Check log_prefill/nohup for Request 2's cache hit info:" +echo " grep -E 'storage_match|cache_hit|matched.*block' log_prefill/nohup" +echo "" +echo "2. If 'storage_match_token_num > 0' in Request 2, it means P instance" +echo " successfully read the output cache written by D instance from Request 1" +echo "" +echo "Log files:" +echo " - Prefill: log_prefill/nohup" +echo " - Decode: log_decode/nohup" +echo " - Router: log_router/nohup" +echo " - Master: log_master/nohup" diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index b022f61c26e..9be2ca4eb61 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -1139,6 +1139,73 @@ def write_cache_to_storage(self, request: Request): cost_time = time.time() - tic logger.info(f"finish write cache back to storage, req_id: {req_id}, cost_time: {cost_time:.6f}s") + def write_cache_to_storage_decode(self, request: Request): + """ + D instance (Decode Node) simplified write method, does not rely on Radix Tree. + + D instance does not maintain Radix Tree, so it cannot get keys through req_leaf_map. + Need to calculate cache keys directly based on token_ids. + + Key generation algorithm is exactly the same as P instance (chained hash): + - Block 0: key_0 = get_hash_str(token_ids[0:block_size], []) + - Block 1: key_1 = get_hash_str(token_ids[block_size:2*block_size], [key_0]) + - Block n: key_n = get_hash_str(token_ids[n*block_size:(n+1)*block_size], [key_{n-1}]) + + Incremental write logic is handled by CacheTransferManager. + """ + if self.kvcache_storage_backend is None: + return + + # 1. Get complete token_ids + token_ids = request.prompt_token_ids + if isinstance(token_ids, np.ndarray): + token_ids = token_ids.tolist() + else: + token_ids = list(token_ids) + + if self.config.cache_config.enable_output_caching: + token_ids = token_ids + request.output_token_ids + + # 2. Calculate cache keys using chained hash (consistent with P instance) + keys = [] + prefix_block_key = [] # Initial is empty list + block_size = self.config.cache_config.block_size + + for i in range(0, len(token_ids), block_size): + block_token_ids = token_ids[i : i + block_size] + if len(block_token_ids) < block_size: + break # Do not cache incomplete block + + # Calculate hash key for current block + key = get_hash_str(block_token_ids, prefix_block_key) + keys.append(key) + + # Update prefix_block_key to current key (for next block) + prefix_block_key = [key] + + if not keys: + return + + # 3. Get corresponding gpu_block_ids + gpu_block_ids = request.block_tables[: len(keys)] + + # 4. Construct WriteStorageTask and send + # Incremental logic is handled by CacheTransferManager.write_back_storage_task() + req_id = request.request_id + logger.info(f"[D instance] start write cache to storage, req_id: {req_id}, block num: {len(keys)}") + + write_storage_task = WriteStorageTask( + task_id=req_id, + keys=keys, + token_ids=token_ids, + gpu_block_ids=gpu_block_ids, + ) + + tic = time.time() + self.issue_write_back_storage_task(write_storage_task, is_sync=True) + cost_time = time.time() - tic + logger.info(f"[D instance] finish write cache to storage, req_id: {req_id}, cost_time: {cost_time:.6f}s") + def issue_write_back_storage_task(self, task: WriteStorageTask, is_sync=True): if self.kvcache_storage_backend is None: return diff --git a/fastdeploy/cache_manager/transfer_factory/mooncake_store/mooncake_store.py b/fastdeploy/cache_manager/transfer_factory/mooncake_store/mooncake_store.py index 34f83328417..13273b11e5a 100644 --- a/fastdeploy/cache_manager/transfer_factory/mooncake_store/mooncake_store.py +++ b/fastdeploy/cache_manager/transfer_factory/mooncake_store/mooncake_store.py @@ -79,6 +79,11 @@ def create() -> "MooncakeStoreConfig": logger.info(f"No RDMA devices specified, defaulting to all available devices: {rdma_devices}") if metadata_server is None or master_server_addr is None: raise ValueError("Both MOONCAKE_METADATA_SERVER and MOONCAKE_MASTER_SERVER_ADDR must be provided.") + if local_hostname == "localhost": + raise ValueError( + "The local hostname for mooncake must not be `localhost` to avoid " + "potential error, please do not manually set this param." + ) return MooncakeStoreConfig( local_hostname=local_hostname, diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 889b11cbdc2..f07c2d4157b 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -558,8 +558,6 @@ def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model - if self.splitwise_role == "decode": - self.enable_prefix_caching = False if ( not current_platform.is_cuda() and not current_platform.is_xpu() diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 9205f1c05c5..9960d4afaf6 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -694,7 +694,11 @@ def exist_prefill(self, scheduled_reqs): return False def cache_output_tokens(self, request): - if self.config.cache_config.enable_prefix_caching and self.config.cache_config.enable_output_caching: + if ( + self.config.cache_config.enable_prefix_caching + and self.config.cache_config.enable_output_caching + and self.config.scheduler_config.splitwise_role != "decode" + ): with self.lock: if request.num_computed_tokens >= request.need_prefill_tokens: # request is decoding self.cache_manager.cache_output_blocks(request, self.config.cache_config.block_size) @@ -863,7 +867,10 @@ def _allocate_decode_and_extend(): scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) token_budget -= num_new_tokens request.num_computed_tokens += num_new_tokens - if self.config.cache_config.enable_prefix_caching: + if ( + self.config.cache_config.enable_prefix_caching + and self.config.scheduler_config.splitwise_role != "decode" + ): self.cache_manager.update_cache_blocks( request, self.config.cache_config.block_size, request.num_computed_tokens ) @@ -939,7 +946,10 @@ def _allocate_decode_and_extend(): scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) token_budget -= num_new_tokens request.num_computed_tokens += num_new_tokens - if self.config.cache_config.enable_prefix_caching: + if ( + self.config.cache_config.enable_prefix_caching + and self.config.scheduler_config.splitwise_role != "decode" + ): self.cache_manager.update_cache_blocks( request, self.config.cache_config.block_size, request.num_computed_tokens ) @@ -959,7 +969,10 @@ def _allocate_decode_and_extend(): request.need_prefill_tokens = ( request.num_total_tokens ) # Before preempted task rescheduled, preempted task has been sent to engine, no more tokens are output, here num_total_tokens should be static and correct - if self.config.cache_config.enable_prefix_caching: + if ( + self.config.cache_config.enable_prefix_caching + and self.config.scheduler_config.splitwise_role != "decode" + ): if ( self.cache_manager.num_cpu_blocks > 0 or self.config.cache_config.kvcache_storage_backend @@ -998,7 +1011,10 @@ def _allocate_decode_and_extend(): scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) token_budget -= num_new_tokens request.num_computed_tokens += num_new_tokens - if self.config.cache_config.enable_prefix_caching: + if ( + self.config.cache_config.enable_prefix_caching + and self.config.scheduler_config.splitwise_role != "decode" + ): self.cache_manager.update_cache_blocks( request, self.config.cache_config.block_size, request.num_computed_tokens ) @@ -1257,9 +1273,10 @@ def preallocate_resource_in_p(self, request: Request): ) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num # consider for mtp, plus enc_dec_block_num if self.config.cache_config.enable_prefix_caching: # Enable prefix caching - if self.cache_manager.num_cpu_blocks > 0: + if self.cache_manager.num_cpu_blocks > 0 or self.config.cache_config.kvcache_storage_backend: if not self.cache_manager.can_allocate_gpu_blocks( - need_prealloc_prefill_blocks + (request.need_prefill_tokens + self.config.cache_config.block_size - 1) + // self.config.cache_config.block_size ): # to prevent block allocation for matching in hierarchical cache and cause dead lock return False success = self.get_prefix_cached_blocks(request) @@ -1372,7 +1389,7 @@ def add_prefilled_request(self, request_output: RequestOutput): self.running.append(request) def _free_blocks(self, request: Request): - if self.config.cache_config.enable_prefix_caching: + if self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode": self.cache_manager.release_block_ids(request) self.cache_manager.recycle_gpu_blocks( request.block_tables[request.num_cached_blocks :], request.request_id @@ -1432,8 +1449,14 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]): del self.req_dict[req_id] # Do not block the main thread here + # Write cache to storage if kvcache_storage_backend is enabled for req in need_postprocess_reqs: - self.cache_manager.write_cache_to_storage(req) + if self.config.scheduler_config.splitwise_role == "decode": + # D instance uses simplified write method (does not rely on Radix Tree) + self.cache_manager.write_cache_to_storage_decode(req) + else: + # P instance / Mixed instance uses standard write method (relies on Radix Tree) + self.cache_manager.write_cache_to_storage(req) with self.lock: for req in need_postprocess_reqs: