diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index b4fde50ebe..4ed085a6bb 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -125,6 +125,11 @@ def add_parser_api_server():
                             'engine’s tasks once the maximum number of concurrent requests is '
                             'reached, regardless of any additional requests sent by clients '
                             'concurrently during that time. Default to None.')
+    # FIXME: change default value to False
+    parser.add_argument('--enable-metrics',
+                        action='store_true',
+                        default=True,
+                        help='Whether to log stats to CLI / Prometheus')
     # common args
     ArgumentHelper.backend(parser)
     ArgumentHelper.log_level(parser)
@@ -272,7 +277,8 @@ def gradio(args):
                                                  device_type=args.device,
                                                  quant_policy=args.quant_policy,
                                                  eager_mode=args.eager_mode,
-                                                 max_prefill_token_num=args.max_prefill_token_num)
+                                                 max_prefill_token_num=args.max_prefill_token_num,
+                                                 enable_metrics=args.enable_metrics)
         else:
             backend_config = TurbomindEngineConfig(dtype=args.dtype,
                                                    tp=args.tp,
@@ -369,6 +375,7 @@ def api_server(args):
                                   max_log_len=args.max_log_len,
                                   disable_fastapi_docs=args.disable_fastapi_docs,
                                   max_concurrent_requests=args.max_concurrent_requests,
+                                  enable_metrics=args.enable_metrics,
                                   reasoning_parser=args.reasoning_parser,
                                   tool_call_parser=args.tool_call_parser)

diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index b3fed70361..d8bd020014 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
+import time
 from dataclasses import dataclass, field
 from typing import Callable, Dict, List, Literal, Optional

@@ -9,6 +10,7 @@
 from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend
 from lmdeploy.pytorch.disagg.request import MigrationRequest

+from .metrics.stats import IterationStats, RequestStateStats, SchedulerStats
 from .tokenizer import Tokenizer
 from .utils import get_logger

@@ -310,6 +312,7 @@ class PytorchEngineConfig:
             'Decode']. Default to `EngineRole.Hybrid`.
         migration_backend: migration backend. options: ['DLSlime'].
             Default to `MigrationBackend.DLSlime`.
+        enable_metrics (bool): Whether to log stats to CLI / Prometheus.
     """
     dtype: str = 'auto'
     tp: int = 1
@@ -338,6 +341,7 @@

     role: EngineRole = EngineRole.Hybrid
     migration_backend: MigrationBackend = MigrationBackend.DLSlime
+    enable_metrics: bool = False

     def __post_init__(self):
         """Check input validation."""
@@ -407,6 +411,34 @@ class Response:
     last_hidden_state: torch.Tensor = None
     index: int = 0
+    scheduler_stats: SchedulerStats = None
+    iteration_stats: IterationStats = None
+
+
+# copy from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py
+class EngineCoreEventType(enum.IntEnum):
+    """The type of engine core request event."""
+    QUEUED = 1
+    SCHEDULED = 2
+    PREEMPTED = 3  # FIXME, currently ignored for simplicity
+
+
+# copy from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py
+@dataclass
+class EngineCoreEvent():
+    """A timestamped engine core event associated with a request.
+
+    The timestamp is monotonic and is used by the engine frontend to calculate intervals between engine
+    core events. These timestamps should not be compared with timestamps from other processes.
+ """ + type: EngineCoreEventType + timestamp: float + + @classmethod + def new_event(cls, event_type: EngineCoreEventType, timestamp: Optional[float] = None) -> 'EngineCoreEvent': + timestamp = time.monotonic() if timestamp is None else timestamp + return cls(event_type, timestamp) + @dataclass class EngineOutput: @@ -431,6 +463,27 @@ class EngineOutput: cache_block_ids: Optional[List[int]] = None + # engine-side time stamp, for logging + timestamp: float = 0.0 + scheduler_stats: SchedulerStats = None + iteration_stats: IterationStats = None + events: List[EngineCoreEvent] = None + + def __post_init__(self): + if self.timestamp == 0.0: + self.timestamp = time.monotonic() + + +@dataclass +class RequestState: + """per request state.""" + + def __init__(self, arrival_time: float, prompt_len: int, is_prefilling: bool, enable_metrics: bool): + + self.prompt_len: int = prompt_len + self.is_prefilling: bool = is_prefilling + self.stats = RequestStateStats(arrival_time=arrival_time) if enable_metrics else None + @dataclass class VisionConfig: diff --git a/lmdeploy/metrics/__init__.py b/lmdeploy/metrics/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/lmdeploy/metrics/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py new file mode 100644 index 0000000000..e986b28df3 --- /dev/null +++ b/lmdeploy/metrics/loggers.py @@ -0,0 +1,341 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from: https://github.com/vllm-project/vllm/blob/main/vllm/v1/metrics/loggers.py + +import time +from abc import ABC, abstractmethod +from typing import List, Optional + +import numpy as np +import prometheus_client + +from lmdeploy.metrics.stats import IterationStats, SchedulerStats +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + +prometheus_client.disable_created_metrics() + + +class StatLoggerBase(ABC): + + @abstractmethod + def record(self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]): + ... + + def log(self): # noqa + pass + + +class LoggingStatLogger(StatLoggerBase): + + def __init__(self, engine_index: int = 0): + self.engine_index = engine_index + self._reset(time.monotonic()) + self.last_scheduler_stats = SchedulerStats() + + def _reset(self, now): + self.last_log_time = now + + # Tracked stats over current local logging interval. + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] + + def _track_iteration_stats(self, iteration_stats: IterationStats): + # Save tracked stats for token counters. + self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens) + self.num_generation_tokens.append(iteration_stats.num_generation_tokens) + + def _get_throughput(self, tracked_stats: list[int], now: float) -> float: + # Compute summary metrics for tracked stats + return float(np.sum(tracked_stats) / (now - self.last_log_time)) + + def record(self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]): + """Log Stats to standard output.""" + + if iteration_stats: + self._track_iteration_stats(iteration_stats) + + self.last_scheduler_stats = scheduler_stats + + def log(self): + now = time.monotonic() + prompt_throughput = self._get_throughput(self.num_prompt_tokens, now) + generation_throughput = self._get_throughput(self.num_generation_tokens, now) + + self._reset(now) + + scheduler_stats = self.last_scheduler_stats + + # Format and print output. 
+        logger.info(
+            'Avg prompt throughput: %.1f tokens/s, '
+            'Avg generation throughput: %.1f tokens/s, '
+            'Running: %d reqs, Waiting: %d reqs, '
+            'GPU KV cache usage: %.1f%%, ',
+            prompt_throughput,
+            generation_throughput,
+            scheduler_stats.num_running_reqs,
+            scheduler_stats.num_waiting_reqs,
+            scheduler_stats.gpu_cache_usage * 100,
+        )
+
+
+class PrometheusStatLogger(StatLoggerBase):
+
+    def __init__(self, labelnames: Optional[List[str]] = []):
+
+        # unregister any existing lmdeploy collectors
+        for collector in list(prometheus_client.REGISTRY._collector_to_names):
+            if hasattr(collector, '_name') and 'lmdeploy' in collector._name:
+                prometheus_client.REGISTRY.unregister(collector)
+
+        max_model_len = 4096  # FIXME, hard code now, get from model config
+
+        # config Information
+        self.info_backend_config = prometheus_client.Info(name='lmdeploy:backend_config',
+                                                          documentation='information of backend_config')
+
+        #
+        # Scheduler state
+        #
+        self.gauge_scheduler_running = prometheus_client.Gauge(
+            name='lmdeploy:num_requests_running',
+            documentation='Number of requests in model execution batches.',
+            labelnames=labelnames)
+
+        self.gauge_scheduler_waiting = prometheus_client.Gauge(
+            name='lmdeploy:num_requests_waiting',
+            documentation='Number of requests waiting to be processed.',
+            labelnames=labelnames)
+
+        #
+        # GPU cache
+        #
+        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
+            name='lmdeploy:gpu_cache_usage_perc',
+            documentation='GPU KV-cache usage. 1 means 100 percent usage.',
+            labelnames=labelnames)
+
+        #
+        # Counters
+        #
+        self.counter_prompt_tokens = prometheus_client.Counter(name='lmdeploy:prompt_tokens_total',
+                                                               documentation='Number of prefill tokens processed.',
+                                                               labelnames=labelnames)
+
+        self.counter_generation_tokens = prometheus_client.Counter(
+            name='lmdeploy:generation_tokens_total',
+            documentation='Number of generation tokens processed.',
+            labelnames=labelnames)
+
+        # from lmdeploy.messages import ResponseType
+        # self.counter_request_success: dict[ResponseType,
+        #                                    prometheus_client.Counter] = {}
+        # counter_request_success_base = prometheus_client.Counter(
+        #     name="lmdeploy:request_success_total",
+        #     documentation="Count of successfully processed requests.",
+        #     labelnames=labelnames + ["finished_reason"])
+        # for reason in FinishReason:
+        #     self.counter_request_success[
+        #         reason] = counter_request_success_base.labels(*(labelvalues +
+        #                                                         [str(reason)]))
+
+        #
+        # Histograms of counts
+        #
+        self.histogram_num_prompt_tokens_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_prompt_tokens',
+                documentation='Number of prefill tokens processed.',
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames)
+
+        self.histogram_num_generation_tokens_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_generation_tokens',
+                documentation='Number of generation tokens processed.',
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames)
+
+        # FIXME, build_cudagraph_buckets
+        # self.histogram_iteration_tokens = \
+        #     prometheus_client.Histogram(
+        #         name="lmdeploy:iteration_tokens_total",
+        #         documentation="Histogram of number of tokens per engine_step.",
+        #         buckets=build_cudagraph_buckets(vllm_config),
+        #         labelnames=labelnames)
+
+        self.histogram_max_num_generation_tokens_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_max_num_generation_tokens',
+                documentation='Histogram of maximum number of requested generation tokens.',
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames)
+
+        self.histogram_n_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_params_n',
+                documentation='Histogram of the n request parameter.',
+                buckets=[1, 2, 5, 10, 20],
+                labelnames=labelnames)
+
+        self.histogram_max_tokens_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_params_max_tokens',
+                documentation='Histogram of the max_tokens request parameter.',
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames)
+
+        #
+        # Histogram of timing intervals
+        #
+        self.histogram_time_to_first_token = \
+            prometheus_client.Histogram(
+                name='lmdeploy:time_to_first_token_seconds',
+                documentation='Histogram of time to first token in seconds.',
+                buckets=[
+                    0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0,
+                    640.0, 2560.0
+                ],
+                labelnames=labelnames)
+
+        self.histogram_time_per_output_token = \
+            prometheus_client.Histogram(
+                name='lmdeploy:time_per_output_token_seconds',
+                documentation='Histogram of time per output token in seconds.',
+                buckets=[
+                    0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+                ],
+                labelnames=labelnames)
+
+        request_latency_buckets = [
+            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0,
+            960.0, 1920.0, 7680.0
+        ]
+        self.histogram_e2e_time_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:e2e_request_latency_seconds',
+                documentation='Histogram of e2e request latency in seconds.',
+                buckets=request_latency_buckets,
+                labelnames=labelnames)
+        self.histogram_queue_time_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_queue_time_seconds',
+                documentation='Histogram of time spent in WAITING phase for request.',
+                buckets=request_latency_buckets,
+                labelnames=labelnames)
+        self.histogram_inference_time_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_inference_time_seconds',
+                documentation='Histogram of time spent in RUNNING phase for request.',
+                buckets=request_latency_buckets,
+                labelnames=labelnames)
+        self.histogram_prefill_time_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_prefill_time_seconds',
+                documentation='Histogram of time spent in PREFILL phase for request.',
+                buckets=request_latency_buckets,
+                labelnames=labelnames)
+        self.histogram_decode_time_request = \
+            prometheus_client.Histogram(
+                name='lmdeploy:request_decode_time_seconds',
+                documentation='Histogram of time spent in DECODE phase for request.',
+                buckets=request_latency_buckets,
+                labelnames=labelnames)
+
+    # def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
+    #     metrics_info = config_obj.metrics_info()
+
+    #     name, documentation = None, None
+    #     if type == "cache_config":
+    #         name = "lmdeploy:cache_config_info"
+    #         documentation = "Information of the LLMEngine CacheConfig"
+    #     assert name is not None, f"Unknown metrics info type {type}"
+
+    #     # Info type metrics are syntactic sugar for a gauge permanently set to 1
+    #     # Since prometheus multiprocessing mode does not support Info, emulate
+    #     # info here with a gauge.
+    #     info_gauge = prometheus_client.Gauge(
+    #         name=name,
+    #         documentation=documentation,
+    #         labelnames=metrics_info.keys()).labels(**metrics_info)
+    #     info_gauge.set(1)
+
+    def record(self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]):
+        """Log to prometheus."""
+
+        self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
+        self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
+
+        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+
+        if iteration_stats is None:
+            return
+
+        self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
+        self.counter_generation_tokens.inc(iteration_stats.num_generation_tokens)
+        # self.histogram_iteration_tokens.observe(
+        #     iteration_stats.num_prompt_tokens + \
+        #     iteration_stats.num_generation_tokens)
+
+        for ttft in iteration_stats.time_to_first_tokens_iter:
+            self.histogram_time_to_first_token.observe(ttft)
+
+        for tpot in iteration_stats.time_per_output_tokens_iter:
+            self.histogram_time_per_output_token.observe(tpot)
+
+        for finished_request in iteration_stats.finished_requests:
+            # self.counter_request_success[finished_request.finish_reason].inc()
+            self.histogram_e2e_time_request.observe(finished_request.e2e_latency)
+            self.histogram_queue_time_request.observe(finished_request.queued_time)
+            self.histogram_prefill_time_request.observe(finished_request.prefill_time)
+            self.histogram_inference_time_request.observe(finished_request.inference_time)
+            self.histogram_decode_time_request.observe(finished_request.decode_time)
+            self.histogram_num_prompt_tokens_request.observe(finished_request.num_prompt_tokens)
+            self.histogram_num_generation_tokens_request.observe(finished_request.num_generation_tokens)
+            # self.histogram_max_tokens_request.observe(
+            #     finished_request.max_tokens_param)
+
+
+def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
+    """Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values until the value exceeds the specified maximum."""
+    exponent = 0
+    buckets: list[int] = []
+    while True:
+        for m in mantissa_lst:
+            value = m * 10**exponent
+            if value <= max_value:
+                buckets.append(value)
+            else:
+                return buckets
+        exponent += 1
+
+
+def build_1_2_5_buckets(max_value: int) -> list[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
+
+
+def setup_loggers(enable_metrics: bool, engine_num: int):
+    if not enable_metrics:
+        return []
+
+    stat_loggers: list[list[StatLoggerBase]] = []
+    # independent set for each DP rank
+    for i in range(engine_num):
+        stat_loggers.append([LoggingStatLogger(), PrometheusStatLogger()])
+
+    return stat_loggers
diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
new file mode 100644
index 0000000000..2f8761207f
--- /dev/null
+++ b/lmdeploy/metrics/stats.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/v1/metrics/stats.py
+
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from lmdeploy.messages import EngineCoreEvent, EngineOutput, ResponseType
+
+
+@dataclass
+class SchedulerStats:
+    """Stats associated with the scheduler."""
+
+    num_running_reqs: int = 0
+    num_waiting_reqs: int = 0
+
+    gpu_cache_usage: float = 0.0
+
+
+@dataclass
+class RequestStateStats:
+    """Stats that need to be tracked across delta updates."""
+
+    num_generation_tokens: int = 0
+
+    # This is an engine frontend timestamp (wall-clock)
+    arrival_time: float = 0.0
+
+    # These are engine core timestamps (monotonic)
+    queued_ts: float = 0.0
+    scheduled_ts: float = 0.0
+    first_token_ts: float = 0.0
+    last_token_ts: float = 0.0
+
+
+@dataclass
+class FinishedRequestStats:
+    """Stats associated with a finished request."""
+
+    finish_reason: 'ResponseType'
+    e2e_latency: float = 0.0
+    num_prompt_tokens: int = 0
+    num_generation_tokens: int = 0
+    # max_tokens_param: Optional[int] = None
+    queued_time: float = 0.0
+    prefill_time: float = 0.0
+    inference_time: float = 0.0
+    decode_time: float = 0.0
+
+
+class IterationStats:
+    """Stats associated with a single set of EngineCoreOutputs."""
+
+    def __init__(self):
+        self.iteration_timestamp = time.time()
+        self.num_generation_tokens = 0
+        self.num_prompt_tokens = 0
+        self.num_preempted_reqs = 0
+        self.finished_requests: list[FinishedRequestStats] = []
+        # self.max_num_generation_tokens_iter: list[int] = []
+        # self.n_params_iter: list[int] = []
+        self.time_to_first_tokens_iter: list[float] = []
+        self.time_per_output_tokens_iter: list[float] = []
+
+    def _time_since(self, start: float) -> float:
+        """Calculate an interval relative to this iteration's timestamp."""
+        return self.iteration_timestamp - start
+
+    def update_from_output(self, output: 'EngineOutput', engine_core_timestamp: float, is_prefilling: bool,
+                           prompt_len: int, req_stats: RequestStateStats):
+        num_new_generation_tokens = len(output.token_ids)
+
+        self.num_generation_tokens += num_new_generation_tokens
+        if is_prefilling:
+            assert num_new_generation_tokens > 0
+            self.num_prompt_tokens += prompt_len
+
+            first_token_latency = self._time_since(req_stats.arrival_time)
+            self.time_to_first_tokens_iter.append(first_token_latency)
+
+        req_stats.num_generation_tokens += num_new_generation_tokens
+
+        # Process request-level engine core events
+        if output.events is not None:
+            self.update_from_events(output.events, req_stats)
+
+        # Process the batch-level "new tokens" engine core event
+        if is_prefilling:
+            req_stats.first_token_ts = engine_core_timestamp
+        else:
+            tpot = engine_core_timestamp - req_stats.last_token_ts
+            self.time_per_output_tokens_iter.append(tpot)
+
+        req_stats.last_token_ts = engine_core_timestamp
+
+    def update_from_events(self, events: list['EngineCoreEvent'], req_stats: RequestStateStats):
+        # Avoid circular dependency
+        from lmdeploy.messages import EngineCoreEventType
+
+        for event in events:
+            if event.type == EngineCoreEventType.QUEUED:
+                req_stats.queued_ts = event.timestamp
+            elif event.type == EngineCoreEventType.SCHEDULED:
+                if req_stats.scheduled_ts == 0.0:  # ignore preemptions
+                    req_stats.scheduled_ts = event.timestamp
+            # FIXME: deal with preempted case
+            # elif event.type == EngineCoreEventType.PREEMPTED:
+            #     self.num_preempted_reqs += 1
+
+    def update_from_finished_request(
+            self,
+            finish_reason: 'ResponseType',
+            num_prompt_tokens: int,
+            # max_tokens_param: Optional[int],
+            req_stats: RequestStateStats):
+
+        e2e_latency = self._time_since(req_stats.arrival_time)
+
+        # Queued interval is from first QUEUED event to first SCHEDULED
+        queued_time = req_stats.scheduled_ts - req_stats.queued_ts
+
+        # Prefill interval is from first SCHEDULED to first NEW_TOKEN
+        # Any preemptions during prefill are included in the interval
+        prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts
+
+        # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
+        # Any preemptions during decode are included
+        decode_time = req_stats.last_token_ts - req_stats.first_token_ts
+
+        # Inference interval is from first SCHEDULED to last NEW_TOKEN
+        # Any preemptions during prefill or decode are included
+        inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
+
+        finished_req = \
+            FinishedRequestStats(finish_reason=finish_reason,
+                                 e2e_latency=e2e_latency,
+                                 num_prompt_tokens=num_prompt_tokens,
+                                 num_generation_tokens=req_stats.num_generation_tokens,
+                                 # max_tokens_param=max_tokens_param,
+                                 queued_time=queued_time,
+                                 prefill_time=prefill_time,
+                                 inference_time=inference_time,
+                                 decode_time=decode_time)
+        self.finished_requests.append(finished_req)
diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py
index dea2a18770..d5724bb692 100644
--- a/lmdeploy/pytorch/config.py
+++ b/lmdeploy/pytorch/config.py
@@ -66,6 +66,8 @@ class SchedulerConfig:
     prefill_interval: int = 16
     max_active_adapters: int = 64

+    enable_metrics: bool = False
+

 @dataclass
 class CacheConfig:
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
index caa314ec60..28502b8740 100644
--- a/lmdeploy/pytorch/engine/engine.py
+++ b/lmdeploy/pytorch/engine/engine.py
@@ -3,13 +3,15 @@
 import copy
 import logging
 import os
-from dataclasses import dataclass
+import time
+from dataclasses import dataclass, field
 from typing import Any, Dict, List, Tuple

 import numpy as np
 import torch

-from lmdeploy.messages import PytorchEngineConfig, ResponseType
+from lmdeploy.messages import EngineCoreEvent, PytorchEngineConfig, ResponseType
+from lmdeploy.metrics.stats import IterationStats, SchedulerStats
 from lmdeploy.pytorch.disagg.config import EngineRole
 from lmdeploy.pytorch.disagg.messages import MigrationExecutionBatch
 from lmdeploy.utils import get_logger, get_max_batch_size, get_model, logging_timer
@@ -46,6 +48,9 @@ class InferOutput:
     # when Prefill Engine is Done.
     cache_block_ids: List[int] = None

+    # events for logging
+    events: List[EngineCoreEvent] = field(default_factory=list)
+

 def _tensorlize_block_offsets(block_offsets, dtype=torch.int32):
     """tensorlize block_offsets."""
@@ -59,7 +64,8 @@ def _build_scheduler_config(engine_config: PytorchEngineConfig):
     """build scheduler config."""
     scheduler_config = SchedulerConfig(max_batches=engine_config.max_batch_size,
                                        max_session_len=engine_config.session_len,
-                                       prefill_interval=engine_config.prefill_interval)
+                                       prefill_interval=engine_config.prefill_interval,
+                                       enable_metrics=engine_config.enable_metrics)
     return scheduler_config

@@ -427,13 +433,24 @@ def _start_loop(self):
         """start loop."""
         return self.req_manager.start_loop(self.async_loop)

-    def _response(self, resp: Response, resp_type: ResponseType, data: Any = None, err_msg: str = ''):
+    def _response(self,
+                  resp: Response,
+                  resp_type: ResponseType,
+                  scheduler_stats: SchedulerStats = None,
+                  iteration_stats: IterationStats = None,
+                  events: List[EngineCoreEvent] = None,
+                  data: Any = None,
+                  err_msg: str = ''):
         """response."""
         if resp.type == ResponseType.FINISH:
             return
         resp.type = resp_type
         resp.data = data
         resp.err_msg = err_msg
+
+        resp.scheduler_stats = scheduler_stats
+        resp.iteration_stats = iteration_stats
+        resp.events = events
         self.req_manager.response(resp)

     def _get_max_session_len(self):
@@ -498,6 +515,10 @@ def _on_end_session(self, reqs: List[Request], **kwargs):
     def _on_add_message(self, reqs: List[Request], **kwargs):
         """on add message callback."""
         for req in reqs:
+            # record arrival time, when requests arrive engine side
+            if req.arrival_time is None:
+                req.arrival_time = time.time()
+
             req_data = req.data
             if req_data.get('input_multimodals', None) is None:
                 continue
@@ -769,7 +790,8 @@ def _make_infer_outputs(self, next_token_ids: torch.LongTensor, running: SeqList
                               resp=msg.resp,
                               finish=finish,
                               token_ids=token_ids,
-                              cache_block_ids=cache_block_ids)
+                              cache_block_ids=cache_block_ids,
+                              events=msg.events)
             outputs[session_id] = out

             if msg.return_logits:
@@ -926,8 +948,16 @@ def __log_resps(outputs: List[InferOutput]):
         def __send_resp(out: InferOutput):
             """send response."""
             resp_type = (ResponseType.FINISH if out.finish else ResponseType.SUCCESS)
+
+            scheduler_stats = SchedulerStats(num_running_reqs=self.scheduler.num_running(),
+                                             num_waiting_reqs=self.scheduler.num_waiting(),
+                                             gpu_cache_usage=self.scheduler.usage)
+
             self._response(out.resp,
+                           resp_type,
+                           scheduler_stats=scheduler_stats,
+                           iteration_stats=None,
+                           events=out.events,
                            data=dict(token_ids=out.token_ids, logits=out.logits, cache_block_ids=out.cache_block_ids))

         def __send_resps(step_outputs: List[InferOutput]):
diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py
index 985d648310..ae8c7127cc 100644
--- a/lmdeploy/pytorch/engine/engine_instance.py
+++ b/lmdeploy/pytorch/engine/engine_instance.py
@@ -125,7 +125,11 @@ async def async_stream_infer(self,
             int: The number of the output tokens.
""" if len(input_ids) > self.max_input_len: - yield EngineOutput(ResponseType.INPUT_LENGTH_ERROR, [], 0) + yield EngineOutput(status=ResponseType.INPUT_LENGTH_ERROR, + token_ids=[], + num_token=0, + scheduler_stats=None, + iteration_stats=None) return gen_config = gen_config or GenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) @@ -146,24 +150,44 @@ async def async_stream_infer(self, while True: resp = await self.req_sender.async_recv(resp) + print(f'=> finally get resp {resp}') cache_block_ids = resp.data.get('cache_block_ids', None) if resp.type == ResponseType.SUCCESS: token_ids = resp.data['token_ids'].tolist() num_ids = len(token_ids) logger.debug(f'session[{session_id}] success: num_out_ids={num_ids}.') - yield EngineOutput(resp.type, token_ids, num_ids, cache_block_ids=cache_block_ids) + yield EngineOutput(status=resp.type, + token_ids=token_ids, + num_token=num_ids, + scheduler_stats=resp.scheduler_stats, + iteration_stats=resp.iteration_stats, + events=resp.events, + cache_block_ids=cache_block_ids) elif resp.type == ResponseType.FINISH: resp_data = resp.data token_ids = resp_data['token_ids'].tolist() logits = resp_data['logits'] num_ids = len(token_ids) logger.debug(f'session[{session_id}] finish: num_out_ids={num_ids}.') - yield EngineOutput(resp.type, token_ids, num_ids, logits=logits, cache_block_ids=cache_block_ids) + yield EngineOutput(status=resp.type, + token_ids=token_ids, + num_token=num_ids, + logits=logits, + scheduler_stats=resp.scheduler_stats, + iteration_stats=resp.iteration_stats, + events=resp.events, + cache_block_ids=cache_block_ids) break else: logger.debug(f'session[{session_id}] failed.') - yield EngineOutput(resp.type, [], 0) + yield EngineOutput(status=resp.type, + token_ids=[], + num_token=0, + scheduler_stats=resp.scheduler_stats, + iteration_stats=resp.iteration_stats, + events=resp.events) + # FIXME, should be None for scheduler_stats etc ? 
                 break

     async def async_infer(self,
diff --git a/lmdeploy/pytorch/engine/request.py b/lmdeploy/pytorch/engine/request.py
index c1c81912cc..0280442dbb 100644
--- a/lmdeploy/pytorch/engine/request.py
+++ b/lmdeploy/pytorch/engine/request.py
@@ -5,7 +5,8 @@
 from dataclasses import dataclass, field
 from typing import Any, Awaitable, Callable, Dict, List

-from lmdeploy.messages import ResponseType
+from lmdeploy.messages import EngineCoreEvent, ResponseType
+from lmdeploy.metrics.stats import IterationStats, SchedulerStats
 from lmdeploy.utils import get_logger

 logger = get_logger('lmdeploy')
@@ -32,6 +33,11 @@ class Response:
     data: Any = None
     err_msg: str = ''

+    # for logging
+    scheduler_stats: SchedulerStats = None
+    iteration_stats: IterationStats = None
+    events: List[EngineCoreEvent] = None
+

 @dataclass
 class Request:
@@ -42,6 +48,9 @@ class Request:
     data: Any = None
     resp: Response = None

+    # engine-side request arrival time
+    arrival_time: float = None
+

 ReqList = List[Request]

diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 831f7b3139..04a0db5a39 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -7,7 +7,7 @@
 import numpy as np
 from torch import Tensor

-from lmdeploy.messages import GenerationConfig, LogitsProcessor
+from lmdeploy.messages import EngineCoreEvent, EngineCoreEventType, GenerationConfig, LogitsProcessor
 from lmdeploy.pytorch.disagg.messages import MigrationExecutionBatch
 from lmdeploy.pytorch.disagg.request import MigrationRequest
 from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs
@@ -468,6 +468,9 @@ class SchedulerSequence:
     preserve_cache: bool = False
     migration_inputs: Optional[MigrationExecutionBatch] = None

+    # events for logging
+    events: List[EngineCoreEvent] = field(default_factory=list)
+
     def __post_init__(self):
         """post init."""
         self._num_history_ids: int = 0
@@ -657,3 +660,11 @@ def set_step(self, step: int):
         if self.history_multimodals is not None:
             self._num_history_cross = self.history_multimodals.get_encoder_len(0, self.num_history_ids)
         self._num_cross = self.history_multimodals.get_encoder_len(self._num_history_ids, num_all_ids)
+
+    def record_event(
+        self,
+        event_type: EngineCoreEventType,
+        timestamp: Optional[float] = None,
+    ) -> None:
+        self.events.append(EngineCoreEvent.new_event(event_type, timestamp))
diff --git a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py
index d8116e08f3..7f6b9ee693 100644
--- a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py
+++ b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py
@@ -283,6 +283,13 @@ def get_num_free_cpu_blocks(self) -> int:
         """Get number of free cpu blocks."""
         return self.allocator.get_phy_allocator('cpu').get_num_free_blocks()

+    def get_usage(self) -> float:
+        """Get the KV cache usage (between 0.0 and 1.0)."""
+        return 1.0 - (self.get_num_free_gpu_blocks() / self.num_gpu_blocks)
+
     def on_device(self, msg: SchedulerSequence, device: str):
         allocator = self.allocator
         logical_blocks = msg.logical_blocks
diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py
index bc62c14918..d168cf057e 100644
--- a/lmdeploy/pytorch/paging/scheduler.py
+++ b/lmdeploy/pytorch/paging/scheduler.py
@@ -1,10 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modify from: https://github.com/vllm-project/vllm
+import time
 from collections import OrderedDict
 from dataclasses import dataclass
 from typing import Dict, List, Tuple

+from lmdeploy.messages import EngineCoreEventType
 from lmdeploy.pytorch.disagg.messages import MigrationExecutionBatch
 from lmdeploy.utils import get_logger, logging_timer
@@ -52,6 +54,14 @@ def __init__(self, scheduler_config: SchedulerConfig, cache_config: CacheConfig)

         self.seq_manager = SequenceManager()

+    @property
+    def usage(self) -> float:
+        """Get the KV cache usage (between 0.0 and 1.0)."""
+        return self.block_manager.get_usage()
+
     @property
     def waiting(self):
         """get waiting sequence."""
@@ -136,6 +146,9 @@ def add_sequence(self, seq: SchedulerSequence):
         # push message to waiting queue
         self._set_message_status(seq, MessageStatus.WAITING)

+        if self.scheduler_config.enable_metrics:
+            seq.record_event(EngineCoreEventType.QUEUED)
+
     @logging_timer('ScheduleMigration', logger)
     def _schedule_migration(self):

@@ -227,6 +240,9 @@ def _reorder_waiting():
         waiting = _reorder_waiting()

         while len(waiting) > 0 and len(running) < max_batches:
+            # for logging
+            scheduled_timestamp = time.monotonic()
+
             seq = waiting.pop(0)
             if (len(running) > 0 and token_count + seq.num_token_ids > self.cache_config.max_prefill_token_num):
@@ -241,12 +257,19 @@ def _reorder_waiting():
             self.block_manager.allocate(seq)
             _to_running(seq)

+            if self.scheduler_config.enable_metrics:
+                seq.record_event(EngineCoreEventType.SCHEDULED, scheduled_timestamp)
+
         return running, swap_in_map, swap_out_map, copy_map

     @logging_timer('ScheduleDecoding', logger)
     def _schedule_decoding(self, prealloc_size: int = 0):
         """schedule decoding."""
+        # for logging
+        # FIXME, record request scheduled event
+        # scheduled_timestamp = time.monotonic()
+
         running = self.running
         assert len(running) != 0
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
index 19dfc0617f..028f29e57a 100644
--- a/lmdeploy/serve/async_engine.py
+++ b/lmdeploy/serve/async_engine.py
@@ -20,7 +20,10 @@
 from lmdeploy import Tokenizer
 from lmdeploy.archs import get_model_arch
 from lmdeploy.logger import RequestLogger
-from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig
+from lmdeploy.messages import (EngineOutput, GenerationConfig, PytorchEngineConfig, RequestState, Response,
+                               ResponseType, TurbomindEngineConfig)
+from lmdeploy.metrics.loggers import StatLoggerBase, setup_loggers
+from lmdeploy.metrics.stats import IterationStats, SchedulerStats
 from lmdeploy.model import MODELS, BaseChatTemplate, ChatTemplateConfig, best_match_model
 from lmdeploy.pytorch.disagg.request import DistServeConnectionRequest, DistServeInitRequest
 from lmdeploy.serve.utils import LogitsMixin
@@ -250,6 +253,7 @@ class AsyncEngine(LogitsMixin):
             Default to None.
         max_log_len (int): Max number of prompt characters or prompt tokens
             being printed in log. Default: Unlimited
+        enable_metrics (bool): Whether to log stats to CLI / Prometheus.
     """

     def __init__(self,
@@ -259,7 +263,16 @@ def __init__(self,
                  backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None,
                  chat_template_config: Optional[ChatTemplateConfig] = None,
                  max_log_len: int = None,
+                 enable_metrics: Optional[bool] = True,
                  **kwargs) -> None:
+
+        # setup stat loggers
+        backend_config.enable_metrics = enable_metrics
+        self.enable_metrics = enable_metrics
+        self.stat_loggers: List[List[StatLoggerBase]] = setup_loggers(enable_metrics=enable_metrics,
+                                                                      engine_num=backend_config.dp)
+
+        # setup chat template
         logger.info(f'input backend={backend}, backend_config={backend_config}')
         logger.info(f'input chat_template_config={chat_template_config}')
@@ -377,6 +390,11 @@ def __call__(self,
                             use_tqdm=use_tqdm,
                             **kwargs)

+    async def do_log_stats(self) -> None:
+        for each_engine_loggers in self.stat_loggers:
+            for stat_logger in each_engine_loggers:
+                stat_logger.log()
+
     async def stop_session(self, session_id: int):
         """Stop a session by a session_id."""
         logger.info(f'stop session {session_id}')
@@ -601,6 +619,40 @@ async def safe_run(self, inst, session_id, **kwargs):
         finally:
             await generator.aclose()

+    def _update_stats_from_output(self, req_state: RequestState, engine_core_output: EngineOutput,
+                                  iteration_stats: Optional[IterationStats]):
+        if iteration_stats is None:
+            return
+
+        assert req_state.stats is not None
+        iteration_stats.update_from_output(engine_core_output, engine_core_output.timestamp, req_state.is_prefilling,
+                                           req_state.prompt_len, req_state.stats)
+
+    def _update_stats_from_finished(self, req_state: RequestState, finish_reason: Optional[ResponseType],
+                                    iteration_stats: Optional[IterationStats]):
+
+        if iteration_stats is None:
+            return
+
+        assert finish_reason is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_finished_request(
+            finish_reason=finish_reason,
+            num_prompt_tokens=req_state.prompt_len,
+            # max_tokens_param=req_state.max_tokens_param,
+            req_stats=req_state.stats)
+
+    @staticmethod
+    def _record_stats(
+        stat_loggers: List[List[StatLoggerBase]],
+        scheduler_stats: SchedulerStats,
+        iteration_stats: Optional[IterationStats],
+    ):
+        for each_engine_loggers in stat_loggers:
+            for stat_logger in each_engine_loggers:
+                stat_logger.record(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats)
+
     async def generate(
         self,
         messages,
@@ -729,7 +781,10 @@ def is_error(status):
                                   step=history_len) as gen:
                 prev_len = 0
                 hit_stop_token = 0
+                iteration_stats = IterationStats() if self.enable_metrics else None
                 async for outputs in gen:
                     # decode res
                     if is_error(outputs.status):
                         break
@@ -782,9 +837,35 @@ def is_error(status):
                         if hit_stop_token:
                             out.logits = out.logits[:-hit_stop_token]

+                    # update stats from per iteration engine outputs (i.e. step output)
+                    req_state = RequestState(
+                        arrival_time=outputs.timestamp,
+                        prompt_len=input_len,
+                        is_prefilling=(output_len == 0),  # FIXME: is this logic correct ?
+                        enable_metrics=self.enable_metrics)
+                    self._update_stats_from_output(req_state=req_state,
+                                                   engine_core_output=outputs,
+                                                   iteration_stats=iteration_stats)

                     yield out
                 # end of generator loop

+                # update stats from per finished requests engine outputs
+                self._update_stats_from_finished(
+                    req_state=req_state,
+                    finish_reason=outputs.status,  # ResponseType
+                    iteration_stats=iteration_stats)
+
+                # perform logging
+                if self.stat_loggers:
+                    assert outputs.scheduler_stats is not None
+
+                    AsyncEngine._record_stats(
+                        self.stat_loggers,
+                        scheduler_stats=outputs.scheduler_stats,
+                        iteration_stats=iteration_stats,
+                    )
+
                 if not is_error(outputs.status):
                     finish_reason = 'length' \
                         if gen_len >= gen_config.max_new_tokens else 'stop'
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 78c607850a..c340d9efa6 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -9,6 +9,7 @@
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Literal, Optional, Union

+import prometheus_client
 import uvicorn
 from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
@@ -1137,6 +1138,7 @@ def serve(model_path: str,
           max_log_len: int = None,
           disable_fastapi_docs: bool = False,
           max_concurrent_requests: Optional[int] = None,
+          enable_metrics: Optional[bool] = True,
           reasoning_parser: Optional[str] = None,
           tool_call_parser: Optional[str] = None,
           allow_terminate_by_client: bool = False,
@@ -1189,6 +1191,7 @@ def serve(model_path: str,
             process the engine’s tasks once the maximum number of concurrent
             requests is reached, regardless of any additional requests sent by
             clients concurrently during that time. Default to None.
+        enable_metrics (bool): Whether to log stats to CLI / Prometheus.
         reasoning_parser (str): The reasoning parser name.
         tool_call_parser (str): The tool call parser name.
         allow_terminate_by_client (bool): Allow request from client to terminate server.
@@ -1197,30 +1200,6 @@ def serve(model_path: str,
     os.environ['TM_LOG_LEVEL'] = log_level
     logger.setLevel(log_level)

-    if disable_fastapi_docs:
-        app = FastAPI(
-            docs_url=None,
-            redoc_url=None,
-            openapi_url=None,
-        )
-    else:
-        app = FastAPI(docs_url='/')
-
-    app.include_router(router)
-
-    if allow_origins:
-        app.add_middleware(
-            CORSMiddleware,
-            allow_origins=allow_origins,
-            allow_credentials=allow_credentials,
-            allow_methods=allow_methods,
-            allow_headers=allow_headers,
-        )
-
-    # Set the maximum number of concurrent requests
-    if max_concurrent_requests is not None:
-        app.add_middleware(ConcurrencyLimitMiddleware, max_concurrent_requests=max_concurrent_requests)
-
     VariableInterface.allow_terminate_by_client = allow_terminate_by_client
     if api_keys is not None:
         if isinstance(api_keys, str):
@@ -1240,10 +1219,60 @@ def serve(model_path: str,
                                  backend_config=backend_config,
                                  chat_template_config=chat_template_config,
                                  max_log_len=max_log_len,
+                                 enable_metrics=enable_metrics,
                                  **kwargs)
     # set reasoning parser and tool parser
     set_parsers(reasoning_parser, tool_call_parser)

+    _running_tasks: set[asyncio.Task] = set()
+
+    async def lifespan(app: FastAPI):
+        async_engine = VariableInterface.async_engine
+        task = None
+        try:
+            if enable_metrics:
+                log_interval = 1.  # FIXME: change this
+
+                async def _force_log():
+                    while True:
+                        await asyncio.sleep(log_interval)
+
+                        await async_engine.do_log_stats()
+
+                task = asyncio.create_task(_force_log())
+                _running_tasks.add(task)
+                task.add_done_callback(_running_tasks.remove)
+
+            yield
+        finally:
+            if task:
+                task.cancel()
+
+    if disable_fastapi_docs:
+        app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None, lifespan=lifespan)
+    else:
+        app = FastAPI(docs_url='/', lifespan=lifespan)
+
+    app.include_router(router)
+
+    if enable_metrics:
+        # add prometheus asgi middleware to route '/metrics' requests
+        metrics_app = prometheus_client.make_asgi_app()
+        app.mount('/metrics', metrics_app)
+
+    if allow_origins:
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=allow_origins,
+            allow_credentials=allow_credentials,
+            allow_methods=allow_methods,
+            allow_headers=allow_headers,
+        )
+
+    # Set the maximum number of concurrent requests
+    if max_concurrent_requests is not None:
+        app.add_middleware(ConcurrencyLimitMiddleware, max_concurrent_requests=max_concurrent_requests)
+
     if proxy_url is not None:
         VariableInterface.proxy_url = proxy_url
         VariableInterface.api_server_url = f'{http_or_https}://{server_name}:{server_port}'  # noqa
diff --git a/requirements/runtime_cuda.txt b/requirements/runtime_cuda.txt
index 557a8ef2aa..d158919072 100644
--- a/requirements/runtime_cuda.txt
+++ b/requirements/runtime_cuda.txt
@@ -9,6 +9,7 @@ outlines
 partial_json_parser
 peft<=0.14.0
 pillow
+prometheus_client >= 0.18.0
 protobuf
 pydantic>2.0.0
 pynvml