diff --git a/.github/scripts/check_lmdeploy.py b/.github/scripts/check_lmdeploy.py index 04456cba7c..808fe7c4b3 100644 --- a/.github/scripts/check_lmdeploy.py +++ b/.github/scripts/check_lmdeploy.py @@ -18,6 +18,8 @@ def check_module_init(root: str): continue elif d.startswith('lmdeploy/lib'): continue + elif d.startswith('lmdeploy/monitoring'): + continue elif d.startswith('lmdeploy/serve/turbomind/triton_models'): continue elif d.startswith('lmdeploy/serve/turbomind/triton_python_backend'): diff --git a/docs/en/advance/metrics.md b/docs/en/advance/metrics.md new file mode 100644 index 0000000000..1f05807d9a --- /dev/null +++ b/docs/en/advance/metrics.md @@ -0,0 +1,179 @@ +# Production Metrics + +LMDeploy exposes a set of metrics via Prometheus, and provides visualization via Grafana. + +## Setup Guide + +This section describes how to set up the monitoring stack (Prometheus + Grafana) provided in the `lmdeploy/monitoring` directory. + +## Prerequisites + +- [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) installed + +- LMDeploy server running with metrics system enabled + +## Usage (DP = 1) + +1. **Start your LMDeploy server with metrics enabled** + +``` +lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics +``` + +Replace the model path according to your needs. +By default, the metrics endpoint will be available at `http://:23333/metrics`. + +2. **Navigate to the monitoring directory** + +``` +cd lmdeploy/monitoring +``` + +3. **Start the monitoring stack** + +``` +docker compose up +``` + +This command will start Prometheus and Grafana in the background. + +4. **Access the monitoring interfaces** + +- Prometheus: Open your web browser and go to http://localhost:9090. + +- Grafana: Open your web browser and go to http://localhost:3000. + +5. **Log in to Grafana** + +- Default Username: `admin` + +- Default Password: `admin` You will be prompted to change the password upon your first login. + +6. **View the Dashboard** + +The LMDeploy dashboard is pre-configured and should be available automatically. + +## Usage (DP > 1) + +1. **Start your LMDeploy server with metrics enabled** + +As an example, we use the model `Qwen/Qwen2.5-7B-Instruct` with `DP=2, TP=2`. Start the service as follows: + +```bash +# Proxy server +lmdeploy serve proxy --server-port 8000 --routing-strategy 'min_expected_latency' --serving-strategy Hybrid --log-level INFO + +# API server +LMDEPLOY_DP_MASTER_ADDR=127.0.0.1 \ +LMDEPLOY_DP_MASTER_PORT=29555 \ +lmdeploy serve api_server \ + Qwen/Qwen2.5-7B-Instruct \ + --backend pytorch \ + --tp 2 \ + --dp 2 \ + --proxy-url http://0.0.0.0:8000 \ + --nnodes 1 \ + --node-rank 0 \ + --enable-metrics +``` + +You should be able to see multiple API servers added to the proxy server list. Details can be found in `lmdeploy/serve/proxy/proxy_config.json`. + +For example, you may have the following API servers: + +``` +http://$host_ip:$api_server_port1 + +http://$host_ip:$api_server_port2 +``` + +2. **Modify the Prometheus configuration** + +When `DP > 1`, LMDeploy will launch one API server for each DP rank. If you want to monitor a specific API server, e.g. `http://$host_ip:$api_server_port1`, modify the configuration file `lmdeploy/monitoring/prometheus.yaml` as follows. 
+ +> Note that you should use the actual host machine IP instead of `127.0.0.1` here, since LMDeploy starts the API server using the actual host IP when `DP > 1` + +``` +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: lmdeploy + static_configs: + - targets: + - '$host_ip:$api_server_port1' # <= Modify this +``` + +3. **Navigate to the monitoring folder and perform the same steps as described above** + +## Troubleshooting + +1. **Port conflicts** + +Check if any services are occupying ports `23333` (LMDeploy server port), `9090` (Prometheus port), or `3000` (Grafana port). You can either stop the conflicting running ports or modify the config files as follows: + +- Modify LMDeploy server port for Prometheus scrape + +In `lmdeploy/monitoring/prometheus.yaml` + +``` +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: lmdeploy + static_configs: + - targets: + - '127.0.0.1:23333' # <= Modify this LMDeploy server port 23333, need to match the running server port +``` + +- Modify Prometheus port + +In `lmdeploy/monitoring/grafana/datasources/datasource.yaml` + +``` +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:9090 # <= Modify this Prometheus interface port 9090 + isDefault: true + editable: false +``` + +- Modify Grafana port: + +In `lmdeploy/monitoring/docker-compose.yaml`, for example, change the port to `3090` + +Option 1: Add `GF_SERVER_HTTP_PORT` to the environment section. + +``` + environment: +- GF_AUTH_ANONYMOUS_ENABLED=true +- GF_SERVER_HTTP_PORT=3090 # <= Add this line +``` + +Option 2: Use port mapping. + +``` +grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3090:3000" # <= Host:Container port mapping +``` + +2. **No data on the dashboard** + +- Create traffic + +Try to send some requests to the LMDeploy server to create certain traffic + +``` +python3 benchmark/profile_restful_api.py --backend lmdeploy --num-prompts 5000 --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json +``` + +After refreshing, you should be able to see data on the dashboard. diff --git a/docs/en/index.rst b/docs/en/index.rst index ef6fa5a54d..57ba76bdaa 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -104,6 +104,7 @@ Documentation advance/structed_output.md advance/pytorch_multinodes.md advance/pytorch_profiling.md + advance/metrics.md .. toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/advance/metrics.md b/docs/zh_cn/advance/metrics.md new file mode 100644 index 0000000000..7eb4eec5cc --- /dev/null +++ b/docs/zh_cn/advance/metrics.md @@ -0,0 +1,176 @@ +# 生产环境指标监控 + +LMDeploy 通过 Prometheus 暴露监控指标,并通过 Grafana 提供可视化界面。 + +## 配置指南 + +本节介绍如何设置 `lmdeploy/monitoring` 目录中提供的监控套件(Prometheus + Grafana) + +## 前提条件 + +- 已安装 [Docker](https://docs.docker.com/engine/install/) 和 [Docker Compose](https://docs.docker.com/compose/install/) + +- 已启用指标系统的 LMDeploy 服务正在运行 + +## 使用说明 (DP = 1) + +1. **启动已启用指标的 LMDeploy 服务** + +``` +lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics +``` + +请根据需求替换模型路径。默认 metrics endpoint 位于 `http://:23333/metrics`。 + +2. **进入监控目录** + +``` +cd lmdeploy/monitoring +``` + +3. **启动监控套件** + +``` +docker compose up +``` + +此命令将在后台启动 Prometheus 和 Grafana。 + +4. **访问监控界面** + +- Prometheus:浏览器访问 http://localhost:9090. + +- Grafana:浏览器访问 http://localhost:3000. + +5. **登录 Grafana** + +- 默认用户名:`admin` + +- 默认密码:`admin` (首次登录后会提示修改密码) + +6. **查看仪表盘** + +预配置的 LMDeploy 仪表盘将自动加载。 + +## 使用说明 (DP > 1) + +1. 
**启动已启用指标的 LMDeploy 服务** + +以模型 `Qwen/Qwen2.5-7B-Instruct` 为例,使用 `DP=2,TP=2` 启动服务: + +```bash +# Proxy server +lmdeploy serve proxy --server-port 8000 --routing-strategy 'min_expected_latency' --serving-strategy Hybrid --log-level INFO + +# API server +LMDEPLOY_DP_MASTER_ADDR=127.0.0.1 \ +LMDEPLOY_DP_MASTER_PORT=29555 \ +lmdeploy serve api_server \ + Qwen/Qwen2.5-7B-Instruct \ + --backend pytorch \ + --tp 2 \ + --dp 2 \ + --proxy-url http://0.0.0.0:8000 \ + --nnodes 1 \ + --node-rank 0 \ + --enable-metrics +``` + +您应该能在代理服务器列表中看到多个 API 服务实例。详细信息可以在 `lmdeploy/serve/proxy/proxy_config.json` 中找到。 + +例如,您可能会看到如下 API 服务地址: + +``` +http://$host_ip:$api_server_port1 + +http://$host_ip:$api_server_port2 +``` + +2. **修改 Prometheus 配置** + +当 DP > 1 时,LMDeploy 会为每个 DP Rank 启动一个 API 服务。如果你想监控其中某个 API 服务,例如:`http://$host_ip:$api_server_port1`,请修改配置文件 `lmdeploy/monitoring/prometheus.yaml` 如下所示。 + +> 注意:这里应使用实际主机的 IP 地址而非 127.0.0.1,因为当 DP > 1 时,LMDeploy 是通过实际主机 IP 启动 API 服务的。 + +``` +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: lmdeploy + static_configs: + - targets: + - '$host_ip:$api_server_port1' # <= 修改此处 +``` + +3. **进入监控目录并执行上述相同步骤** + +## 故障排除 + +1. **端口冲突** + +检查端口 `23333` (LMDeploy 服务端口)、`9090` (Prometheus 端口) 或 `3000` (Grafana 端口) 是否被占用。解决方案,关闭冲突的端口或如下修改配置文件: + +- 修改 Prometheus 抓取的 LMDeploy 服务端口 + +在 `lmdeploy/monitoring/prometheus.yaml` 中 + +``` +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: lmdeploy + static_configs: + - targets: + - '127.0.0.1:23333' # <= 修改此处的 LMDeploy 服务端口 23333,需与实际运行端口一致 +``` + +- 修改 Prometheus 端口 + +在 `lmdeploy/monitoring/grafana/datasources/datasource.yaml` 中 + +``` +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:9090 # <= 修改此处的 Prometheus 接口端口 9090 + isDefault: true + editable: false +``` + +- 修改 Grafana 端口 + +在 `lmdeploy/monitoring/docker-compose.yaml` 中操作(例如改为 3090 端口): + +方案一:在环境变量中添加 `GF_SERVER_HTTP_PORT` + +``` + environment: +- GF_AUTH_ANONYMOUS_ENABLED=true +- GF_SERVER_HTTP_PORT=3090 # <= 添加此行 +``` + +方案二:使用端口映射 + +``` +grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3090:3000" # <= 主机端口:容器端口映射 +``` + +- **仪表盘无数据** + +尝试向 LMDeploy 服务发送请求生成流量: + +``` +python3 benchmark/profile_restful_api.py --backend lmdeploy --num-prompts 5000 --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json +``` + +刷新后仪表盘应显示数据。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 35b6b74184..db10cbce13 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -105,6 +105,7 @@ LMDeploy 工具箱提供以下核心功能: advance/structed_output.md advance/pytorch_multinodes.md advance/pytorch_profiling.md + advance/metrics.md .. 
toctree:: :maxdepth: 1 diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 994ac1ea05..3250e6edc4 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -169,6 +169,7 @@ def add_parser_api_server(): ArgumentHelper.ep(pt_group) ArgumentHelper.enable_microbatch(pt_group) ArgumentHelper.enable_eplb(pt_group) + ArgumentHelper.enable_metrics(pt_group) ArgumentHelper.role(pt_group) ArgumentHelper.migration_backend(pt_group) # multi-node serving args @@ -333,6 +334,7 @@ def api_server(args): max_prefill_token_num=args.max_prefill_token_num, enable_microbatch=args.enable_microbatch, enable_eplb=args.enable_eplb, + enable_metrics=args.enable_metrics, role=EngineRole[args.role], migration_backend=MigrationBackend[args.migration_backend], model_format=args.model_format) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index dc5767bea9..5c6ca0b478 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -557,6 +557,11 @@ def enable_eplb(parser): return parser.add_argument('--enable-eplb', action='store_true', help='enable eplb for specified model') + @staticmethod + def enable_metrics(parser): + """Add argument enable_metrics to parser.""" + parser.add_argument('--enable-metrics', action='store_true', default=False, help='enable metrics system') + # For Disaggregation @staticmethod def role(parser): diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index d067d2f7ce..f8efcff8a2 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum +import time from dataclasses import dataclass, field from typing import Callable, Dict, List, Literal, Optional @@ -314,6 +315,7 @@ class PytorchEngineConfig: it to True if you want to update weights after create the pipeline enable_microbatch (bool): enable microbatch for specified model enable_eplb (bool): enable eplb for specified model + enable_metrics (bool): enable metrics system role (EngineRole): role of engin, options: ['Hybrid', 'Prefill', 'Decode']. Default to `EngineRole.Hybrid`. migration_backend: migration backend. options: ['DLSlime']. @@ -349,6 +351,7 @@ class PytorchEngineConfig: enable_eplb: bool = False enable_mp_engine: bool = False model_format: str = None + enable_metrics: bool = False role: EngineRole = EngineRole.Hybrid migration_backend: MigrationBackend = MigrationBackend.DLSlime @@ -422,6 +425,45 @@ class Response: index: int = 0 +# copy from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py +class EngineCoreEventType(enum.IntEnum): + """The type of engine core request event. + + QUEUED - when the request was received by the engine core and added to the scheduler queue + SCHEDULED - when the request was first scheduled for execution + PREEMPTED - the request has been put back in the waiting queue in order to make room for other requests to complete. + It will be re-scheduled in future and re-start its prefill phase + """ + QUEUED = 1 + SCHEDULED = 2 + PREEMPTED = 3 # FIXME, currently ignored for simplicity + + +# copy from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py +@dataclass +class EngineCoreEvent(): + """A timestamped engine core event associated with a request. + + The timestamp is a monotonic timestamps and is used for by the engine frontend to calculate intervals between engine + core events. These timestamps should not be compared with timestamps from other processes. 
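+
+    Example::
+
+        # timestamp defaults to time.perf_counter() when not provided
+        event = EngineCoreEvent.new_event(EngineCoreEventType.QUEUED)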
+ """ + type: EngineCoreEventType + timestamp: float + + @classmethod + def new_event(cls, event_type: EngineCoreEventType, timestamp: Optional[float] = None) -> 'EngineCoreEvent': + timestamp = time.perf_counter() if timestamp is None else timestamp + return cls(event_type, timestamp) + + +@dataclass +class MetricsInfo: + """Metrics info from the inference engine.""" + engine_core_timestamp: float = 0.0 + engine_core_events: List[EngineCoreEvent] = field(default_factory=list) + scheduler_raw_info: dict = field(default_factory=dict) + + @dataclass class EngineOutput: """Engine output for turbomind/pytorch engine. @@ -435,6 +477,7 @@ class EngineOutput: position. cache_block_ids (List[int]): send cache blocks back for migration in Disaggregated LLM Serving when Prefill Engine is Done. + metrics_info (MetricsInfo): metrics info from the inference engine. """ status: ResponseType token_ids: List[int] @@ -444,6 +487,7 @@ class EngineOutput: last_hidden_state: torch.Tensor = None cache_block_ids: Optional[List[int]] = None + metrics_info: Optional[MetricsInfo] = None @dataclass diff --git a/lmdeploy/metrics/__init__.py b/lmdeploy/metrics/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/lmdeploy/metrics/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py new file mode 100644 index 0000000000..226b633d66 --- /dev/null +++ b/lmdeploy/metrics/loggers.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from: https://github.com/vllm-project/vllm/blob/main/vllm/v1/metrics/loggers.py + +import time +from abc import ABC, abstractmethod +from datetime import datetime +from typing import List, Optional + +import numpy as np + +from lmdeploy.metrics.stats import IterationStats, SchedulerStats +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + + +class StatLoggerBase(ABC): + + @abstractmethod + def record(self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]): + ... + + def log(self): # noqa + pass + + +class LoggingStatLogger(StatLoggerBase): + + def __init__(self, dp_rank: int = 0): + self.dp_rank = dp_rank + self._reset(time.perf_counter()) + self.last_scheduler_stats = SchedulerStats() + + def _reset(self, now): + self.last_log_time = now + + # Tracked stats over current local logging interval. + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + + def _track_iteration_stats(self, iteration_stats: IterationStats): + # Save tracked stats for token counters. + self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens) + self.num_generation_tokens.append(iteration_stats.num_generation_tokens) + + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + # Compute summary metrics for tracked stats + return float(np.sum(tracked_stats) / (now - self.last_log_time)) + + def record(self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]): + """Log Stats to standard output.""" + + if iteration_stats: + self._track_iteration_stats(iteration_stats) + + self.last_scheduler_stats = scheduler_stats + + def log(self): + now = time.perf_counter() + prompt_throughput = self._get_throughput(self.num_prompt_tokens, now) + generation_throughput = self._get_throughput(self.num_generation_tokens, now) + + self._reset(now) + + scheduler_stats = self.last_scheduler_stats + + # Format and print output. 
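+        # The message combines per-interval average throughputs with the latest scheduler
+        # snapshot (finished/unfinished/running/waiting requests and GPU KV cache usage).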
+ log_msg = (f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} " + f'DP{self.dp_rank}] ' + f'Avg prompt throughput: {prompt_throughput:.1f} tokens/s, ' + f'Avg generation throughput: {generation_throughput:.1f} tokens/s, ' + f'Finished: {scheduler_stats.num_finished_reqs} reqs, ' + f'Unfinished: {scheduler_stats.num_total_reqs-scheduler_stats.num_finished_reqs} reqs, ' + f'Running: {scheduler_stats.num_running_reqs} reqs, ' + f'Waiting: {scheduler_stats.num_waiting_reqs} reqs, ' + f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%') + print(log_msg) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0): + try: + import prometheus_client + prometheus_client.disable_created_metrics() # disable noisy creation timestamp gauge in prometheus + except ImportError: + raise ImportError( + 'To use metrics system , please install prometheus_client by `pip install prometheus_client`') + + self.dp_rank = dp_rank + + # Unregister any existing lmdeploy collectors + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, '_name') and 'lmdeploy' in collector._name: + prometheus_client.REGISTRY.unregister(collector) + + # Config information + self.info_backend_config = prometheus_client.Info(name='lmdeploy:backend_config', + documentation='information of backend_config') + + labelnames = ['model_name', 'engine'] + labelvalues = [model_name, str(dp_rank)] + + # + # Scheduler state + # + self.gauge_scheduler_finished = prometheus_client.Gauge(name='lmdeploy:num_requests_finished', + documentation='Number of current finished requests.', + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_unfinished = prometheus_client.Gauge( + name='lmdeploy:num_requests_unfinished', + documentation='Number of current unfinished requests.', + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_running = prometheus_client.Gauge( + name='lmdeploy:num_requests_running', + documentation='Number of requests in model execution batches.', + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name='lmdeploy:num_requests_waiting', + documentation='Number of requests waiting to be processed.', + labelnames=labelnames).labels(*labelvalues) + + # + # GPU cache + # + self.gauge_gpu_cache_usage = prometheus_client.Gauge( + name='lmdeploy:gpu_cache_usage_perc', + documentation='GPU KV-cache usage. 
1 means 100 percent usage.', + labelnames=labelnames).labels(*labelvalues) + + # + # Counters + # + self.counter_prompt_tokens = prometheus_client.Counter(name='lmdeploy:prompt_tokens_total', + documentation='Number of prefill tokens processed.', + labelnames=labelnames).labels(*labelvalues) + + self.counter_generation_tokens = prometheus_client.Counter( + name='lmdeploy:generation_tokens_total', + documentation='Number of generation tokens processed.', + labelnames=labelnames).labels(*labelvalues) + + from lmdeploy.messages import ResponseType + self.counter_request_success: dict[ResponseType, prometheus_client.Counter] = {} + counter_request_success_base = prometheus_client.Counter( + name='lmdeploy:request_success_total', + documentation='Count of successfully processed requests.', + labelnames=labelnames + ['finished_reason']) + for reason in ResponseType: + self.counter_request_success[reason] = counter_request_success_base.labels(*(labelvalues + [str(reason)])) + + # + # Histograms of counts + # + self.histogram_num_prompt_tokens_request = \ + prometheus_client.Histogram( + name='lmdeploy:request_prompt_tokens', + documentation='Number of prefill tokens processed.', + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_num_generation_tokens_request = \ + prometheus_client.Histogram( + name='lmdeploy:request_generation_tokens', + documentation='Number of generation tokens processed.', + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_iteration_tokens = \ + prometheus_client.Histogram( + name='lmdeploy:iteration_tokens_total', + documentation='Histogram of number of tokens per engine_step.', + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, + 16384 + ], + labelnames=labelnames).labels(*labelvalues) + + # + # Histogram of timing intervals + # + self.histogram_time_to_first_token = \ + prometheus_client.Histogram( + name='lmdeploy:time_to_first_token_seconds', + documentation='Histogram of time to first token in seconds.', + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, + 640.0, 2560.0 + ], + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_per_output_token = \ + prometheus_client.Histogram( + name='lmdeploy:time_per_output_token_seconds', + documentation='Histogram of time per output token in seconds.', + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames).labels(*labelvalues) + + request_latency_buckets = [ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, + 960.0, 1920.0, 7680.0 + ] + self.histogram_e2e_time_request = \ + prometheus_client.Histogram( + name='lmdeploy:e2e_request_latency_seconds', + documentation='Histogram of e2e request latency in seconds.', + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_queue_time_request = \ + prometheus_client.Histogram( + name='lmdeploy:request_queue_time_seconds', + documentation='Histogram of time spent in WAITING phase for request.', + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_inference_time_request = \ + prometheus_client.Histogram( + name='lmdeploy:request_inference_time_seconds', + documentation='Histogram of time spent in RUNNING phase for 
request.', + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_prefill_time_request = \ + prometheus_client.Histogram( + name='lmdeploy:request_prefill_time_seconds', + documentation='Histogram of time spent in PREFILL phase for request.', + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_decode_time_request = \ + prometheus_client.Histogram( + name='lmdeploy:request_decode_time_seconds', + documentation='Histogram of time spent in DECODE phase for request.', + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + + def record(self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]): + """Log to prometheus.""" + + self.gauge_scheduler_finished.set(scheduler_stats.num_finished_reqs) + self.gauge_scheduler_unfinished.set(scheduler_stats.num_total_reqs - scheduler_stats.num_finished_reqs) + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + + if iteration_stats is None: + return + + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) + self.counter_generation_tokens.inc(iteration_stats.num_generation_tokens) + self.histogram_iteration_tokens.observe(iteration_stats.num_prompt_tokens + + iteration_stats.num_generation_tokens) + + for ttft in iteration_stats.time_to_first_tokens_iter: + self.histogram_time_to_first_token.observe(ttft) + + for tpot in iteration_stats.time_per_output_tokens_iter: + self.histogram_time_per_output_token.observe(tpot) + + for finished_request in iteration_stats.finished_requests: + self.counter_request_success[finished_request.finish_reason].inc() + self.histogram_e2e_time_request.observe(finished_request.e2e_latency) + self.histogram_queue_time_request.observe(finished_request.queued_time) + self.histogram_prefill_time_request.observe(finished_request.prefill_time) + self.histogram_inference_time_request.observe(finished_request.inference_time) + self.histogram_decode_time_request.observe(finished_request.decode_time) + self.histogram_num_prompt_tokens_request.observe(finished_request.num_prompt_tokens) + self.histogram_num_generation_tokens_request.observe(finished_request.num_generation_tokens) + + +def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: + """Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values until the value exceeds the specified maximum.""" + exponent = 0 + buckets: List[int] = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 + + +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + return build_buckets([1, 2, 5], max_value) diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py new file mode 100644 index 0000000000..c0b4183e51 --- /dev/null +++ b/lmdeploy/metrics/metrics_processor.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
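+# This module provides the process-wide metrics context (MetricsContext managed by MetricsManager)
+# and the MetricsProcessor, whose background task consumes (input_len, prev_len, outputs, req_state,
+# iteration_stats) tuples enqueued via queue_update() and records the resulting stats to the
+# configured stat loggers.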
+import asyncio +from contextlib import contextmanager +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from lmdeploy.utils import get_logger + +from .stats import IterationStats, RequestState, SchedulerStats + +if TYPE_CHECKING: + from lmdeploy.messages import EngineOutput + +logger = get_logger('lmdeploy') + + +@dataclass +class MetricsContext: + enable_metrics: bool = False + scheduler_stats: SchedulerStats = SchedulerStats() + + +class MetricsManager: + + def __init__(self): + """Initialize metrics manager.""" + self._current_ctx = MetricsContext() + + def set_context(self, ctx: MetricsContext): + """Set metrics context.""" + self._current_ctx = ctx + + def get_context(self): + """Get current context.""" + return self._current_ctx + + @contextmanager + def context(self, ctx: MetricsContext): + """Context manager.""" + old_ctx = self.get_context() + self.set_context(ctx) + try: + yield ctx + finally: + self.set_context(old_ctx) + + +_METRICS_MANAGER = None + + +def get_metrics_manager(): + global _METRICS_MANAGER + if _METRICS_MANAGER is None: + _METRICS_MANAGER = MetricsManager() + + return _METRICS_MANAGER + + +# Metrics getters +def is_metrics_enabled(): + return get_metrics_manager().get_context().enable_metrics + + +def get_current_metrics_context(): + return get_metrics_manager().get_context() + + +def get_current_scheduler_stats(): + return get_metrics_manager().get_context().scheduler_stats + + +# Metrics setters +def set_metrics_enabled_flag(enable_metrics: bool): + """Set metrics enabled flag.""" + ctx = get_current_metrics_context() + ctx.enable_metrics = enable_metrics + + if enable_metrics: + logger.info('Metrics are enabled.') + + +def increment_async_engine_scheduler_stats_total_req(): + """Set scheduler stats in async engine.""" + get_current_scheduler_stats().num_total_reqs += 1 + + +def increment_async_engine_scheduler_stats_finished_req(): + """Set scheduler stats in async engine.""" + get_current_scheduler_stats().num_finished_reqs += 1 + + +# Metrics processor +class MetricsProcessor(): + """Metrics processor.""" + + def __init__(self): + self.metrics_queue: asyncio.Queue = None + self.metrics_handler: asyncio.Task = None + + def start_metrics_handler(self, enable_metrics: bool): + set_metrics_enabled_flag(enable_metrics) + + if enable_metrics and self.metrics_handler is None: + self.metrics_queue = asyncio.Queue() + self.metrics_handler = asyncio.create_task(self._run_metrics_handler()) + logger.info('Metrics handler task started.') + + async def stop_metrics_handler(self): + if self.metrics_handler is not None: + self.metrics_handler.cancel() + try: + await self.metrics_handler + except asyncio.CancelledError: + pass # Expected cancellation + finally: + self.metrics_handler = None + logger.info('Metrics handler task stopped.') + + async def _run_metrics_handler(self): + """A background task that consumes and processes metrics data.""" + while True: + try: + # fetch + update_data = await self.metrics_queue.get() + input_len, prev_len, outputs, req_state, iteration_stats = update_data + + # compute + self._update_stats(input_len, prev_len, outputs, req_state, iteration_stats) + + # record + scheduler_stats = get_current_scheduler_stats() + for stat_logger in self.stat_loggers: + stat_logger.record(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) + + self.metrics_queue.task_done() + except asyncio.CancelledError: + break + except Exception as e: + logger.exception(f'Metrics handler background task failed: {e}') + + def 
queue_update(self, update_data: tuple): + if not is_metrics_enabled() or self.metrics_queue is None: + return + + self.metrics_queue.put_nowait(update_data) + + def increment_total_requests(self): + increment_async_engine_scheduler_stats_total_req() + + def increment_finished_requests(self): + increment_async_engine_scheduler_stats_finished_req() + + def _update_stats(self, input_len: int, prev_len: int, outputs: 'EngineOutput', req_state: RequestState, + iteration_stats: IterationStats): + from lmdeploy.messages import ResponseType + + status = outputs.status + metrics_info = outputs.metrics_info + scheduler_raw_info = metrics_info.scheduler_raw_info + + # update scheduler stats + scheduler_stats = get_current_scheduler_stats() + # actual running requests + scheduler_stats.num_running_reqs = scheduler_raw_info['locked'] + # waiting to be scheduled + scheduled to running but haven't started yet + scheduler_stats.num_waiting_reqs = scheduler_raw_info['waiting'] + scheduler_raw_info['running'] + scheduler_stats.gpu_cache_usage = 1.0 - (scheduler_raw_info['free_gpu_blocks'] / + scheduler_raw_info['total_gpu_blocks']) + + # update from per-iteration outputs + iteration_stats.update_from_output(engine_core_timestamp=metrics_info.engine_core_timestamp, + engine_core_events=metrics_info.engine_core_events, + num_prompt_tokens=input_len, + num_new_generation_tokens=(outputs.num_token - prev_len), + is_prefilling=(prev_len == 0), + req_stats=req_state.stats) + + # update from finished request + if status is ResponseType.FINISH: + iteration_stats.update_from_finished_request(finish_reason=status, + num_prompt_tokens=input_len, + req_stats=req_state.stats) + + req_state.is_prefilling = False # change to decode after first update + + +metrics_processor = MetricsProcessor() diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py new file mode 100644 index 0000000000..eba75da8d8 --- /dev/null +++ b/lmdeploy/metrics/stats.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
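+# Data containers for the metrics system: SchedulerStats (engine-level counters and gauges),
+# RequestState / RequestStateStats (per-request timestamps), FinishedRequestStats (per-request
+# summary at completion) and IterationStats (per-engine-step aggregates).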
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/v1/metrics/stats.py + +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, List, Optional + +if TYPE_CHECKING: + from lmdeploy.messages import EngineCoreEvent, ResponseType + + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + # total / finished reqs in async engine + num_total_reqs: int = 0 + num_finished_reqs: int = 0 + + # running / waiting reqs in PyTorch engine + num_running_reqs: int = 0 + num_waiting_reqs: int = 0 + + # GPU cache usage in PyTorch engine + gpu_cache_usage: float = 0.0 + + def __repr__(self): + """Return a human-readable string representation.""" + return ('SchedulerStats(\n' + f' num_total_reqs={self.num_total_reqs},\n' + f' num_finished_reqs={self.num_finished_reqs},\n' + f' num_running_reqs={self.num_running_reqs},\n' + f' num_waiting_reqs={self.num_waiting_reqs},\n' + f' gpu_cache_usage={self.gpu_cache_usage:.6f},\n' + ')') + + +class RequestState: + """State of a request.""" + + def __init__(self, arrival_time: float = None, prompt_len: int = 0, is_prefilling: Optional[bool] = True): + self.arrival_time = time.perf_counter() if arrival_time is None else arrival_time + self.prompt_len = prompt_len + self.is_prefilling = is_prefilling + self.stats = RequestStateStats(arrival_time=self.arrival_time) + + def __repr__(self): + """Return a human-readable string representation.""" + return ('RequestState(\n' + f' arrival_time={self.arrival_time:.6f},\n' + f' prompt_len={self.prompt_len},\n' + f' is_prefilling={self.is_prefilling},\n' + f' stats.num_generation_tokens={self.stats.num_generation_tokens},\n' + f' stats.arrival_time={self.stats.arrival_time:.6f},\n' + f' stats.queued_ts={self.stats.queued_ts:.6f},\n' + f' stats.scheduled_ts={self.stats.scheduled_ts:.6f},\n' + f' tats.first_token_ts={self.stats.first_token_ts:.6f},\n' + f' stats.last_token_ts={self.stats.last_token_ts:.6f},\n' + ')') + + +@dataclass +class RequestStateStats: + """Stats that need to be tracked across delta updates.""" + + num_generation_tokens: int = 0 + + # This is a engine frontend timestamp + arrival_time: float = 0.0 + + # These are engine core timestamps + queued_ts: float = 0.0 + scheduled_ts: float = 0.0 + first_token_ts: float = 0.0 + last_token_ts: float = 0.0 + + def __repr__(self): + """Return a human-readable string representation.""" + return ('RequestStateStats(\n' + f' num_generation_tokens={self.num_generation_tokens},\n' + f' arrival_time={self.arrival_time:.6f},\n' + f' queued_ts={self.queued_ts:.6f},\n' + f' scheduled_ts={self.scheduled_ts:.6f},\n' + f' first_token_ts={self.first_token_ts:.6f},\n' + f' last_token_ts={self.last_token_ts:.6f}\n' + ')') + + +@dataclass +class FinishedRequestStats: + """Stats associated with a finished request.""" + + finish_reason: 'ResponseType' + e2e_latency: float = 0.0 + num_prompt_tokens: int = 0 + num_generation_tokens: int = 0 + queued_time: float = 0.0 + prefill_time: float = 0.0 + inference_time: float = 0.0 + decode_time: float = 0.0 + + def __repr__(self): + """Return a human-readable string representation.""" + return ('FinishedRequestStats(\n' + f' finish_reason={self.finish_reason},\n' + f' e2e_latency={self.e2e_latency:.6f},\n' + f' num_prompt_tokens={self.num_prompt_tokens},\n' + f' num_generation_tokens={self.num_generation_tokens},\n' + f' queued_time={self.queued_time:.6f},\n' + f' prefill_time={self.prefill_time:.6f},\n' + f' inference_time={self.inference_time:.6f},\n' + f' 
decode_time={self.decode_time:.6f}\n' + ')') + + +class IterationStats: + """Stats associated with a single set of EngineCoreOutputs.""" + + def __init__(self): + self.iteration_timestamp = time.perf_counter() + self.num_generation_tokens = 0 + self.num_prompt_tokens = 0 + self.finished_requests: list[FinishedRequestStats] = [] + self.time_to_first_tokens_iter: list[float] = [] + self.time_per_output_tokens_iter: list[float] = [] + + def __repr__(self): + """Return a human-readable string representation.""" + return ('IterationStats(\n' + f' iteration_timestamp={self.iteration_timestamp:.6f},\n' + f' num_generation_tokens={self.num_generation_tokens},\n' + f' num_prompt_tokens={self.num_prompt_tokens},\n' + f' finished_requests_count={len(self.finished_requests)},\n' + f' time_to_first_tokens_iter={self.time_to_first_tokens_iter},\n' + f' time_per_output_tokens_iter={self.time_per_output_tokens_iter},\n' + ')') + + def _time_since(self, start: float) -> float: + """Calculate an interval relative to this iteration's timestamp.""" + return self.iteration_timestamp - start + + def update_from_output(self, engine_core_timestamp: float, engine_core_events: List['EngineCoreEvent'], + num_prompt_tokens: int, num_new_generation_tokens: int, is_prefilling: bool, + req_stats: RequestStateStats): + + self.num_generation_tokens += num_new_generation_tokens + if is_prefilling: + assert num_new_generation_tokens > 0 + self.num_prompt_tokens += num_prompt_tokens + + first_token_latency = self._time_since(req_stats.arrival_time) + assert first_token_latency > 0.0, f'TTFT cannot be negative: {first_token_latency:.6f}' + self.time_to_first_tokens_iter.append(first_token_latency) + + req_stats.num_generation_tokens += num_new_generation_tokens + + # Process request-level engine core events + if engine_core_events is not None: + self.update_from_events(engine_core_events, req_stats) + + # Process the batch-level "new tokens" engine core event + if is_prefilling: + req_stats.first_token_ts = engine_core_timestamp + else: + tpot = engine_core_timestamp - req_stats.last_token_ts + assert tpot > 0.0, f'TPOT cannot be negative: {tpot:.6f}' + self.time_per_output_tokens_iter.append(tpot) + + req_stats.last_token_ts = engine_core_timestamp + + def update_from_events(self, engine_core_events: List['EngineCoreEvent'], req_stats: RequestStateStats): + # Avoid circular dependency + from lmdeploy.messages import EngineCoreEventType + + for event in engine_core_events: + if event.type == EngineCoreEventType.QUEUED: + req_stats.queued_ts = event.timestamp + elif event.type == EngineCoreEventType.SCHEDULED: + if req_stats.scheduled_ts == 0.0: # ignore preemptions + req_stats.scheduled_ts = event.timestamp + # FIXME: deal with preempted case + # elif event.type == EngineCoreEventType.PREEMPTED: + # self.num_preempted_reqs += 1 + + def update_from_finished_request(self, finish_reason: 'ResponseType', num_prompt_tokens: int, + req_stats: RequestStateStats): + + e2e_latency = self._time_since(req_stats.arrival_time) + + # Queued interval is from first QUEUED event to first SCHEDULED + queued_time = req_stats.scheduled_ts - req_stats.queued_ts + + # Prefill interval is from first SCHEDULED to first NEW_TOKEN + # Any preemptions during prefill is included in the interval + prefill_time = req_stats.first_token_ts - req_stats.scheduled_ts + + # Decode interval is from first NEW_TOKEN to last NEW_TOKEN + # Any preemptions during decode are included + decode_time = req_stats.last_token_ts - req_stats.first_token_ts + + # Inference 
interval is from first SCHEDULED to last NEW_TOKEN + # Any preemptions during prefill or decode are included + inference_time = req_stats.last_token_ts - req_stats.scheduled_ts + + finished_req = \ + FinishedRequestStats(finish_reason=finish_reason, + e2e_latency=e2e_latency, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=req_stats.num_generation_tokens, + queued_time=queued_time, + prefill_time=prefill_time, + inference_time=inference_time, + decode_time=decode_time) + self.finished_requests.append(finished_req) diff --git a/lmdeploy/monitoring/docker-compose.yaml b/lmdeploy/monitoring/docker-compose.yaml new file mode 100644 index 0000000000..bd7e0e4817 --- /dev/null +++ b/lmdeploy/monitoring/docker-compose.yaml @@ -0,0 +1,29 @@ +# copy from https://github.com/sgl-project/sglang/blob/main/examples/monitoring/docker-compose.yaml +version: '3' +services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + network_mode: host + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + + grafana: + image: grafana/grafana:latest + container_name: grafana + network_mode: host + volumes: + - ./grafana/datasources:/etc/grafana/provisioning/datasources + - ./grafana/dashboards/config:/etc/grafana/provisioning/dashboards + - ./grafana/dashboards/json:/var/lib/grafana/dashboards + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + - GF_AUTH_BASIC_ENABLED=false + - GF_USERS_ALLOW_SIGN_UP=false + - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/lmdeploy-dashboard.json + depends_on: + - prometheus diff --git a/lmdeploy/monitoring/grafana/dashboards/config/dashboard.yaml b/lmdeploy/monitoring/grafana/dashboards/config/dashboard.yaml new file mode 100644 index 0000000000..ea1a52325e --- /dev/null +++ b/lmdeploy/monitoring/grafana/dashboards/config/dashboard.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'LMDeploy' + orgId: 1 + folder: 'LMDeploy Monitoring' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards diff --git a/lmdeploy/monitoring/grafana/dashboards/json/lmdeploy-dashboard.json b/lmdeploy/monitoring/grafana/dashboards/json/lmdeploy-dashboard.json new file mode 100644 index 0000000000..7f29f92af7 --- /dev/null +++ b/lmdeploy/monitoring/grafana/dashboards/json/lmdeploy-dashboard.json @@ -0,0 +1,1163 @@ +{ + "_comment": "json file adapted from https://github.com/vllm-project/vllm/blob/main/examples/online_serving/prometheus_grafana/grafana.json", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Monitoring LMDeploy Inference Server", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "End to end request latency measured in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(lmdeploy:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(lmdeploy:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(lmdeploy:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(lmdeploy:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(lmdeploy:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(lmdeploy:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "E2E Request Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of tokens processed per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(lmdeploy:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Prompt Tokens/Sec", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(lmdeploy:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Generation Tokens/Sec", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Token Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Inter token latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) 
(rate(lmdeploy:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(lmdeploy:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(lmdeploy:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(lmdeploy:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(lmdeploy:time_per_output_token_seconds_bucket_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(lmdeploy:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Mean", + "range": true, + "refId": "E" + } + ], + "title": "Time Per Output Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "lmdeploy:num_requests_finished{model_name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Finished", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "lmdeploy:num_requests_unfinished{model_name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Unfinished", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "lmdeploy:num_requests_running{model_name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Running", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "lmdeploy:num_requests_waiting{model_name=\"$model_name\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Waiting", + "range": true, + "refId": "D", + "useBackend": false + } + ], + "title": "Scheduler State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "P50, P90, P95, and P99 TTFT latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(lmdeploy:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) 
(rate(lmdeploy:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(lmdeploy:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(lmdeploy:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(lmdeploy:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(lmdeploy:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time To First Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percentage of used cache blocks by LMDeploy.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "lmdeploy:gpu_cache_usage_perc{model_name=\"$model_name\"}", + "instant": false, + "legendFormat": "GPU Cache Usage", + "range": true, + "refId": "A" + } + ], + "title": "Cache Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was 
reached.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(finished_reason) (increase(lmdeploy:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Finish Reason", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(lmdeploy:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "prometheus", + "value": 
"edx8memhpd9tsa" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct", + "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(model_name)", + "hide": 0, + "includeAll": false, + "label": "model_name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "query": "label_values(model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "LMDeploy", + "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", + "version": 8, + "weekStart": "" +} diff --git a/lmdeploy/monitoring/grafana/datasources/datasource.yaml b/lmdeploy/monitoring/grafana/datasources/datasource.yaml new file mode 100644 index 0000000000..1ab0e4a5fd --- /dev/null +++ b/lmdeploy/monitoring/grafana/datasources/datasource.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false diff --git a/lmdeploy/monitoring/prometheus.yaml b/lmdeploy/monitoring/prometheus.yaml new file mode 100644 index 0000000000..3307d2bd4c --- /dev/null +++ b/lmdeploy/monitoring/prometheus.yaml @@ -0,0 +1,10 @@ +# prometheus.yaml +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: lmdeploy + static_configs: + - targets: + - '127.0.0.1:23333' diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index bba4c8276a..9fd87647f7 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -9,7 +9,7 @@ import numpy as np import torch -from lmdeploy.messages import PytorchEngineConfig, ResponseType +from lmdeploy.messages import MetricsInfo, PytorchEngineConfig, ResponseType from lmdeploy.pytorch.disagg.config import EngineRole from lmdeploy.pytorch.disagg.messages import MigrationExecutionBatch from lmdeploy.utils import get_logger, get_max_batch_size, get_model, logging_timer @@ -46,6 +46,9 @@ class InferOutput: # when Prefill Engine is Done. 
cache_block_ids: List[int] = None + # for logging + metrics_info: MetricsInfo = None + def _tensorlize_block_offsets(block_offsets, dtype=torch.int32): """Tensorlize block_offsets.""" @@ -771,8 +774,8 @@ def update_running_migration(self, running: SeqList, next_token_ids: np.ndarray, msg.update_token_ids(update_token, model_meta=model_meta) msg.status = MessageStatus.STOPPED - def _make_infer_outputs(self, next_token_ids: torch.LongTensor, running: SeqList, logits: torch.Tensor, - stopped: torch.Tensor, model_metas: List[Dict[str, Any]]): + def _make_infer_outputs(self, new_token_timestamp: float, next_token_ids: torch.LongTensor, running: SeqList, + logits: torch.Tensor, stopped: torch.Tensor, model_metas: List[Dict[str, Any]]): """Make infer output.""" seq_length = [seq.num_token_ids for seq in running] @@ -794,11 +797,13 @@ def _make_infer_outputs(self, next_token_ids: torch.LongTensor, running: SeqList cache_block_ids = self.scheduler.block_manager.get_block_table(msg).tolist() else: cache_block_ids = None + metrics_info = MetricsInfo(new_token_timestamp, msg.engine_core_events, self.scheduler.make_stats()) out = InferOutput(session_id=session_id, resp=msg.resp, finish=finish, token_ids=token_ids, - cache_block_ids=cache_block_ids) + cache_block_ids=cache_block_ids, + metrics_info=metrics_info) outputs[session_id] = out if msg.return_logits: @@ -932,7 +937,10 @@ def __send_resp(out: InferOutput): resp_type = (ResponseType.FINISH if out.finish else ResponseType.SUCCESS) self._response(out.resp, resp_type, - data=dict(token_ids=out.token_ids, logits=out.logits, cache_block_ids=out.cache_block_ids)) + data=dict(token_ids=out.token_ids, + logits=out.logits, + cache_block_ids=out.cache_block_ids, + metrics_info=out.metrics_info)) def __send_resps(step_outputs: List[InferOutput]): """Send response callback.""" diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 4a10d08480..1a52357f5c 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -147,19 +147,29 @@ async def async_stream_infer(self, while True: resp = await self.req_sender.async_recv(resp) - cache_block_ids = resp.data.get('cache_block_ids', None) + cache_block_ids = resp.data.get('cache_block_ids', None) if resp.data else None + metrics_info = resp.data.get('metrics_info', None) if resp.data else None if resp.type == ResponseType.SUCCESS: token_ids = resp.data['token_ids'].tolist() num_ids = len(token_ids) logger.debug(f'session[{session_id}] success: num_out_ids={num_ids}.') - yield EngineOutput(resp.type, token_ids, num_ids, cache_block_ids=cache_block_ids) + yield EngineOutput(resp.type, + token_ids, + num_ids, + cache_block_ids=cache_block_ids, + metrics_info=metrics_info) elif resp.type == ResponseType.FINISH: resp_data = resp.data token_ids = resp_data['token_ids'].tolist() logits = resp_data['logits'] num_ids = len(token_ids) logger.debug(f'session[{session_id}] finish: num_out_ids={num_ids}.') - yield EngineOutput(resp.type, token_ids, num_ids, logits=logits, cache_block_ids=cache_block_ids) + yield EngineOutput(resp.type, + token_ids, + num_ids, + logits=logits, + cache_block_ids=cache_block_ids, + metrics_info=metrics_info) break else: logger.debug(f'session[{session_id}] failed.') diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 233425b37e..49614d521f 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -2,6 +2,7 @@ 
import asyncio import base64 import functools +import time from contextlib import asynccontextmanager, contextmanager from multiprocessing.reduction import ForkingPickler from typing import Any, Dict @@ -776,6 +777,7 @@ async def get_output_async(self): with torch.cuda.stream(self.out_stream), torch.inference_mode(), record_function('outputs_D2H'): out['next_token_ids'] = out['next_token_ids'].cpu() out['stopped'] = out['stopped'].cpu() + out['new_token_timestamp'] = time.perf_counter() if out['logits'] is not None: out['logits'] = out['logits'].cpu() return out diff --git a/lmdeploy/pytorch/engine/request.py b/lmdeploy/pytorch/engine/request.py index b39519d970..186daac9b7 100644 --- a/lmdeploy/pytorch/engine/request.py +++ b/lmdeploy/pytorch/engine/request.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, field from typing import Any, Awaitable, Callable, Dict, List -from lmdeploy.messages import ResponseType +from lmdeploy.messages import MetricsInfo, ResponseType from lmdeploy.utils import get_logger logger = get_logger('lmdeploy') @@ -31,6 +31,7 @@ class Response: event: asyncio.Event data: Any = None err_msg: str = '' + metrics_info: MetricsInfo = None @dataclass diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index 64d0e2d70e..f447b70dc9 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -7,7 +7,7 @@ import numpy as np from torch import Tensor -from lmdeploy.messages import GenerationConfig, LogitsProcessor +from lmdeploy.messages import EngineCoreEvent, EngineCoreEventType, GenerationConfig, LogitsProcessor from lmdeploy.pytorch.disagg.request import MigrationRequest from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs from lmdeploy.utils import get_logger @@ -250,7 +250,7 @@ def add_sequence(self, num_new_tokens=0, sampling_param=sampling_param, adapter_name=adapter_name, - arrive_time=time.time(), + arrive_time=time.perf_counter(), history_embeddings=HistoryEmbeddings(input_embeddings), history_multimodals=HistoryMultiModals(multimodals), return_logits=return_logits, @@ -468,6 +468,9 @@ class SchedulerSequence: resp_cache: bool = False preserve_cache: bool = False + # For logging + engine_core_events: List[EngineCoreEvent] = field(default_factory=list) + def __post_init__(self): """Post init.""" self._num_history_ids: int = 0 @@ -636,7 +639,7 @@ def update_token_ids(self, self._num_token_ids = len(token_ids) self.history_cache.append(token_ids) self.random_offsets += 1 - self.arrive_time = time.time() + self.arrive_time = time.perf_counter() def set_step(self, step: int): """Set step.""" @@ -657,3 +660,10 @@ def set_step(self, step: int): if self.history_multimodals is not None: self._num_history_cross = self.history_multimodals.get_encoder_len(0, self.num_history_ids) self._num_cross = self.history_multimodals.get_encoder_len(self._num_history_ids, num_all_ids) + + def record_event( + self, + event_type: EngineCoreEventType, + timestamp: Optional[float] = None, + ) -> None: + self.engine_core_events.append(EngineCoreEvent.new_event(event_type, timestamp)) diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py index 8ba8bd9c68..f25f2c07e6 100644 --- a/lmdeploy/pytorch/paging/scheduler.py +++ b/lmdeploy/pytorch/paging/scheduler.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from typing import Dict, List +from lmdeploy.messages import EngineCoreEventType from lmdeploy.utils import get_logger, logging_timer from ..config import CacheConfig, SchedulerConfig @@ -135,6 +136,8 
@@ def add_sequence(self, seq: SchedulerSequence): # push message to waiting queue self._set_message_status(seq, MessageStatus.WAITING) + seq.record_event(EngineCoreEventType.QUEUED) + @logging_timer('ScheduleMigration', logger) def _schedule_migration(self): running_migration: SeqList = [] @@ -226,6 +229,8 @@ def _reorder_waiting(): self.block_manager.allocate(seq) _to_running(seq) + seq.record_event(EngineCoreEventType.SCHEDULED) + return running, swap_in_map, swap_out_map, copy_map @logging_timer('ScheduleDecoding', logger) @@ -398,3 +403,13 @@ def collect_migration_done(self): migration_done = self.migration_done for seq in migration_done: self._set_message_status(seq, MessageStatus.RUNNING) + + def make_stats(self): + """Make stats.""" + return { + 'running': self.num_running(), + 'waiting': self.num_waiting(), + 'locked': self.num_locked(), + 'free_gpu_blocks': self.block_manager.get_num_free_gpu_blocks(), + 'total_gpu_blocks': self.block_manager.num_gpu_blocks + } diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 000933ee81..218b5f28aa 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -21,6 +21,8 @@ from lmdeploy.archs import get_model_arch from lmdeploy.logger import RequestLogger from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig +from lmdeploy.metrics.metrics_processor import metrics_processor +from lmdeploy.metrics.stats import IterationStats, RequestState from lmdeploy.model import MODELS, BaseChatTemplate, ChatTemplateConfig, best_match_model from lmdeploy.pytorch.disagg.request import DistServeConnectionRequest, DistServeInitRequest from lmdeploy.serve.utils import LogitsMixin @@ -302,6 +304,9 @@ def __init__(self, self.internal_thread = _EventLoopThread(daemon=True) self.limiter: asyncio.Semaphore = None + # build stat loggers + self._build_stat_loggers() + def close(self): self.internal_thread.close() self.free_insts = None @@ -346,6 +351,22 @@ def _build_pytorch(self, self.backend_config = self.engine.engine_config self.hf_tm_cfg = getattr(self.engine.model_config, 'hf_config', None) + def _build_stat_loggers(self): + self.stat_loggers = [] + + if getattr(self.backend_config, 'enable_metrics', False): + from lmdeploy.metrics.loggers import LoggingStatLogger, PrometheusStatLogger + dp_rank = self.backend_config.dp_rank if self.backend_config.dp else 0 + + logger.info(f'enable metrics, with dp: {self.backend_config.dp} dp_rank: {dp_rank}') + self.stat_loggers = [ + LoggingStatLogger(dp_rank=dp_rank), + PrometheusStatLogger(model_name=self.model_name, max_model_len=self.session_len, dp_rank=dp_rank) + ] + + # set stats loggers of metrics processor + metrics_processor.stat_loggers = self.stat_loggers + def __call__(self, prompts: Union[List[str], str, List[Dict], List[List[Dict]]], gen_config: Optional[GenerationConfig] = None, @@ -377,6 +398,11 @@ def __call__(self, use_tqdm=use_tqdm, **kwargs) + async def do_log_stats(self): + # loop through CLI logger and Prometheus logger + for stat_logger in self.stat_loggers: + stat_logger.log() + async def stop_session(self, session_id: int): """Stop a session by a session_id.""" logger.info(f'stop session {session_id}') @@ -718,6 +744,7 @@ def is_error(status): if skip_stop_tokens and not gen_config.ignore_eos: stop_ids = gen_config.stop_token_ids or [] + metrics_processor.increment_total_requests() async with self.model_inst(session_id) as inst: token_ids = input_ids.copy() history_len = 
self.id2step[session_id] @@ -738,12 +765,15 @@ def is_error(status): step=history_len) as gen: prev_len = 0 hit_stop_token = 0 + req_state = RequestState(prompt_len=input_len) # per-request state async for outputs in gen: + iteration_stats = IterationStats() # per-iteration stats # decode res if is_error(outputs.status): break output_len = outputs.num_token + metrics_processor.queue_update((input_len, prev_len, outputs, req_state, iteration_stats)) if hit_stop_token or prev_len == output_len: continue @@ -793,6 +823,7 @@ def is_error(status): yield out # end of generator loop + metrics_processor.increment_finished_requests() if not is_error(outputs.status): finish_reason = 'length' \ @@ -804,7 +835,7 @@ def is_error(status): response = '' logger.info(f'session {session_id} finished, reason ' f'"{finish_reason}", input_tokens ' - f'{len(input_ids)}, outupt_tokens {gen_len}') + f'{len(input_ids)}, output_tokens {gen_len}') yield GenOut(response, self.id2step[session_id], len(input_ids), diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 99e6b4f895..972e37e353 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -4,7 +4,9 @@ import copy import json import os +import re import time +from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Literal, Optional, Union @@ -17,9 +19,11 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer from starlette.middleware.base import BaseHTTPMiddleware +from starlette.routing import Mount from lmdeploy.archs import get_task from lmdeploy.messages import GenerationConfig, LogitsProcessor, PytorchEngineConfig, TurbomindEngineConfig +from lmdeploy.metrics.metrics_processor import metrics_processor from lmdeploy.model import ChatTemplateConfig from lmdeploy.pytorch.disagg.config import DistServeEngineConfig from lmdeploy.pytorch.disagg.request import DistServeConnectionRequest, DistServeInitRequest, MigrationRequest @@ -1201,6 +1205,50 @@ def set_parsers(reasoning_parser: Optional[str] = None, tool_parser: Optional[st ) +def mount_metrics(app: FastAPI, backend_config: Union[PytorchEngineConfig, TurbomindEngineConfig]): + if not getattr(backend_config, 'enable_metrics', False): + return + + from prometheus_client import REGISTRY, make_asgi_app + registry = REGISTRY + + # Add prometheus asgi middleware to route /metrics requests + metrics_route = Mount('/metrics', make_asgi_app(registry=registry)) + + # Workaround for 307 Redirect for /metrics + metrics_route.path_regex = re.compile('^/metrics(?P<path>.*)$') + app.routes.append(metrics_route) + + +def create_lifespan_handler(backend_config: Union[PytorchEngineConfig, TurbomindEngineConfig], + async_engine: AsyncEngine): + """Factory function to create a lifespan handler.""" + + @asynccontextmanager + async def lifespan_handler(app: FastAPI): + task = None + try: + if getattr(backend_config, 'enable_metrics', False): + metrics_processor.start_metrics_handler(enable_metrics=True) + log_interval = 10. 
+ + async def _force_log(): + while True: + await asyncio.sleep(log_interval) + + await async_engine.do_log_stats() + + task = asyncio.create_task(_force_log()) + + yield + finally: + if task: + task.cancel() + await metrics_processor.stop_metrics_handler() + + return lifespan_handler + + def serve(model_path: str, model_name: Optional[str] = None, backend: Literal['turbomind', 'pytorch'] = 'turbomind', @@ -1279,31 +1327,6 @@ def serve(model_path: str, os.environ['TM_LOG_LEVEL'] = log_level logger.setLevel(log_level) - if disable_fastapi_docs: - app = FastAPI( - docs_url=None, - redoc_url=None, - openapi_url=None, - ) - else: - app = FastAPI(docs_url='/') - - app.include_router(router) - app.add_exception_handler(RequestValidationError, validation_exception_handler) - - if allow_origins: - app.add_middleware( - CORSMiddleware, - allow_origins=allow_origins, - allow_credentials=allow_credentials, - allow_methods=allow_methods, - allow_headers=allow_headers, - ) - - # Set the maximum number of concurrent requests - if max_concurrent_requests is not None: - app.add_middleware(ConcurrencyLimitMiddleware, max_concurrent_requests=max_concurrent_requests) - VariableInterface.allow_terminate_by_client = allow_terminate_by_client if api_keys is not None: if isinstance(api_keys, str): @@ -1329,6 +1352,31 @@ def serve(model_path: str, # set reasoning parser and tool parser set_parsers(reasoning_parser, tool_call_parser) + # create FastAPI lifespan events + lifespan = create_lifespan_handler(backend_config, VariableInterface.async_engine) + + if disable_fastapi_docs: + app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None, lifespan=lifespan) + else: + app = FastAPI(docs_url='/', lifespan=lifespan) + + app.include_router(router) + app.add_exception_handler(RequestValidationError, validation_exception_handler) + mount_metrics(app, backend_config) + + if allow_origins: + app.add_middleware( + CORSMiddleware, + allow_origins=allow_origins, + allow_credentials=allow_credentials, + allow_methods=allow_methods, + allow_headers=allow_headers, + ) + + # set the maximum number of concurrent requests + if max_concurrent_requests is not None: + app.add_middleware(ConcurrencyLimitMiddleware, max_concurrent_requests=max_concurrent_requests) + if proxy_url is not None: VariableInterface.proxy_url = proxy_url VariableInterface.api_server_url = f'{http_or_https}://{server_name}:{server_port}' # noqa