[Feature] metrics support #3534


Draft · wants to merge 10 commits into main

9 changes: 8 additions & 1 deletion lmdeploy/cli/serve.py
@@ -125,6 +125,11 @@ def add_parser_api_server():
'engine’s tasks once the maximum number of concurrent requests is '
'reached, regardless of any additional requests sent by clients '
'concurrently during that time. Default to None.')
# FIXME: change default value to False
parser.add_argument('--enable-metrics',
action='store_true',
default=True,
help='Whether to log stats to CLI / Prometheus')
# common args
ArgumentHelper.backend(parser)
ArgumentHelper.log_level(parser)
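
A note on the FIXME above: because the new flag combines action='store_true' with default=True, parsing can only ever yield True, so metrics cannot be switched off from the command line until the default is changed to False. A minimal argparse sketch (plain Python, nothing lmdeploy-specific assumed) showing the behaviour:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--enable-metrics',
                    action='store_true',
                    default=True,
                    help='Whether to log stats to CLI / Prometheus')
print(parser.parse_args([]).enable_metrics)                    # True
print(parser.parse_args(['--enable-metrics']).enable_metrics)  # True; the flag cannot disable it
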
@@ -272,7 +277,8 @@ def gradio(args):
device_type=args.device,
quant_policy=args.quant_policy,
eager_mode=args.eager_mode,
max_prefill_token_num=args.max_prefill_token_num)
max_prefill_token_num=args.max_prefill_token_num,
enable_metrics=args.enable_metrics)
else:
backend_config = TurbomindEngineConfig(dtype=args.dtype,
tp=args.tp,
@@ -369,6 +375,7 @@ def api_server(args):
max_log_len=args.max_log_len,
disable_fastapi_docs=args.disable_fastapi_docs,
max_concurrent_requests=args.max_concurrent_requests,
enable_metrics=args.enable_metrics,
reasoning_parser=args.reasoning_parser,
tool_call_parser=args.tool_call_parser)

53 changes: 53 additions & 0 deletions lmdeploy/messages.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import enum
import time
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Literal, Optional

@@ -9,6 +10,7 @@
from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend
from lmdeploy.pytorch.disagg.request import MigrationRequest

from .metrics.stats import IterationStats, RequestStateStats, SchedulerStats
from .tokenizer import Tokenizer
from .utils import get_logger

Expand Down Expand Up @@ -310,6 +312,7 @@ class PytorchEngineConfig:
'Decode']. Default to `EngineRole.Hybrid`.
migration_backend: migration backend. options: ['DLSlime'].
Default to `MigrationBackend.DLSlime`.
enable_metrics (bool): Whether to log stats to CLI / Prometheus.
"""
dtype: str = 'auto'
tp: int = 1
@@ -338,6 +341,7 @@ class PytorchEngineConfig:

role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime
enable_metrics: bool = False

def __post_init__(self):
"""Check input validation."""
@@ -407,6 +411,34 @@ class Response:
last_hidden_state: torch.Tensor = None
index: int = 0

scheduler_stats: SchedulerStats = None
iteration_stats: IterationStats = None
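
With metrics enabled, a Response can now carry the latest scheduler and iteration statistics alongside the generated output. A small, hypothetical helper (the field names come from this diff; whether they are populated depends on the serving path):

def log_response_stats(resp: 'Response') -> None:
    # Both fields default to None, so guard before logging them.
    if resp.scheduler_stats is not None:
        print(resp.scheduler_stats)
    if resp.iteration_stats is not None:
        print(resp.iteration_stats)
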


# copy from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py
class EngineCoreEventType(enum.IntEnum):
"""The type of engine core request event."""
QUEUED = 1
SCHEDULED = 2
PREEMPTED = 3 # FIXME, currently ignored for simplicity


# copy from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py
@dataclass
class EngineCoreEvent():
"""A timestamped engine core event associated with a request.

The timestamp is a monotonic timestamp and is used by the engine frontend to calculate intervals between engine
core events. These timestamps should not be compared with timestamps from other processes.
"""
type: EngineCoreEventType
timestamp: float

@classmethod
def new_event(cls, event_type: EngineCoreEventType, timestamp: Optional[float] = None) -> 'EngineCoreEvent':
timestamp = time.monotonic() if timestamp is None else timestamp
return cls(event_type, timestamp)
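
The point of using monotonic timestamps is that the frontend can subtract two events recorded in the same engine process to obtain an interval. A short usage sketch (hypothetical call sites, using only the classes defined above):

queued = EngineCoreEvent.new_event(EngineCoreEventType.QUEUED)
# ... the request waits in the scheduler queue ...
scheduled = EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED)
queue_interval = scheduled.timestamp - queued.timestamp  # seconds spent queued
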


@dataclass
class EngineOutput:
@@ -431,6 +463,27 @@ class EngineOutput:

cache_block_ids: Optional[List[int]] = None

# engine-side timestamp, for logging
timestamp: float = 0.0
scheduler_stats: SchedulerStats = None
iteration_stats: IterationStats = None
events: List[EngineCoreEvent] = None

def __post_init__(self):
if self.timestamp == 0.0:
self.timestamp = time.monotonic()


@dataclass
class RequestState:
"""per request state."""

def __init__(self, arrival_time: float, prompt_len: int, is_prefilling: bool, enable_metrics: bool):

self.prompt_len: int = prompt_len
self.is_prefilling: bool = is_prefilling
self.stats = RequestStateStats(arrival_time=arrival_time) if enable_metrics else None
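
A sketch of how the frontend might construct per-request state when a request arrives (the call site and values are illustrative, not taken from this diff); with enable_metrics=False the stats object is simply skipped:

import time

state = RequestState(arrival_time=time.monotonic(),
                     prompt_len=32,
                     is_prefilling=True,
                     enable_metrics=True)
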


@dataclass
class VisionConfig:
1 change: 1 addition & 0 deletions lmdeploy/metrics/__init__.py
@@ -0,0 +1 @@
# Copyright (c) OpenMMLab. All rights reserved.