33from collections .abc import Mapping , Sequence
44from dataclasses import dataclass
55from enum import StrEnum
6- from typing import Final , Self
6+ from typing import Final , Self , cast
7+ from uuid import UUID
78
9+ from ai .backend .common .clients .prometheus .preset import MetricPreset
810from ai .backend .common .clients .prometheus .types import MetricValue , ValueType
11+ from ai .backend .common .dto .clients .prometheus .response import (
12+ MetricResponseInfo ,
13+ PrometheusResponse ,
14+ )
15+ from ai .backend .common .exception import InvalidAPIParameters
916from ai .backend .common .types import KernelId
1017
# Public API of this module, kept in alphabetical order.
__all__ = [
    "ContainerLiveStatQueries",
    "ContainerMetricOptionalLabel",
    "ContainerMetricResponseInfo",
    "ContainerMetricResult",
    "DIFF_METRICS",
    "KernelLiveStatBatchResult",
    "KernelLiveStatEntry",
    "KernelMetricValuesByKernel",
    "MetricResultValue",
    "MetricType",
    "MetricValue",
    "RATE_METRICS",
    "ValueType",
]
2033
2134
22- class UtilizationMetricType (StrEnum ):
35+ class MetricType (StrEnum ):
2336 """
2437 Specifies the type of a metric value.
2538 """
@@ -41,13 +54,81 @@ class UtilizationMetricType(StrEnum):
4154 """
4255
4356
# Metric-name -> MetricType classification rules.
# TODO: Refactor to query metric metadata from the repository layer once
# the metadata persistence is available.
60+
61+
@dataclass(frozen=True)
class ContainerLiveStatQueries:
    """Gauge / diff / rate query preset bundle for container live stats.

    The instance is immutable; each field holds the preset for one kind
    of value.
    """

    gauge: MetricPreset
    diff: MetricPreset
    rate: MetricPreset

    def to_list(self) -> list[MetricPreset]:
        """Return the presets as a new list ordered gauge, diff, rate."""
        return [self.gauge, self.diff, self.rate]
72+
73+
# Metric names classified as "diff" and as "rate" values, respectively.
# NOTE(review): presumably any metric listed in neither set is treated as a
# gauge - confirm against the code that consumes these sets.
DIFF_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
RATE_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"})
4976
5077
78+ @dataclass
79+ class ContainerMetricResponseInfo :
80+ value_type : str
81+ container_metric_name : str | None
82+ agent_id : str | None
83+ instance : str | None
84+ job : str | None
85+ kernel_id : str | None
86+ owner_project_id : str | None
87+ owner_user_id : str | None
88+ session_id : str | None
89+
90+ @classmethod
91+ def from_metric_response_info (cls , info : MetricResponseInfo ) -> Self :
92+ if info .value_type is None :
93+ raise InvalidAPIParameters (
94+ f"Missing required label 'value_type' for container metric (metric={ info .name !r} )"
95+ )
96+ return cls (
97+ value_type = info .value_type ,
98+ container_metric_name = info .container_metric_name ,
99+ agent_id = info .agent_id ,
100+ instance = info .instance ,
101+ job = info .job ,
102+ kernel_id = info .kernel_id ,
103+ owner_project_id = info .owner_project_id ,
104+ owner_user_id = info .owner_user_id ,
105+ session_id = info .session_id ,
106+ )
107+
108+
@dataclass
class MetricResultValue:
    """A single (timestamp, value) sample of a metric series."""

    timestamp: float
    # The sample value is kept as a string rather than parsed into a number.
    value: str
113+
114+
@dataclass
class ContainerMetricOptionalLabel:
    """Label values for a container metric; only ``value_type`` is required.

    NOTE(review): presumably the optional fields act as label filters when
    left unset means "do not filter" - confirm against the caller.
    """

    value_type: ValueType

    # Optional labels; each defaults to ``None`` when not supplied.
    agent_id: str | None = None
    kernel_id: UUID | None = None
    session_id: UUID | None = None
    user_id: UUID | None = None
    project_id: UUID | None = None
124+
125+
@dataclass
class ContainerMetricResult:
    """One container metric series: its label info plus its samples."""

    metric: ContainerMetricResponseInfo
    values: list[MetricResultValue]
130+
131+
51132@dataclass (frozen = True )
52133class KernelLiveStatEntry :
53134 """All live_stat samples belonging to a single kernel.
@@ -61,7 +142,7 @@ class KernelLiveStatEntry:
61142
62143@dataclass (frozen = True )
63144class KernelLiveStatBatchResult :
64- # Per-kernel batch result for `query_kernel_live_stat_batch `
145+ # Per-kernel bulk result for `query_container_live_stats `
65146
66147 entries : dict [KernelId , KernelLiveStatEntry ]
67148
@@ -84,3 +165,43 @@ def from_metric_values(
84165 for kid in kernel_ids
85166 }
86167 )
168+
169+
170+ @dataclass (frozen = True )
171+ class KernelMetricValuesByKernel :
172+ values_by_kernel : dict [KernelId , list [MetricValue ]]
173+
174+ @classmethod
175+ def from_prometheus_response (cls , response : PrometheusResponse ) -> Self :
176+ grouped : dict [KernelId , list [MetricValue ]] = {}
177+ for metric in response .data .result :
178+ info = metric .metric
179+ if not info .has_container_metric_labels or not metric .values :
180+ continue
181+ # Non-None guaranteed by has_container_metric_labels above;
182+ # cast needed because property checks don't narrow types.
183+ kernel_id_str = cast (str , info .kernel_id )
184+ container_metric_name = cast (str , info .container_metric_name )
185+ value_type_str = cast (str , info .value_type )
186+ try :
187+ value_type = ValueType (value_type_str )
188+ kernel_id = KernelId (UUID (kernel_id_str ))
189+ except ValueError :
190+ continue
191+ # Instant queries are normalized into a one-element list, and range
192+ # queries are ordered by time, so the last sample is the newest one.
193+ _ , raw_value = metric .values [- 1 ]
194+ grouped .setdefault (kernel_id , []).append (
195+ MetricValue (
196+ metric_name = container_metric_name ,
197+ value_type = value_type ,
198+ value = raw_value ,
199+ )
200+ )
201+ return cls (values_by_kernel = grouped )
202+
203+ def merged_with (self , other : Self ) -> Self :
204+ merged = {kernel_id : list (values ) for kernel_id , values in self .values_by_kernel .items ()}
205+ for kernel_id , values in other .values_by_kernel .items ():
206+ merged .setdefault (kernel_id , []).extend (values )
207+ return type (self )(values_by_kernel = merged )
0 commit comments