Skip to content

Commit 58c123e

Browse files
seedspirit and claude committed
feat(BA-5878): emit stats.rate window queries for io_read/io_write/net_rx/net_tx
Legacy live_stat consumers — most importantly the legacy_compute_session.net_rx_bytes / net_tx_bytes / io_read_bytes / io_write_bytes GraphQL resolvers — read the "stats.rate" key from the per-metric live_stat dict. The new PromQL pipeline previously emitted nothing under that label, leaving those four legacy fields uncovered:

- io_read / io_write: the legacy agent's stats_filter is {"rate"}, so it emits a stats.rate value that the new pipeline did not produce.
- net_rx / net_tx: the legacy agent's stats_filter is empty, so the agent publishes no stats.rate at all even though the legacy resolver expects it — making the resolver always return 0. The new pipeline now produces a value where legacy never did.

Two metric shapes flow through the new bucket:

- Gauge-shape (net_rx, net_tx): the metric's `current` value is already a per-second rate (set by agent's current_hook = lambda m: m.stats.rate), so PromQL only needs to sum across replicas and label_replace to "stats.rate".
- Counter-shape (io_read, io_write): the value is a cumulative byte counter, so PromQL applies rate(...[window]) before label_replace.

Verified live against Prometheus on a running kernel:

    {net_rx, stats.rate} = 27530
    {net_tx, stats.rate} = 30378
    {io_read, stats.rate} = 0
    {io_write, stats.rate} = 0

Re-introduces ValueType.RATE (and its to_legacy_live_stat_label / from_live_stat_label round-trip) that was removed earlier in the branch when no producer existed; the round-trip is now load-bearing again.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent e781b81 commit 58c123e

4 files changed

Lines changed: 66 additions & 3 deletions

File tree

src/ai/backend/common/clients/prometheus/fixed_query_builder.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
STATS_MAX_GAUGE_METRIC_PATTERNS,
1313
STATS_MAX_GAUGE_METRICS,
1414
STATS_MAX_OVER_RATE_METRICS,
15+
STATS_RATE_COUNTER_METRICS,
16+
STATS_RATE_GAUGE_METRICS,
1517
ContainerLiveStatQueries,
1618
ContainerMetricOptionalLabel,
1719
MetricType,
@@ -164,6 +166,32 @@ def get_container_live_stat_queries(
164166
rate=self._build_filtered_preset(kernel_ids, _RATE_LIVE_STAT_SPEC),
165167
max=self._build_window_stats_preset(kernel_ids, _MAX_STATS_BUCKET),
166168
avg=self._build_window_stats_preset(kernel_ids, _AVG_STATS_BUCKET),
169+
rate_stats=self._build_rate_stats_preset(kernel_ids),
170+
)
171+
172+
def _build_rate_stats_preset(
173+
self,
174+
kernel_ids: Sequence[KernelId],
175+
) -> MetricPreset:
176+
kernel_id_regex = _regex_union([str(kid) for kid in kernel_ids])
177+
group_by = ",".join(sorted(_LIVE_STAT_GROUP_BY))
178+
stat_label = ValueType.RATE.to_legacy_live_stat_label()
179+
parts: list[str] = []
180+
if STATS_RATE_GAUGE_METRICS:
181+
gauge_regex = _regex_union(sorted(STATS_RATE_GAUGE_METRICS))
182+
selector = self._utilization_selector(kernel_id_regex, gauge_regex)
183+
parts.append(self._labelled_sum(selector, group_by, stat_label))
184+
if STATS_RATE_COUNTER_METRICS:
185+
counter_regex = _regex_union(sorted(STATS_RATE_COUNTER_METRICS))
186+
base = self._utilization_selector(kernel_id_regex, counter_regex)
187+
selector = f"rate({base}[{self._timewindow}])"
188+
parts.append(self._labelled_sum(selector, group_by, stat_label))
189+
return MetricPreset(template=" or ".join(parts))
190+
191+
def _labelled_sum(self, selector: str, group_by: str, stat_label: str) -> str:
192+
return (
193+
f"label_replace(sum by ({group_by})({selector}),"
194+
f'"value_type","{stat_label}","value_type",".*")'
167195
)
168196

169197
def _build_window_stats_preset(

src/ai/backend/common/clients/prometheus/metric_types.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -61,16 +61,17 @@ class MetricType(StrEnum):
6161

6262
@dataclass(frozen=True)
6363
class ContainerLiveStatQueries:
64-
"""Gauge / diff / rate / max / avg query preset bundle for container live stats."""
64+
"""Gauge / diff / rate / max / avg / rate_stats query preset bundle for container live stats."""
6565

6666
gauge: MetricPreset
6767
diff: MetricPreset
6868
rate: MetricPreset
6969
max: MetricPreset
7070
avg: MetricPreset
71+
rate_stats: MetricPreset
7172

7273
def to_list(self) -> list[MetricPreset]:
73-
return [self.gauge, self.diff, self.rate, self.max, self.avg]
74+
return [self.gauge, self.diff, self.rate, self.max, self.avg, self.rate_stats]
7475

7576

7677
DIFF_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
@@ -91,6 +92,16 @@ def to_list(self) -> list[MetricPreset]:
9192
STATS_MAX_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
9293
STATS_AVG_OVER_RATE_METRICS: Final[frozenset[str]] = frozenset({"cpu_util"})
9394

95+
# stats.rate emission targets the legacy stats.rate live_stat label.
96+
# Two metric shapes flow in:
97+
# * "gauge" set: agent's current_hook already publishes per-second rate as
98+
# the metric's `current` value, so we only need to sum across replicas
99+
# and relabel to stats.rate (no PromQL rate() wrap).
100+
# * "counter" set: the published series is a cumulative byte counter, so
101+
# we apply rate(...[window]) to get bytes/sec before relabel.
102+
STATS_RATE_GAUGE_METRICS: Final[frozenset[str]] = frozenset({"net_rx", "net_tx"})
103+
STATS_RATE_COUNTER_METRICS: Final[frozenset[str]] = frozenset({"io_read", "io_write"})
104+
94105

95106
@dataclass
96107
class ContainerMetricResponseInfo:

src/ai/backend/common/clients/prometheus/types.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ class ValueType(StrEnum):
1010
PCT = "pct"
1111
MAX = "max"
1212
AVG = "avg"
13+
RATE = "rate"
1314

1415
@classmethod
1516
def from_live_stat_label(cls, value: str) -> "ValueType":
@@ -21,7 +22,7 @@ def from_live_stat_label(cls, value: str) -> "ValueType":
2122

2223
def to_legacy_live_stat_label(self) -> str:
2324
match self:
24-
case ValueType.MAX | ValueType.AVG:
25+
case ValueType.MAX | ValueType.AVG | ValueType.RATE:
2526
return f"stats.{self.value}"
2627
case _:
2728
return self.value

tests/unit/manager/services/utilization_metric/test_container_metric.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -847,6 +847,29 @@ def test_stats_queries_render_legacy_labels_from_typed_value_types(self) -> None
847847
'"value_type","stats.avg","value_type",".*")'
848848
)
849849

850+
def test_rate_stats_query_renders_legacy_stats_rate_label(self) -> None:
851+
kernel_id = KernelId(UUID("12345678-1234-5678-1234-567812345678"))
852+
fixed_query_builder = FixedQueryBuilder("5m")
853+
854+
queries = fixed_query_builder.get_container_live_stat_queries([kernel_id])
855+
856+
assert queries.rate_stats.render() == (
857+
"label_replace(sum by (container_metric_name,kernel_id,value_type)("
858+
"backendai_container_utilization"
859+
'{kernel_id=~"12345678-1234-5678-1234-567812345678",'
860+
'container_metric_name=~"net_rx|net_tx",'
861+
'value_type="current"}),'
862+
'"value_type","stats.rate","value_type",".*")'
863+
" or "
864+
"label_replace(sum by (container_metric_name,kernel_id,value_type)(rate("
865+
"backendai_container_utilization"
866+
'{kernel_id=~"12345678-1234-5678-1234-567812345678",'
867+
'container_metric_name=~"io_read|io_write",'
868+
'value_type="current"}'
869+
"[5m])),"
870+
'"value_type","stats.rate","value_type",".*")'
871+
)
872+
850873

851874
class TestKernelMetricValuesByKernel:
852875
def test_from_prometheus_response_maps_legacy_stat_label_to_value_type(self) -> None:

0 commit comments

Comments
 (0)