Skip to content

Commit bf2d491

Browse files
seedspirit and claude authored
fix(BA-5768): rewrite loopback targets in Prometheus relabel for model-service scraping (#11170)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6c60edd commit bf2d491

4 files changed

Lines changed: 232 additions & 0 deletions

File tree

changes/11170.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix Prometheus not scraping model-service metrics by rewriting loopback addresses via relabel_configs

configs/prometheus/prometheus.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ scrape_configs:
1818
http_sd_configs:
1919
- url: 'http://host.docker.internal:18080/metrics/service_discovery'
2020
refresh_interval: "60s"
21+
relabel_configs:
22+
# Rewrite loopback targets to host-accessible address for Docker-based Prometheus
23+
- source_labels: [__address__]
24+
regex: '127\.0\.0\.1(.*)'
25+
target_label: __address__
26+
replacement: 'host.docker.internal${1}'
2127
- job_name: 'etcd'
2228
scrape_interval: 10s
2329
metrics_path: /metrics

src/ai/backend/install/pyinfra/deploy/monitor/dashboard/prometheus/templates/prometheus.yml.j2

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ scrape_configs:
2828
http_sd_configs:
2929
- url: 'http://{{ http_sd_host }}:{{ http_sd_port }}/metrics/service_discovery'
3030
refresh_interval: "60s"
31+
relabel_configs:
32+
# Rewrite loopback targets to host-accessible address for Docker-based Prometheus
33+
- source_labels: [__address__]
34+
regex: '127\.0\.0\.1(.*)'
35+
target_label: __address__
36+
replacement: '{{ http_sd_host }}${1}'
3137

3238
# ETCD Cluster
3339
- job_name: 'etcd'
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
"""Component tests: Prometheus scrapes model-service targets after relabel rewrite."""
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
import secrets
7+
import textwrap
8+
import time
9+
from collections.abc import AsyncIterator, Iterator
10+
from pathlib import Path
11+
12+
import pytest
13+
from aiohttp import web
14+
from testcontainers.core.container import DockerContainer
15+
from testcontainers.core.waiting_utils import wait_for_logs
16+
17+
from ai.backend.common.clients.http_client.client_pool import (
18+
ClientPool,
19+
tcp_client_session_factory,
20+
)
21+
from ai.backend.common.clients.prometheus import LabelMatcher, MetricPreset, PrometheusClient
22+
from ai.backend.common.dto.clients.prometheus import PrometheusResponse
23+
from ai.backend.common.service_discovery.service_discovery import MODEL_SERVICE_GROUP
24+
from ai.backend.common.typed_validators import HostPortPair as HostPortPairModel
25+
from ai.backend.testutils.pants import get_parallel_slot
26+
27+
# ---------------------------------------------------------------------------
28+
# Fixtures: mock HTTP servers (SD endpoint + metrics endpoint)
29+
# ---------------------------------------------------------------------------
30+
31+
32+
@pytest.fixture
async def mock_metrics_server() -> AsyncIterator[int]:
    """Serve a minimal ``/metrics`` endpoint that Prometheus can scrape.

    The server listens on all interfaces ("0.0.0.0") on an OS-assigned port
    and always answers with a single ``test_gauge`` sample of value 42.

    Yields:
        The TCP port the server is listening on.
    """

    async def handle_metrics(_request: web.Request) -> web.Response:
        return web.Response(
            text=("# HELP test_gauge A test gauge\n# TYPE test_gauge gauge\ntest_gauge 42\n"),
            content_type="text/plain",
        )

    app = web.Application()
    app.router.add_get("/metrics", handle_metrics)
    runner = web.AppRunner(app)
    await runner.setup()
    try:
        # Start the site inside the try block so the runner is still cleaned
        # up when binding the socket fails.
        site = web.TCPSite(runner, "0.0.0.0", 0)
        await site.start()
        # Port 0 asks the OS for a free port; read the assigned one back via
        # the runner's public ``addresses`` list instead of private attributes.
        port = runner.addresses[0][1]
        yield port
    finally:
        await runner.cleanup()
53+
54+
55+
@pytest.fixture
async def mock_sd_server(mock_metrics_server: int) -> AsyncIterator[int]:
    """Serve an HTTP SD endpoint that advertises a loopback model-service target.

    The advertised target is ``127.0.0.1:<metrics port>``. The relabel rule
    under test is expected to rewrite that address to ``host.docker.internal``,
    through which the mock metrics server (listening on the same port) is
    reachable from the Prometheus container.

    Args:
        mock_metrics_server: Port of the mock ``/metrics`` server.

    Yields:
        The TCP port the service-discovery server is listening on.
    """

    async def handle_sd(_request: web.Request) -> web.Response:
        # A loopback address in the SD response would point Prometheus at its
        # own container. The relabel rule rewrites it to host.docker.internal,
        # which IS reachable.
        return web.json_response([
            {
                "targets": [f"127.0.0.1:{mock_metrics_server}"],
                "labels": {
                    "service_group": MODEL_SERVICE_GROUP,
                    "service_id": "test-model-svc-001",
                    "display_name": "test-model",
                    "version": "1.0",
                },
            },
        ])

    app = web.Application()
    app.router.add_get("/metrics/service_discovery", handle_sd)
    runner = web.AppRunner(app)
    await runner.setup()
    try:
        # Start the site inside the try block so the runner is still cleaned
        # up when binding the socket fails.
        site = web.TCPSite(runner, "0.0.0.0", 0)
        await site.start()
        # Port 0 asks the OS for a free port; read the assigned one back via
        # the runner's public ``addresses`` list instead of private attributes.
        port = runner.addresses[0][1]
        yield port
    finally:
        await runner.cleanup()
92+
93+
94+
# ---------------------------------------------------------------------------
95+
# Fixtures: Prometheus container with custom relabel config
96+
# ---------------------------------------------------------------------------
97+
98+
99+
@pytest.fixture
def prometheus_config_yaml(mock_sd_server: int) -> str:
    """Build a Prometheus config with relabel_configs for loopback address rewrite.

    The single ``http-sd`` job discovers targets from the mock SD server via
    ``host.docker.internal`` and rewrites any ``127.0.0.1...`` target address
    to ``host.docker.internal...``, keeping whatever follows the IP.

    Args:
        mock_sd_server: Port of the mock service-discovery server.

    Returns:
        The rendered YAML document.
    """
    # This is an f-string: ``\\.`` renders as ``\.`` in the YAML regex and
    # ``${{1}}`` renders as ``${1}`` (the first capture group reference).
    return textwrap.dedent(f"""\
        global:
          scrape_interval: 2s

        scrape_configs:
          - job_name: 'http-sd'
            scheme: 'http'
            http_sd_configs:
              - url: 'http://host.docker.internal:{mock_sd_server}/metrics/service_discovery'
                refresh_interval: '2s'
            relabel_configs:
              - source_labels: [__address__]
                regex: '127\\.0\\.0\\.1(.*)'
                target_label: __address__
                replacement: 'host.docker.internal${{1}}'
    """)
118+
119+
120+
@pytest.fixture
def prometheus_config_with_relabel(
    prometheus_config_yaml: str,
    tmp_path: Path,
) -> Path:
    """Persist the rendered Prometheus config under ``tmp_path``.

    Returns:
        Path to the written ``prometheus.yml``, intended for mounting into
        the Prometheus container.
    """
    target_file = tmp_path.joinpath("prometheus.yml")
    target_file.write_text(prometheus_config_yaml)
    return target_file
129+
130+
131+
@pytest.fixture
def prometheus_with_relabel(
    prometheus_config_with_relabel: Path,
) -> Iterator[HostPortPairModel]:
    """Spawn a Prometheus container with the relabel config mounted.

    Args:
        prometheus_config_with_relabel: Host path of the config file, mounted
            read-only at ``/etc/prometheus/prometheus.yml``.

    Yields:
        The host/port pair (``127.0.0.1`` plus the published port) on which
        the container's port 9090 is reachable from the test process.
    """
    random_id = secrets.token_hex(8)
    container = (
        DockerContainer("prom/prometheus:v2.53.0")
        # Slot number plus random suffix keeps names unique across parallel runs.
        .with_name(f"test--prom-relabel-slot-{get_parallel_slot()}-{random_id}")
        .with_exposed_ports(9090)
        .with_volume_mapping(
            str(prometheus_config_with_relabel),
            "/etc/prometheus/prometheus.yml",
            mode="ro",
        )
        .with_kwargs(
            # NOTE(review): uid/gid 65534 presumably matches the user the
            # Prometheus image runs as; confirm against the image.
            tmpfs={"/prometheus": "rw,uid=65534,gid=65534"},
            # Make host.docker.internal resolve to the Docker host from
            # inside the container.
            extra_hosts={"host.docker.internal": "host-gateway"},
        )
        .with_command(
            "--config.file=/etc/prometheus/prometheus.yml "
            "--storage.tsdb.path=/prometheus "
            "--storage.tsdb.retention.time=1h"
        )
    )
    container.start()
    try:
        # Everything after start() lives inside the try block so the container
        # is stopped even when port lookup or the readiness wait fails.
        published_port = int(container.get_exposed_port(9090))
        wait_for_logs(container, "Server is ready to receive web requests.", timeout=30)
        # Short grace period after the readiness log line before handing out
        # the endpoint.
        time.sleep(0.5)
        yield HostPortPairModel(host="127.0.0.1", port=published_port)
    finally:
        container.stop()
164+
165+
166+
@pytest.fixture
async def prometheus_client_with_relabel(
    prometheus_with_relabel: HostPortPairModel,
) -> AsyncIterator[PrometheusClient]:
    """Yield a ``PrometheusClient`` aimed at the relabel-enabled container.

    The client talks to the ``/api/v1/`` endpoint of the container address
    supplied by ``prometheus_with_relabel``. The backing client pool is closed
    on teardown.
    """
    session_pool = ClientPool(tcp_client_session_factory)
    host = prometheus_with_relabel.host
    port = prometheus_with_relabel.port
    api_client = PrometheusClient(
        endpoint=f"http://{host}:{port}/api/v1/",
        client_pool=session_pool,
    )
    try:
        yield api_client
    finally:
        await session_pool.close()
179+
180+
181+
# ---------------------------------------------------------------------------
182+
# Tests
183+
# ---------------------------------------------------------------------------
184+
185+
186+
@pytest.fixture
def up_model_service_preset() -> MetricPreset:
    """Preset for the ``up`` metric restricted to the model-service group."""
    model_service_only = {"service_group": LabelMatcher.exact(MODEL_SERVICE_GROUP)}
    no_grouping: frozenset[str] = frozenset()
    return MetricPreset(
        template="up{{{labels}}}",
        labels=model_service_only,
        group_by=no_grouping,
    )
193+
194+
195+
class TestLoopbackRelabelScrape:
    """Check that model-service targets are scraped once the relabel rule applies."""

    async def test_prometheus_scrapes_model_service_after_relabel(
        self,
        prometheus_client_with_relabel: PrometheusClient,
        up_model_service_preset: MetricPreset,
    ) -> None:
        """The ``up`` series for the model-service target eventually reports "1"."""
        # Discovery and the first scrape take a few cycles, so poll instead of
        # querying once: up to 15 attempts, 2 seconds apart.
        poll_interval = 2
        attempts_left = 15
        result: PrometheusResponse | None = None

        while attempts_left > 0:
            attempts_left -= 1
            await asyncio.sleep(poll_interval)
            result = await prometheus_client_with_relabel.query_instant(up_model_service_preset)
            if not result.data.result:
                continue
            if result.data.result[0].values[-1][1] == "1":
                break

        assert result is not None
        assert len(result.data.result) > 0, (
            "Prometheus failed to scrape model-service target after relabel rewrite"
        )
        first_series = result.data.result[0]
        assert first_series.values[-1][1] == "1"

0 commit comments

Comments
 (0)