|
| 1 | +"""Component tests: Prometheus scrapes model-service targets after relabel rewrite.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import asyncio |
| 6 | +import secrets |
| 7 | +import textwrap |
| 8 | +import time |
| 9 | +from collections.abc import AsyncIterator, Iterator |
| 10 | +from pathlib import Path |
| 11 | + |
| 12 | +import pytest |
| 13 | +from aiohttp import web |
| 14 | +from testcontainers.core.container import DockerContainer |
| 15 | +from testcontainers.core.waiting_utils import wait_for_logs |
| 16 | + |
| 17 | +from ai.backend.common.clients.http_client.client_pool import ( |
| 18 | + ClientPool, |
| 19 | + tcp_client_session_factory, |
| 20 | +) |
| 21 | +from ai.backend.common.clients.prometheus import LabelMatcher, MetricPreset, PrometheusClient |
| 22 | +from ai.backend.common.dto.clients.prometheus import PrometheusResponse |
| 23 | +from ai.backend.common.service_discovery.service_discovery import MODEL_SERVICE_GROUP |
| 24 | +from ai.backend.common.typed_validators import HostPortPair as HostPortPairModel |
| 25 | +from ai.backend.testutils.pants import get_parallel_slot |
| 26 | + |
| 27 | +# --------------------------------------------------------------------------- |
| 28 | +# Fixtures: mock HTTP servers (SD endpoint + metrics endpoint) |
| 29 | +# --------------------------------------------------------------------------- |
| 30 | + |
| 31 | + |
@pytest.fixture
async def mock_metrics_server() -> AsyncIterator[int]:
    """Serve a minimal ``/metrics`` endpoint that Prometheus can scrape.

    The server listens on ``0.0.0.0`` with an OS-assigned port and exposes a
    single gauge, ``test_gauge 42``, in the Prometheus text format.

    Yields:
        The TCP port the server is listening on.
    """

    async def handle_metrics(_request: web.Request) -> web.Response:
        return web.Response(
            text=("# HELP test_gauge A test gauge\n# TYPE test_gauge gauge\ntest_gauge 42\n"),
            content_type="text/plain",
        )

    app = web.Application()
    app.router.add_get("/metrics", handle_metrics)
    runner = web.AppRunner(app)
    await runner.setup()
    try:
        # Start the site inside the try block so the runner is always cleaned
        # up, even when binding the listening socket fails.
        site = web.TCPSite(runner, "0.0.0.0", 0)
        await site.start()
        # Port 0 asks the OS for a free port; read the real one back through
        # the runner's public ``addresses`` list instead of the site's
        # private ``_server`` attribute.
        port = runner.addresses[0][1]
        yield port
    finally:
        await runner.cleanup()
| 53 | + |
| 54 | + |
@pytest.fixture
async def mock_sd_server(mock_metrics_server: int) -> AsyncIterator[int]:
    """Serve an HTTP service-discovery endpoint advertising one model-service target.

    The advertised target address is ``127.0.0.1:<mock_metrics_server>``.
    The loopback address stands in for a Docker-internal IP that the
    Prometheus container cannot reach; the test's relabel rule is expected to
    rewrite it to ``host.docker.internal`` while keeping the port, which is
    where the mock metrics server is actually reachable from the container.

    Args:
        mock_metrics_server: Port of the mock ``/metrics`` server to advertise.

    Yields:
        The TCP port the service-discovery server is listening on.
    """

    async def handle_sd(_request: web.Request) -> web.Response:
        # 127.0.0.1 simulates a Docker-internal IP that is unreachable
        # from the Prometheus container. The relabel rule will rewrite it
        # to host.docker.internal, which IS reachable.
        return web.json_response([
            {
                "targets": [f"127.0.0.1:{mock_metrics_server}"],
                "labels": {
                    "service_group": MODEL_SERVICE_GROUP,
                    "service_id": "test-model-svc-001",
                    "display_name": "test-model",
                    "version": "1.0",
                },
            },
        ])

    app = web.Application()
    app.router.add_get("/metrics/service_discovery", handle_sd)
    runner = web.AppRunner(app)
    await runner.setup()
    try:
        # Start the site inside the try block so the runner is always cleaned
        # up, even when binding the listening socket fails.
        site = web.TCPSite(runner, "0.0.0.0", 0)
        await site.start()
        # Port 0 asks the OS for a free port; read the real one back through
        # the runner's public ``addresses`` list instead of the site's
        # private ``_server`` attribute.
        port = runner.addresses[0][1]
        yield port
    finally:
        await runner.cleanup()
| 92 | + |
| 93 | + |
| 94 | +# --------------------------------------------------------------------------- |
| 95 | +# Fixtures: Prometheus container with custom relabel config |
| 96 | +# --------------------------------------------------------------------------- |
| 97 | + |
| 98 | + |
@pytest.fixture
def prometheus_config_yaml(mock_sd_server: int) -> str:
    """Render the Prometheus configuration used by the relabel scrape test.

    The config defines a single ``http-sd`` job that discovers targets from
    the mock service-discovery server via ``host.docker.internal`` and
    rewrites any ``127.0.0.1`` target address to ``host.docker.internal``,
    keeping whatever follows the IP (the ``:port`` suffix).

    Args:
        mock_sd_server: Port of the mock service-discovery server.

    Returns:
        The dedented YAML document as a string.
    """
    sd_url = f"http://host.docker.internal:{mock_sd_server}/metrics/service_discovery"
    # The doubled backslashes and braces are escapes for the (non-raw)
    # f-string: the YAML ends up with ``127\.0\.0\.1(.*)`` and ``${1}``.
    indented_yaml = f"""\
        global:
          scrape_interval: 2s

        scrape_configs:
          - job_name: 'http-sd'
            scheme: 'http'
            http_sd_configs:
              - url: '{sd_url}'
                refresh_interval: '2s'
            relabel_configs:
              - source_labels: [__address__]
                regex: '127\\.0\\.0\\.1(.*)'
                target_label: __address__
                replacement: 'host.docker.internal${{1}}'
    """
    return textwrap.dedent(indented_yaml)
| 118 | + |
| 119 | + |
@pytest.fixture
def prometheus_config_with_relabel(
    prometheus_config_yaml: str,
    tmp_path: Path,
) -> Path:
    """Persist the rendered Prometheus config so it can be mounted into a container.

    Args:
        prometheus_config_yaml: YAML text of the Prometheus configuration.
        tmp_path: Per-test temporary directory supplied by pytest.

    Returns:
        Path of the written ``prometheus.yml`` file inside ``tmp_path``.
    """
    target = tmp_path.joinpath("prometheus.yml")
    target.write_text(prometheus_config_yaml)
    return target
| 129 | + |
| 130 | + |
@pytest.fixture
def prometheus_with_relabel(
    prometheus_config_with_relabel: Path,
) -> Iterator[HostPortPairModel]:
    """Run a Prometheus container that uses the relabel-enabled config file.

    The config file is bind-mounted read-only at
    ``/etc/prometheus/prometheus.yml``, the TSDB directory is a tmpfs, and
    ``host.docker.internal`` is mapped to the host gateway so the container
    can reach the mock servers running on the host.

    Args:
        prometheus_config_with_relabel: Path of the config file to mount.

    Yields:
        Host/port pair (``127.0.0.1`` plus the published port) of the
        Prometheus HTTP API.
    """
    random_id = secrets.token_hex(8)
    container = (
        DockerContainer("prom/prometheus:v2.53.0")
        # Slot number plus a random suffix keeps container names unique.
        .with_name(f"test--prom-relabel-slot-{get_parallel_slot()}-{random_id}")
        .with_exposed_ports(9090)
        .with_volume_mapping(
            str(prometheus_config_with_relabel),
            "/etc/prometheus/prometheus.yml",
            mode="ro",
        )
        .with_kwargs(
            tmpfs={"/prometheus": "rw,uid=65534,gid=65534"},
            extra_hosts={"host.docker.internal": "host-gateway"},
        )
        .with_command(
            "--config.file=/etc/prometheus/prometheus.yml "
            "--storage.tsdb.path=/prometheus "
            "--storage.tsdb.retention.time=1h"
        )
    )
    container.start()
    try:
        # Resolve the published port inside the try block: once the container
        # has been started, any failure from here on must still stop it.
        published_port = int(container.get_exposed_port(9090))
        wait_for_logs(container, "Server is ready to receive web requests.", timeout=30)
        # NOTE(review): presumably a short grace period after the readiness
        # log line before the API is queried — confirm it is still needed.
        time.sleep(0.5)
        yield HostPortPairModel(host="127.0.0.1", port=published_port)
    finally:
        container.stop()
| 164 | + |
| 165 | + |
@pytest.fixture
async def prometheus_client_with_relabel(
    prometheus_with_relabel: HostPortPairModel,
) -> AsyncIterator[PrometheusClient]:
    """Provide a ``PrometheusClient`` bound to the relabel-enabled Prometheus container.

    Args:
        prometheus_with_relabel: Host/port pair of the running Prometheus API.

    Yields:
        A client whose endpoint is ``http://<host>:<port>/api/v1/``. The
        backing client pool is closed on teardown.
    """
    session_pool = ClientPool(tcp_client_session_factory)
    target = prometheus_with_relabel
    api_endpoint = f"http://{target.host}:{target.port}/api/v1/"
    prom_client = PrometheusClient(endpoint=api_endpoint, client_pool=session_pool)
    try:
        yield prom_client
    finally:
        await session_pool.close()
| 179 | + |
| 180 | + |
| 181 | +# --------------------------------------------------------------------------- |
| 182 | +# Tests |
| 183 | +# --------------------------------------------------------------------------- |
| 184 | + |
| 185 | + |
@pytest.fixture
def up_model_service_preset() -> MetricPreset:
    """Build the metric preset for the ``up`` series of the model-service group.

    The preset filters on an exact match of the ``service_group`` label
    against ``MODEL_SERVICE_GROUP`` and uses an empty ``group_by`` set.
    """
    # NOTE(review): the template presumably renders to ``up{<labels>}`` once
    # the ``labels`` placeholder is substituted — confirm against MetricPreset.
    label_filters = {"service_group": LabelMatcher.exact(MODEL_SERVICE_GROUP)}
    return MetricPreset(
        template="up{{{labels}}}",
        labels=label_filters,
        group_by=frozenset(),
    )
| 193 | + |
| 194 | + |
class TestLoopbackRelabelScrape:
    """Check that Prometheus scrapes a model-service target once its address is relabeled."""

    async def test_prometheus_scrapes_model_service_after_relabel(
        self,
        prometheus_client_with_relabel: PrometheusClient,
        up_model_service_preset: MetricPreset,
    ) -> None:
        """Poll the ``up`` query until the model-service target is reported as up.

        Up to 15 attempts are made, each preceded by a 2-second pause, to
        give Prometheus time to discover the target and complete a scrape.
        The test passes when the latest sample of the first returned series
        is ``"1"``.
        """
        poll_interval_seconds = 2
        attempts_left = 15
        response: PrometheusResponse | None = None

        while attempts_left > 0:
            attempts_left -= 1
            # Wait first: discovery and the first scrape both take time.
            await asyncio.sleep(poll_interval_seconds)
            response = await prometheus_client_with_relabel.query_instant(up_model_service_preset)
            if not response.data.result:
                continue
            if response.data.result[0].values[-1][1] == "1":
                break

        assert response is not None
        assert len(response.data.result) > 0, (
            "Prometheus failed to scrape model-service target after relabel rewrite"
        )
        first_series = response.data.result[0]
        assert first_series.values[-1][1] == "1"
0 commit comments