|
13 | 13 |
|
14 | 14 | import asyncio |
15 | 15 | import logging |
| 16 | +import os |
16 | 17 | import re |
17 | 18 | from dataclasses import dataclass |
18 | 19 | from importlib.resources import files |
@@ -252,23 +253,51 @@ def discover_migrations() -> list[Migration]: |
252 | 253 | return _discover_migrations() |
253 | 254 |
|
254 | 255 |
|
255 | | -CONNECT_RETRY_DEADLINE_S = 60.0 |
256 | | -"""Total wall-clock budget for the initial Postgres connection. Bundled PG |
257 | | -in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener |
258 | | -bind), so the agentevals lifespan can race the database on a fresh deploy. |
259 | | -Retrying tolerates that gap rather than failing pod startup and relying on |
260 | | -CrashLoopBackOff timing to eventually line up.""" |
CONNECT_RETRY_DEADLINE_S = 600.0
"""Default total wall-clock budget for the initial Postgres connection.
Sized to span Kubernetes bring-up of a freshly provisioned database (PVC
bind, initdb, listener bind, network policy propagation). Override at
runtime by setting ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` to a positive,
finite number of seconds; an invalid value logs a warning and falls back
to this default."""


def connect_deadline_seconds() -> float:
    """Return the connect-retry budget in seconds.

    Reads ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` from the environment on every
    call and falls back to :data:`CONNECT_RETRY_DEADLINE_S` when the
    variable is unset, empty, non-numeric, non-positive, or non-finite
    (``nan`` / ``inf``). Every rejected value except unset/empty is
    reported with a warning so a misconfiguration is visible in the logs.

    Returns:
        A strictly positive, finite number of seconds.
    """
    # Function-scope import keeps this block self-contained; ``math`` is
    # only needed for the finiteness check below.
    import math

    raw = os.getenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S")
    if raw is None or raw == "":
        return CONNECT_RETRY_DEADLINE_S
    try:
        val = float(raw)
    except ValueError:
        logger.warning(
            "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (not a number); using default %.0fs",
            raw,
            CONNECT_RETRY_DEADLINE_S,
        )
        return CONNECT_RETRY_DEADLINE_S
    if val <= 0:
        logger.warning(
            "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be positive); using default %.0fs",
            raw,
            CONNECT_RETRY_DEADLINE_S,
        )
        return CONNECT_RETRY_DEADLINE_S
    # float() happily parses "nan" and "inf". NaN slips past the ``<= 0``
    # check (every comparison with NaN is False) and would poison the
    # deadline arithmetic; +inf would turn the bounded retry into an
    # unbounded one. Neither is a usable number of seconds.
    if not math.isfinite(val):
        logger.warning(
            "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be finite); using default %.0fs",
            raw,
            CONNECT_RETRY_DEADLINE_S,
        )
        return CONNECT_RETRY_DEADLINE_S
    return val
261 | 289 |
|
262 | 290 |
|
263 | 291 | async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection": |
264 | 292 | """Open a single asyncpg connection, retrying on connection-refused or |
265 | | - server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds. |
| 293 | + server-not-ready errors for up to :func:`connect_deadline_seconds` |
| 294 | + seconds. |
266 | 295 |
|
267 | 296 | Connection-time errors are tolerated; once a connection has been |
268 | 297 | established and a query returned, all subsequent failures propagate |
269 | 298 | normally. |
270 | 299 | """ |
271 | | - deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S |
| 300 | + deadline = asyncio.get_event_loop().time() + connect_deadline_seconds() |
272 | 301 | delay = 0.5 |
273 | 302 | while True: |
274 | 303 | try: |
|
0 commit comments