Skip to content

Commit 3881f4a

Browse files
make Postgres connect timeout configurable
1 parent 4406260 commit 3881f4a

4 files changed

Lines changed: 46 additions & 10 deletions

File tree

charts/agentevals/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ spec:
7777
value: {{ .Values.database.postgres.schema | quote }}
7878
- name: AGENTEVALS_AUTO_MIGRATE
7979
value: {{ .Values.database.postgres.autoMigrate | quote }}
80+
- name: AGENTEVALS_DB_CONNECT_TIMEOUT_S
81+
value: {{ .Values.database.postgres.connectTimeoutSeconds | quote }}
8082
{{- if .Values.database.postgres.urlFile }}
8183
- name: AGENTEVALS_DATABASE_URL_FILE
8284
value: {{ .Values.database.postgres.urlFile | quote }}

charts/agentevals/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,11 @@ database:
211211
# false the server refuses to start if the schema is behind or dirty;
212212
# run "agentevals migrate up" manually in that case.
213213
autoMigrate: true
214+
# -- Seconds the startup will spend retrying the initial Postgres
215+
# connection before the pod aborts. Default 600s spans bring-up of a
216+
# freshly provisioned database. Raise startupProbe.failureThreshold
217+
# to match if you increase this.
218+
connectTimeoutSeconds: 600
214219
# -- Bundled Postgres instance for development and evaluation only.
215220
# Not suitable for production. Deployed when enabled is true and url /
216221
# urlFile are not set.

src/agentevals/storage/postgres/migrator.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import asyncio
1515
import logging
16+
import os
1617
import re
1718
from dataclasses import dataclass
1819
from importlib.resources import files
@@ -252,23 +253,51 @@ def discover_migrations() -> list[Migration]:
252253
return _discover_migrations()
253254

254255

255-
CONNECT_RETRY_DEADLINE_S = 60.0
256-
"""Total wall-clock budget for the initial Postgres connection. Bundled PG
257-
in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener
258-
bind), so the agentevals lifespan can race the database on a fresh deploy.
259-
Retrying tolerates that gap rather than failing pod startup and relying on
260-
CrashLoopBackOff timing to eventually line up."""
256+
CONNECT_RETRY_DEADLINE_S = 600.0
257+
"""Default total wall-clock budget for the initial Postgres connection.
258+
Sized to span Kubernetes bring-up of a freshly provisioned database (PVC
259+
bind, initdb, listener bind, network policy propagation). Override at
260+
runtime by setting ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` to a positive
261+
number of seconds; an invalid value logs a warning and falls back to this
262+
default."""
263+
264+
265+
def connect_deadline_seconds() -> float:
    """Resolve the connect-retry budget in seconds.

    Reads ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` and falls back to
    :data:`CONNECT_RETRY_DEADLINE_S` if the env var is unset, empty,
    non-numeric, non-finite (``nan`` / ``inf``), or non-positive. Every
    rejected value other than unset/empty logs a warning.

    Returns:
        A finite, strictly positive number of seconds (assuming the
        module default itself is finite and positive).
    """
    import math  # function-scope import so the module's import block is untouched

    raw = os.getenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S")
    if not raw:
        # Unset or empty string: silently use the default.
        return CONNECT_RETRY_DEADLINE_S
    try:
        val = float(raw)
    except ValueError:
        logger.warning(
            "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (not a number); using default %.0fs",
            raw,
            CONNECT_RETRY_DEADLINE_S,
        )
        return CONNECT_RETRY_DEADLINE_S
    # float() parses "nan" and "inf". NaN slips past the `val <= 0` check
    # below (every comparison with NaN is False), and a NaN deadline compares
    # False against every clock reading, so reject non-finite values here.
    if not math.isfinite(val):
        logger.warning(
            "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be finite); using default %.0fs",
            raw,
            CONNECT_RETRY_DEADLINE_S,
        )
        return CONNECT_RETRY_DEADLINE_S
    if val <= 0:
        logger.warning(
            "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be positive); using default %.0fs",
            raw,
            CONNECT_RETRY_DEADLINE_S,
        )
        return CONNECT_RETRY_DEADLINE_S
    return val
261289

262290

263291
async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection":
264292
"""Open a single asyncpg connection, retrying on connection-refused or
265-
server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds.
293+
server-not-ready errors for up to :func:`connect_deadline_seconds`
294+
seconds.
266295
267296
Connection-time errors are tolerated; once a connection has been
268297
established and a query returned, all subsequent failures propagate
269298
normally.
270299
"""
271-
deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S
300+
deadline = asyncio.get_event_loop().time() + connect_deadline_seconds()
272301
delay = 0.5
273302
while True:
274303
try:

src/agentevals/storage/postgres/pool.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ async def create_pool(settings: StorageSettings) -> "asyncpg.Pool":
5050
settings.schema_name,
5151
)
5252

53-
from .migrator import CONNECT_RETRY_DEADLINE_S
53+
from .migrator import connect_deadline_seconds
5454

55-
deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S
55+
deadline = asyncio.get_event_loop().time() + connect_deadline_seconds()
5656
delay = 0.5
5757
while True:
5858
try:

0 commit comments

Comments
 (0)