Skip to content

Commit ef6c3a6

Browse files
chore(db): 降低数据库诊断与背压 skip 日志噪音
健康态 pg pool diag 改为 DEBUG 并默认 5 分钟汇总;接话热路径 skip 日志同步降级。 Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent f435c01 commit ef6c3a6

4 files changed

Lines changed: 79 additions & 14 deletions

File tree

src/features/corpus/find_cache.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ async def _cached_find(
6161
now = time.monotonic()
6262
if _reply_db_fail_active(key, now=now):
6363
if for_reply:
64-
logger.info(
64+
logger.debug(
6565
"corpus_find_reply.skip reply_db_fail_cooldown kw_len={}",
6666
len(key),
6767
)
@@ -95,7 +95,7 @@ async def _cached_find(
9595
if is_pg_pool_timeout_error(exc):
9696
mark_reply_db_fail(key)
9797
if for_reply:
98-
logger.info(
98+
logger.debug(
9999
"corpus_find_reply.skip db_timeout kw_len={}",
100100
len(key),
101101
)

src/foundation/db/pool_diagnostics.py

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""PG 连接池诊断:周期汇总 + 慢持连统计(默认 INFO,便于长期观测背压)。"""
1+
"""PG 连接池诊断:周期汇总 + 慢持连统计(健康态 DEBUG,异常态 INFO)。"""
22

33
from __future__ import annotations
44

@@ -12,7 +12,7 @@
1212

1313
from src.foundation.config.repo_settings import repo_env_raw_value
1414

15-
_TICK_SEC = 60.0
15+
_TICK_SEC_DEFAULT = 300.0
1616
_diag_task: asyncio.Task[None] | None = None
1717
_bound = False
1818

@@ -34,6 +34,37 @@ def session_hold_warn_ms() -> float:
3434
return 500.0
3535

3636

37+
def pool_diag_tick_sec() -> float:
38+
raw = repo_env_raw_value("PG_POOL_DIAG_TICK_SEC")
39+
if raw is not None:
40+
try:
41+
return max(30.0, float(str(raw).strip()))
42+
except ValueError:
43+
pass
44+
return _TICK_SEC_DEFAULT
45+
46+
47+
def pool_diag_tick_notable(
48+
*,
49+
under_pressure: bool,
50+
idle_in_tx: int | None,
51+
slow_sessions: int,
52+
remote_skipped_pressure: int,
53+
remote_skipped_busy: int,
54+
mirror_skip: int,
55+
learn_pool_wait: int,
56+
) -> bool:
57+
if under_pressure or slow_sessions > 0:
58+
return True
59+
if idle_in_tx and idle_in_tx > 0:
60+
return True
61+
if remote_skipped_pressure > 0 or remote_skipped_busy > 0:
62+
return True
63+
if mirror_skip > 0 or learn_pool_wait > 0:
64+
return True
65+
return False
66+
67+
3768
def _is_ignored_caller_path(path: str) -> bool:
3869
return path.endswith(_CALLER_SCAN_SKIP_SUFFIXES) or "/site-packages/" in path
3970

@@ -161,7 +192,20 @@ async def emit_pool_diagnostics_tick() -> None:
161192
if not slow_top:
162193
slow_top = "-"
163194

164-
logger.info(
195+
skipped_pressure = int(remote.get("skipped_pressure", 0))
196+
skipped_busy = int(remote.get("skipped_busy", 0))
197+
learn_pool_wait = int(learn.get("learn_pool_wait_spins", 0) or 0)
198+
notable = pool_diag_tick_notable(
199+
under_pressure=bool(budget.get("under_pressure")),
200+
idle_in_tx=idle_tx,
201+
slow_sessions=_slow_session_total,
202+
remote_skipped_pressure=skipped_pressure,
203+
remote_skipped_busy=skipped_busy,
204+
mirror_skip=_mirror_skipped_pressure,
205+
learn_pool_wait=learn_pool_wait,
206+
)
207+
diag_log = logger.info if notable else logger.debug
208+
diag_log(
165209
"pg pool diag: checked_out={}/{} util={} idle_in_tx={} pg_wait=[{}] "
166210
"remote_skip_pressure={} remote_skip_busy={} mirror_skip={} "
167211
"slow_sessions={} slow_max_ms={:.0f} learn_q={} learn_pool_wait={} slow_top=[{}]",
@@ -170,13 +214,13 @@ async def emit_pool_diagnostics_tick() -> None:
170214
util_pct,
171215
idle_tx if idle_tx is not None else "?",
172216
wait_s,
173-
remote.get("skipped_pressure", 0),
174-
remote.get("skipped_busy", 0),
217+
skipped_pressure,
218+
skipped_busy,
175219
_mirror_skipped_pressure,
176220
_slow_session_total,
177221
_slow_hold_max_ms,
178222
learn.get("learn_queue_size", "?"),
179-
learn.get("learn_pool_wait_spins", 0),
223+
learn_pool_wait,
180224
slow_top,
181225
)
182226

@@ -195,13 +239,13 @@ async def emit_pool_diagnostics_tick() -> None:
195239

196240

197241
async def pool_diagnostics_loop() -> None:
198-
await asyncio.sleep(_TICK_SEC)
242+
await asyncio.sleep(pool_diag_tick_sec())
199243
while True:
200244
try:
201245
await emit_pool_diagnostics_tick()
202246
except Exception as e:
203247
logger.warning("pg pool diagnostics tick failed: {}", e)
204-
await asyncio.sleep(_TICK_SEC)
248+
await asyncio.sleep(pool_diag_tick_sec())
205249

206250

207251
def bind_pg_pool_diagnostics() -> None:
@@ -228,7 +272,7 @@ def start_pg_pool_diagnostics_task() -> None:
228272
_diag_task = asyncio.create_task(pool_diagnostics_loop(), name="pg_pool_diagnostics")
229273
logger.debug(
230274
"pg pool diagnostics started (tick={}s, session_hold_warn={}ms)",
231-
int(_TICK_SEC),
275+
int(pool_diag_tick_sec()),
232276
int(session_hold_warn_ms()),
233277
)
234278

src/foundation/db/repository_pg.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -586,13 +586,13 @@ async def cached_reply_query_snapshot(
586586
if not key:
587587
return None
588588
if pg_pool_under_pressure(threshold=0.55):
589-
logger.info(
589+
logger.debug(
590590
"reply_query_snapshot.skip pg_pool_pressure kw_len={}",
591591
len(key),
592592
)
593593
return None
594594
if reply_db_fail_active(key):
595-
logger.info(
595+
logger.debug(
596596
"reply_query_snapshot.skip reply_db_fail_cooldown kw_len={}",
597597
len(key),
598598
)
@@ -620,7 +620,7 @@ async def cached_reply_query_snapshot(
620620
_reply_query_snapshot_inflight.pop(key, None)
621621
if is_pg_pool_timeout_error(exc):
622622
mark_reply_db_fail(key)
623-
logger.info(
623+
logger.debug(
624624
"reply_query_snapshot.skip db_timeout kw_len={}",
625625
len(key),
626626
)

tests/common/test_pool_diagnostics.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,24 @@ class _WantedFrame:
3333
monkeypatch.setattr(pool_diagnostics.inspect, "stack", lambda: [None, None, _FrameInfo(), _WantedFrame()])
3434

3535
assert pool_diagnostics.pg_session_caller_hint_entry() == "fallback_wrapper@common/test_pool_diagnostics.py:27"
36+
37+
38+
def test_pool_diag_tick_notable_only_when_anomaly() -> None:
39+
base = dict(
40+
under_pressure=False,
41+
idle_in_tx=0,
42+
slow_sessions=0,
43+
remote_skipped_pressure=0,
44+
remote_skipped_busy=0,
45+
mirror_skip=0,
46+
learn_pool_wait=0,
47+
)
48+
assert pool_diagnostics.pool_diag_tick_notable(**base) is False
49+
assert pool_diagnostics.pool_diag_tick_notable(**{**base, "under_pressure": True}) is True
50+
assert pool_diagnostics.pool_diag_tick_notable(**{**base, "remote_skipped_busy": 3}) is True
51+
52+
53+
def test_pool_diag_tick_sec_default(monkeypatch) -> None:
54+
monkeypatch.delenv("PG_POOL_DIAG_TICK_SEC", raising=False)
55+
monkeypatch.setattr(pool_diagnostics, "repo_env_raw_value", lambda _k: None)
56+
assert pool_diagnostics.pool_diag_tick_sec() == 300.0

0 commit comments

Comments
 (0)