Skip to content

Commit b5dedba

Browse files
committed
fix RSS spike.
1 parent d117df7 commit b5dedba

6 files changed

Lines changed: 411 additions & 8 deletions

File tree

agent/ec_skills/browser_use_extension/extension_tools_service.py

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,25 @@
8080
)
8181
except Exception:
8282
_CDP_EVALUATE_RECOVERY_THRESHOLD = 2
83+
# Recovery threshold applied to evaluate timeouts whose trace label starts
# with "feige_". Read from ECAN_FEIGE_CDP_EVALUATE_RECOVERY_THRESHOLD
# (default 1); negative values clamp to 0 and unparsable values fall back to 1.
try:
    _FEIGE_CDP_EVALUATE_RECOVERY_THRESHOLD = max(
        0, int(os.getenv("ECAN_FEIGE_CDP_EVALUATE_RECOVERY_THRESHOLD", "1"))
    )
except Exception:
    _FEIGE_CDP_EVALUATE_RECOVERY_THRESHOLD = 1
# Default length, in seconds, of the Feige CDP health cooldown. Read from
# ECAN_FEIGE_CDP_HEALTH_COOLDOWN_S (default 25.0); negative values clamp to 0.0.
try:
    _FEIGE_CDP_HEALTH_COOLDOWN_S = float(
        os.getenv("ECAN_FEIGE_CDP_HEALTH_COOLDOWN_S", "25.0")
    )
    # float() happily parses "nan" and "inf". NaN would be clamped to 0.0 by
    # max() (silently disabling the cooldown) and +inf would produce a deadline
    # that never expires. NaN fails every comparison, so this one test rejects
    # both and routes them to the default below.
    if not (_FEIGE_CDP_HEALTH_COOLDOWN_S < float("inf")):
        raise ValueError("non-finite ECAN_FEIGE_CDP_HEALTH_COOLDOWN_S")
    _FEIGE_CDP_HEALTH_COOLDOWN_S = max(0.0, _FEIGE_CDP_HEALTH_COOLDOWN_S)
except Exception:
    _FEIGE_CDP_HEALTH_COOLDOWN_S = 25.0
# Guards _CDP_EVALUATE_TIMEOUT_RECOVERY.
_CDP_EVALUATE_TIMEOUT_RECOVERY_LOCK = threading.Lock()
# Per-session count of evaluate-timeout recovery signals, keyed by
# id(browser_session).
_CDP_EVALUATE_TIMEOUT_RECOVERY: Dict[int, int] = {}
# Guards _FEIGE_SEND_CDP_TIMEOUT_UNTIL.
_FEIGE_SEND_CDP_TIMEOUT_LOCK = threading.Lock()
# Presumably a monotonic deadline for the send-timeout cooldown; 0.0 means
# no cooldown -- TODO confirm against the code that sets it.
_FEIGE_SEND_CDP_TIMEOUT_UNTIL = 0.0
# Guards the two health-cooldown globals below.
_FEIGE_CDP_HEALTH_LOCK = threading.Lock()
# time.monotonic() deadline until which Feige CDP is treated as unhealthy;
# 0.0 means healthy.
_FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL = 0.0
# Most recent non-empty reason recorded when the cooldown was started.
_FEIGE_CDP_HEALTH_REASON = ""
87102
from agent.ec_skills.label_utils.print_label import (
88103
print_labels_async,
89104
reformat_labels_async,
@@ -1006,13 +1021,54 @@ def _record_feige_send_cdp_success() -> None:
10061021
_FEIGE_SEND_CDP_TIMEOUT_UNTIL = 0.0
10071022

10081023

1024+
def feige_cdp_health_cooldown_remaining() -> float:
    """Return the seconds left in the Feige CDP health cooldown.

    Returns:
        The positive number of seconds until the recorded unhealthy deadline,
        or 0.0 when that deadline has passed or was never set.
    """
    current = _time.monotonic()
    with _FEIGE_CDP_HEALTH_LOCK:
        deadline = _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL
    left = deadline - current
    if left > 0.0:
        return left
    return 0.0
1029+
1030+
1031+
def mark_feige_cdp_unhealthy(reason: str = "", *, cooldown_s: float | None = None) -> float:
    """Start or extend the Feige CDP health cooldown window.

    Args:
        reason: Optional label stored as the current unhealthy reason. An
            empty value leaves the previously stored reason untouched.
        cooldown_s: Window length in seconds. ``None`` selects
            ``_FEIGE_CDP_HEALTH_COOLDOWN_S``; negative values clamp to 0.0.

    Returns:
        Seconds remaining in the cooldown after this call, or 0.0 when the
        effective window length is zero (no state is changed in that case).
    """
    global _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL, _FEIGE_CDP_HEALTH_REASON

    if cooldown_s is None:
        window = _FEIGE_CDP_HEALTH_COOLDOWN_S
    else:
        window = max(0.0, float(cooldown_s))
    if window <= 0.0:
        return 0.0

    started = _time.monotonic()
    proposed_deadline = started + window
    with _FEIGE_CDP_HEALTH_LOCK:
        # An existing, later deadline wins: a new signal never shortens the
        # window that is already in force.
        if proposed_deadline > _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL:
            _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL = proposed_deadline
        if reason:
            _FEIGE_CDP_HEALTH_REASON = str(reason)
        remaining = _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL - started
        active_reason = _FEIGE_CDP_HEALTH_REASON
    logger.warning(
        f"[Feige] CDP health cooldown active for {remaining:.1f}s "
        f"reason={active_reason!r}"
    )
    if remaining > 0.0:
        return remaining
    return 0.0
1049+
1050+
1051+
def mark_feige_cdp_healthy() -> None:
    """Clear the Feige CDP health cooldown and its recorded reason."""
    global _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL, _FEIGE_CDP_HEALTH_REASON
    with _FEIGE_CDP_HEALTH_LOCK:
        # Reset both fields under the lock so readers never see a cleared
        # deadline paired with a stale reason.
        _FEIGE_CDP_HEALTH_REASON = ""
        _FEIGE_CDP_HEALTH_UNHEALTHY_UNTIL = 0.0
1057+
1058+
10091059
def _record_cdp_evaluate_recovery_signal(browser_session: Any, trace_label: str, phase: str) -> None:
1010-
if _CDP_EVALUATE_RECOVERY_THRESHOLD <= 0 or browser_session is None:
1060+
label = str(trace_label or "")
1061+
threshold = (
1062+
_FEIGE_CDP_EVALUATE_RECOVERY_THRESHOLD
1063+
if label.startswith("feige_")
1064+
else _CDP_EVALUATE_RECOVERY_THRESHOLD
1065+
)
1066+
if threshold <= 0 or browser_session is None:
10111067
return
10121068
session_key = id(browser_session)
10131069
with _CDP_EVALUATE_TIMEOUT_RECOVERY_LOCK:
10141070
count = _CDP_EVALUATE_TIMEOUT_RECOVERY.get(session_key, 0) + 1
1015-
if count < _CDP_EVALUATE_RECOVERY_THRESHOLD:
1071+
if count < threshold:
10161072
_CDP_EVALUATE_TIMEOUT_RECOVERY[session_key] = count
10171073
return
10181074
_CDP_EVALUATE_TIMEOUT_RECOVERY.pop(session_key, None)
@@ -1341,6 +1397,10 @@ async def _run_with_optional_operation_lock() -> Any:
13411397
timings["pending_pruned_on_timeout"] = _prune_cdp_pending_requests(
13421398
cdp_client_ref
13431399
)
1400+
if str(trace_label or "").startswith("feige_"):
1401+
mark_feige_cdp_unhealthy(
1402+
f"{trace_label or 'feige'}:{current_phase}:timeout"
1403+
)
13441404
_record_cdp_evaluate_recovery_signal(browser_session, trace_label, current_phase)
13451405
_emit_trace(
13461406
ok=False,
@@ -1354,6 +1414,8 @@ async def _run_with_optional_operation_lock() -> Any:
13541414
except Exception as exc:
13551415
_emit_trace(ok=False, timed_out=False, error=str(exc))
13561416
raise
1417+
if str(trace_label or "").startswith("feige_"):
1418+
mark_feige_cdp_healthy()
13571419
_emit_trace(ok=True, timed_out=False)
13581420
value = result.get("result", {}).get("value", "")
13591421
if isinstance(value, str):
@@ -3259,6 +3321,19 @@ async def feige_list_sessions(params: FeigeListSessionsAction, browser_session:
32593321
)
32603322
async def feige_open_session(params: FeigeOpenSessionAction, browser_session: BrowserSession) -> ActionResult:
32613323
try:
3324+
cooldown_remaining = feige_cdp_health_cooldown_remaining()
3325+
if cooldown_remaining > 0.0:
3326+
logger.warning(
3327+
f"[Feige] feige_open_session: CDP health cooldown active "
3328+
f"for {cooldown_remaining:.1f}s; skipping open for "
3329+
f"{str(params.customer_name or '')!r}"
3330+
)
3331+
return ActionResult(
3332+
error=(
3333+
"feige_open_session: cdp_health_cooldown_active "
3334+
f"{cooldown_remaining:.1f}s"
3335+
)
3336+
)
32623337
name_js = json.dumps(params.customer_name, ensure_ascii=False) if params.customer_name else "null"
32633338
idx_js = str(params.session_index) if params.session_index is not None else "-1"
32643339
js = _FEIGE_OPEN_SESSION_JS.replace("CUSTOMER_NAME", name_js).replace("SESSION_INDEX", idx_js)
@@ -3892,10 +3967,13 @@ async def feige_send_message(params: FeigeSendMessageAction, browser_session: Br
38923967
response_preview=str(getattr(params, "text", "") or ""),
38933968
response_len=len(str(getattr(params, "text", "") or "")),
38943969
)
3895-
cooldown_remaining = _feige_send_cdp_timeout_remaining()
3970+
cooldown_remaining = max(
3971+
_feige_send_cdp_timeout_remaining(),
3972+
feige_cdp_health_cooldown_remaining(),
3973+
)
38963974
if cooldown_remaining > 0.0:
38973975
logger.warning(
3898-
f"[Feige] feige_send_message: CDP timeout cooldown active "
3976+
f"[Feige] feige_send_message: CDP cooldown active "
38993977
f"for {cooldown_remaining:.1f}s; skipping send for "
39003978
f"{expected_customer!r}"
39013979
)

agent/ec_skills/browser_use_extension/hooks/external/feige_chat/front_desk_hot_path_v2.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,48 @@ def _ledger(
569569
typing_acquired=bool(outcome.typing_acquired),
570570
)
571571

572+
if not outcome.ok and outcome.reason == "cdp_health_cooldown_active":
573+
if claim_active:
574+
try:
575+
_ds.unclaim_send_for_turn(
576+
claim_cust,
577+
claim_reply,
578+
claim_source_msg_id,
579+
)
580+
except Exception:
581+
pass
582+
claim_active = False
583+
try:
584+
defer_cust = ctx.normalize_dispatch_identity_key(
585+
payload.get("customer_name")
586+
or payload.get("customer_id")
587+
or ""
588+
)
589+
if defer_cust:
590+
ctx.dispatch_state.clear_inflight(defer_cust)
591+
except Exception:
592+
pass
593+
_ledger(
594+
"hot_path_b_cdp_health_deferred",
595+
reason=str(outcome.reason or ""),
596+
last_tool_error=str(outcome.last_tool_error or ""),
597+
cooldown_remaining_s=outcome.extras.get("cooldown_remaining_s"),
598+
level=logging.WARNING,
599+
)
600+
state.setdefault("result", {})["llm_result"] = {
601+
"all_done": False,
602+
"work_done": False,
603+
"hot_path": True,
604+
"hot_path_type": "cdp_cooldown_deferred",
605+
"hot_path_reason": str(outcome.reason or ""),
606+
"last_tool_error": str(outcome.last_tool_error or ""),
607+
}
608+
logger.warning(
609+
f"[HOT-PATH-B-V2] deferred send while Feige CDP health "
610+
f"cooldown is active, node={ctx.node_name}"
611+
)
612+
return state
613+
572614
if not outcome.ok and outcome.reason == "stale_reply_source_msg_id":
573615
# Keep the recent-send claim so this stale response is not
574616
# replayed, but avoid clearing a newer dispatch lock if the

agent/ec_skills/browser_use_extension/hooks/external/feige_chat/hot_path_v2.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,17 @@ def verify_customer_match(data: dict, expected: str) -> tuple[bool, str]: # typ
125125
return (active == str(expected or "").strip(), f"legacy-active={active!r}")
126126

127127

128+
def _feige_cdp_health_cooldown_remaining() -> float:
    """Best-effort probe of the Feige CDP health cooldown.

    Imports ``extension_tools_service`` at call time and asks its
    ``feige_cdp_health_cooldown_remaining`` for the remaining seconds.

    Returns:
        The reported value clamped to be non-negative, or 0.0 when the module
        cannot be imported, the attribute is missing or not callable, or the
        call or float conversion raises.
    """
    try:
        from agent.ec_skills.browser_use_extension import extension_tools_service as _ets
        probe = getattr(_ets, "feige_cdp_health_cooldown_remaining", None)
        if not callable(probe):
            return 0.0
        return max(0.0, float(probe()))
    except Exception:
        # Deliberately swallowed: a failed probe is treated as "no cooldown".
        return 0.0
137+
138+
128139
# ============================================================================
129140
# ToolInvoker Protocol — replaces legacy ``actions_registry`` + ``inspect``
130141
# ============================================================================
@@ -671,6 +682,17 @@ async def execute_v2(
671682
them affect the decision tree.
672683
"""
673684
outcome = HotPathOutcomeV2()
685+
cooldown_remaining = _feige_cdp_health_cooldown_remaining()
686+
if cooldown_remaining > 0.0:
687+
outcome.ok = False
688+
outcome.reason = "cdp_health_cooldown_active"
689+
outcome.last_tool_error = f"cdp_health_cooldown_active {cooldown_remaining:.1f}s"
690+
outcome.extras["cooldown_remaining_s"] = round(cooldown_remaining, 3)
691+
logger.warning(
692+
f"[hot_path_v2] Feige CDP health cooldown active for "
693+
f"{cooldown_remaining:.1f}s; deferring guarded send, node={node_name}"
694+
)
695+
return outcome
674696
outcome.typing_acquired = await _acquire_typing_lock(
675697
typing_lock, customer_key, node_name,
676698
)

0 commit comments

Comments
 (0)