Project-N-E-K-O
diff --git a/‎app/memory_server.py‎
Lines changed: 289 additions & 13 deletions b/‎app/memory_server.py‎
Lines changed: 289 additions & 13 deletions
diff --git a/‎config/__init__.py‎
Lines changed: 12 additions & 0 deletions b/‎config/__init__.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎memory/fact_dedup.py‎
Lines changed: 86 additions & 1 deletion b/‎memory/fact_dedup.py‎
Lines changed: 86 additions & 1 deletion
diff --git a/‎memory/facts.py‎
Lines changed: 23 additions & 0 deletions b/‎memory/facts.py‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎memory/outbox.py‎
Lines changed: 54 additions & 4 deletions b/‎memory/outbox.py‎
Lines changed: 54 additions & 4 deletions
@@ -1110,6 +1110,18 @@ def translate_value(val):
 - 命中阈值的条目仍保留 schema_version<2（不静默升版洗白），但被 filter
   排除，让循环把名额匀给其它 v1 条目。dev 可读 logger.debug 看积压。"""
 
+# Shared upper bound on terminal LLM failures for the background memory paths.
+# Per the description string below: once the same input has failed this many
+# times, the caller force-advances its progress marker or dead-letters it, so
+# a poison window / payload cannot stall the whole pipeline. The value is
+# intended to stay aligned with MEMORY_RECHECK_MAX_ATTEMPTS.
+MEMORY_LIVENESS_MAX_ATTEMPTS = 5
+"""LLM 终态失败 N 次后强推 progress marker / dead-letter 的统一上限。
+- 适用场景：所有"同点 input + 无 counter + LLM 永久失败 → 永久卡死"的后台
+  路径。包括 signal extraction path A/B、rebuttal feedback、persona
+  corrections resolve、fact dedup resolve、refine cluster、outbox handler。
+- 治理思路：参考 `MEMORY_RECHECK_MAX_ATTEMPTS` (schema 重判 dead-letter) 的
+  套路，把"同一 cursor / 队头 / cluster_hash / op 反复打 LLM"收敛掉，避免
+  毒窗口 / 毒 payload 让整条 pipeline 哑火。
+- 失败定义：LLM 返 None / 抛异常 / handler raise / parse 失败等终态。
+- 5 跟 `MEMORY_RECHECK_MAX_ATTEMPTS` 同口径——按 40s 一轮算 3 分钟级窗口，
+  跨过偶发 transient failure 够用；再多就属于真正 poison。"""
+
 # ---- Memory: followup picker (memory/reflection.py) ─
 REFLECTION_FOLLOWUP_WEIGHTED = True
 """主动搭话 followup 候选采样是否按 evidence_score 加权随机。
 
@@ -43,6 +43,7 @@
 from typing import TYPE_CHECKING
 
 from memory.embeddings import cosine_similarity
+from memory.facts import safe_int_field
 from utils.cloudsave_runtime import MaintenanceModeError, assert_cloudsave_writable
 from utils.file_utils import (
     atomic_write_json_async,
@@ -375,6 +376,7 @@ async def aresolve(self, name: str) -> int:
             return await self._aresolve_locked(name)
 
     async def _aresolve_locked(self, name: str) -> int:
+        from config import MEMORY_LIVENESS_MAX_ATTEMPTS
         from config.prompts.prompts_memory import get_fact_dedup_prompt
         from utils.language_utils import get_global_language
         from utils.llm_client import create_chat_llm
@@ -384,7 +386,18 @@ async def _aresolve_locked(self, name: str) -> int:
         if not pending:
             return 0
 
-        batch = pending[:FACT_DEDUP_BATCH_LIMIT]
+        # Liveness：过滤已达 MEMORY_LIVENESS_MAX_ATTEMPTS 的 dead-letter pair
+        # （防御性——_abump_dedup_attempts_and_dead_letter_locked 命中阈值时直接
+        # 从 queue 删除，正常路径不会让 attempts ≥ MAX 的 entry 还留着）。
+        batch: list[dict] = []
+        for it in pending:
+            if safe_int_field(it, 'resolve_attempts') >= MEMORY_LIVENESS_MAX_ATTEMPTS:
+                continue
+            batch.append(it)
+            if len(batch) >= FACT_DEDUP_BATCH_LIMIT:
+                break
+        if not batch:
+            return 0
         pairs_text = "\n".join(
             f"[{i}] candidate: {item.get('candidate_text', '')}"
             f" | existing: {item.get('existing_text', '')}"
@@ -421,15 +434,38 @@ async def _aresolve_locked(self, name: str) -> int:
                     "[FactDedup] %s: LLM 返回非数组 (%s)，跳过本轮",
                     name, type(results).__name__,
                 )
+                # Parse 失败也算 attempt（same input → same parse failure）；
+                # 跟 Exception 分支同治。
+                await self._abump_dedup_attempts_and_dead_letter_locked(name, batch)
                 return 0
         except Exception as e:
             logger.warning("[FactDedup] %s: LLM 调用失败: %s", name, e)
+            # Liveness 兜底：给本批 pair bump resolve_attempts；达
+            # MEMORY_LIVENESS_MAX_ATTEMPTS 的 entry 从 queue dead-letter
+            # 丢弃。否则毒 pair（safety filter / prompt 过长 / 永远 parse
+            # 不出来）一直占队头让 dedup 永久卡死。caller (aresolve) 已持
+            # 着 _get_alock，这里走 _locked 变体不再重复获取。
+            await self._abump_dedup_attempts_and_dead_letter_locked(name, batch)
             return 0
 
         applied, processed_keys = await self._aapply_decisions(
             name, batch, results,
         )
 
+        # CodeRabbit: LLM 返了 list 但 ``_aapply_decisions`` 没消费任何 pair
+        # （所有 action 都被 reject = unknown action / missing index / invalid
+        # format 等），processed_keys 为空 → 下面的 ``remaining`` filter 不会
+        # 删任何东西 → 队头同一批 pair 下次 tick 重新喂 LLM 同样输出垃圾 →
+        # 永久卡死。算 attempts 一次（跟 LLM Exception / 非 list 同治）。
+        if not processed_keys:
+            logger.warning(
+                "[FactDedup] %s: LLM 输出 %d 条 action 全部无效（unknown action / "
+                "invalid index / conflict）, batch 无任何 pair 消费，按 attempt 失败计",
+                name, len(results),
+            )
+            await self._abump_dedup_attempts_and_dead_letter_locked(name, batch)
+            return 0
+
         # Read-modify-write the queue so concurrent enqueue calls
         # that landed during the LLM call survive — same shape as
         # PersonaManager._resolve_corrections_locked's processed-keys
@@ -463,6 +499,55 @@ async def _aresolve_locked(self, name: str) -> int:
             )
         return applied
 
+    async def _abump_dedup_attempts_and_dead_letter_locked(
+        self, name: str, batch_items: list[dict],
+    ) -> None:
+        """Liveness fallback for a failed resolve round (caller MUST hold _get_alock).
+
+        Re-reads the pending queue for ``name`` and, for every queued pair
+        whose ``(candidate_id, existing_id)`` key appears in ``batch_items``,
+        increments ``resolve_attempts``. A pair whose new count reaches
+        ``MEMORY_LIVENESS_MAX_ATTEMPTS`` is removed from the queue and a
+        WARNING is logged; every other entry is written back.
+
+        Why: a poison pair that fails the same way on every tick would
+        otherwise stay at the head of the queue and stall dedup for this
+        character. This method acquires no lock itself, which is why the
+        caller has to hold it already.
+
+        Args:
+            name: Character whose pending queue is updated.
+            batch_items: The pairs that were sent in the failed round. Items
+                whose key is ``(None, None)`` are ignored.
+        """
+        from config import MEMORY_LIVENESS_MAX_ATTEMPTS
+        if not batch_items:
+            return
+        # Keys of the pairs to bump; a pair with neither id cannot be matched.
+        target_pairs = {
+            (entry.get('candidate_id'), entry.get('existing_id'))
+            for entry in batch_items
+        } - {(None, None)}
+        if not target_pairs:
+            return
+        # Work from the queue as currently stored, not from the batch snapshot.
+        queue = await self.aload_pending(name)
+        survivors: list[dict] = []
+        dead_lettered = 0
+        for entry in queue:
+            pair = (entry.get('candidate_id'), entry.get('existing_id'))
+            if pair not in target_pairs:
+                # Not part of the failed batch: keep untouched.
+                survivors.append(entry)
+                continue
+            attempts = safe_int_field(entry, 'resolve_attempts') + 1
+            if attempts < MEMORY_LIVENESS_MAX_ATTEMPTS:
+                entry['resolve_attempts'] = attempts
+                survivors.append(entry)
+                continue
+            # Threshold reached: leave the pair out of the rewritten queue.
+            dead_lettered += 1
+            logger.warning(
+                "[FactDedup] %s: dead-letter pair (%s, %s) resolve %d 次失败 ≥ %d，丢弃",
+                name, pair[0], pair[1], attempts, MEMORY_LIVENESS_MAX_ATTEMPTS,
+            )
+        saved = await self._asave_pending(name, survivors)
+        if not saved:
+            logger.debug(
+                "[FactDedup] %s: 维护态跳过 dedup attempts 写盘", name,
+            )
+        elif dead_lettered:
+            logger.info(
+                "[FactDedup] %s: dead-letter 丢弃 %d 对 dedup pair，剩余队列 %d 条",
+                name, dead_lettered, len(survivors),
+            )
+
     # Whitelist of action vocabulary the LLM may return. Anything
     # outside this set (case mismatch, trailing whitespace, localised
     # synonym) is treated as malformed and the queue entry is
 
@@ -74,6 +74,29 @@ def safe_importance(f: dict, default: int = 5) -> int:
         return default
 
 
+def safe_int_field(d: dict, key: str, default: int = 0) -> int:
+    """Defensively coerce ``d[key]`` to ``int`` (Codex P2 on PR #1412).
+
+    The liveness attempt counters (``refine_attempts`` / ``resolve_attempts``
+    / ``_attempt_count``) are read from dicts deserialized from JSON / ndjson.
+    Manual edits, legacy data or migration noise can leave dirty values there
+    (``""``, ``"unknown"``, a list, a dict, a non-finite float). A bare
+    ``int(d.get(key, 0) or 0)`` would raise on those and abort the whole
+    candidate-gathering pass, turning the liveness safeguard into a new
+    liveness gap.
+
+    Unlike ``safe_importance``, ``0`` / ``"0"`` are legitimate values here
+    (an attempt counter of zero is valid) and are returned as 0 rather than
+    replaced by ``default``.
+
+    Args:
+        d: Mapping to read from.
+        key: Field name to look up.
+        default: Value returned when the field is missing, ``None``, or
+            cannot be converted.
+
+    Returns:
+        ``int(d[key])`` when the conversion succeeds, otherwise ``default``.
+    """
+    try:
+        val = d.get(key)
+        if val is None:
+            return default
+        return int(val)
+    except (ValueError, TypeError, OverflowError):
+        # OverflowError: int(float("inf")) -- ``json.loads`` accepts the
+        # ``Infinity`` literal, so a non-finite counter can reach this point.
+        return default
+
+
 class FactExtractionFailed(RuntimeError):
     """Stage-1 LLM call exhausted retries (RFC §3.4.2 末段).
 
 
@@ -133,6 +133,24 @@ def append_done(self, name: str, op_id: str) -> None:
         with self._get_lock(name):
             self._write_line(self._outbox_path(name), line)
 
+    def append_attempt(self, name: str, op_id: str) -> None:
+        """Append one ``status='attempt'`` line for ``op_id`` (Site 7 liveness).
+
+        The line records that a handler run for this op failed. Scans count
+        these lines per ``op_id``; according to the design note for this
+        change, the caller (``memory_server._run_outbox_op``) writes a
+        ``done`` line as a dead-letter once the count reaches
+        ``MEMORY_LIVENESS_MAX_ATTEMPTS``, so a poison op does not stay
+        pending forever and block compaction.
+
+        Args:
+            name: Character whose outbox file receives the line.
+            op_id: Identifier of the op whose handler failed.
+        """
+        line = json.dumps(
+            {
+                'op_id': op_id,
+                'status': 'attempt',
+                'ts': datetime.now().isoformat(),
+            },
+            ensure_ascii=False,
+        )
+        # Serialize before taking the lock; only the file append is guarded.
+        with self._get_lock(name):
+            self._write_line(self._outbox_path(name), line)
+
     # ── scan ────────────────────────────────────────────────────
 
     def _read_all_records(self, path: str) -> list[dict]:
@@ -154,12 +172,19 @@ def _read_all_records(self, path: str) -> list[dict]:
         return records
 
     def pending_ops(self, name: str) -> list[dict]:
-        """返回 pending 且无对应 done 的 op 记录（按登记顺序）。"""
+        """返回 pending 且无对应 done 的 op 记录（按登记顺序）。
+
+        每条返回的 record 会附带非持久化字段 ``_attempt_count``（int），
+        scan 时统计的 ``status='attempt'`` 行数。caller 用它判 dead-letter
+        阈值。返回 dict 是 ``_read_all_records`` 当轮 JSON-load 出的新实例，
+        附 ``_attempt_count`` 不会污染磁盘上的 pending 行。
+        """
         path = self._outbox_path(name)
         with self._get_lock(name):
             records = self._read_all_records(path)
 
         pending: dict[str, dict] = {}
+        attempts: dict[str, int] = {}
         for rec in records:
             op_id = rec.get('op_id')
             status = rec.get('status')
@@ -172,20 +197,32 @@ def pending_ops(self, name: str) -> list[dict]:
                 pending[op_id] = rec
             elif status == 'done':
                 pending.pop(op_id, None)
+                attempts.pop(op_id, None)
+            elif status == 'attempt':
+                attempts[op_id] = attempts.get(op_id, 0) + 1
+        for op_id, rec in pending.items():
+            rec['_attempt_count'] = attempts.get(op_id, 0)
         return list(pending.values())
 
     # ── compact ─────────────────────────────────────────────────
 
     def compact(self, name: str) -> int:
-        """重写 outbox.ndjson，只保留未完成的 pending 行。返回丢弃行数。
+        """重写 outbox.ndjson，只保留未完成的 pending 行 + 它们的 attempt 行。
+        返回丢弃行数。
 
         通过 atomic_write_text 原子替换。compact 期间被 lock 阻塞的 append
         会在 rename 完成后继续到新文件。
+
+        Attempt 行处理（Site 7 liveness）：still-pending 的 op 的 attempt
+        行保留（attempt 计数 → 决定 dead-letter 时机的依据，丢了会让重启后
+        计数器归零）；done 的 op 把它对应的 attempt 行也一并丢（done 后就
+        没有人再读 attempt 计数）。
         """
         path = self._outbox_path(name)
         with self._get_lock(name):
             records = self._read_all_records(path)
             pending: dict[str, dict] = {}
+            attempts_by_op: dict[str, list[dict]] = {}
             for rec in records:
                 op_id = rec.get('op_id')
                 status = rec.get('status')
@@ -195,9 +232,19 @@ def compact(self, name: str) -> int:
                     pending[op_id] = rec
                 elif status == 'done':
                     pending.pop(op_id, None)
+                    attempts_by_op.pop(op_id, None)
+                elif status == 'attempt':
+                    attempts_by_op.setdefault(op_id, []).append(rec)
+
+            kept_records: list[dict] = []
+            for rec in pending.values():
+                kept_records.append(rec)
+            for op_id, attempt_recs in attempts_by_op.items():
+                if op_id in pending:
+                    kept_records.extend(attempt_recs)
 
             total_lines = len(records)
-            kept = len(pending)
+            kept = len(kept_records)
             if total_lines == kept:
                 return 0  # 没有可丢弃的行，避免无用 IO
 
@@ -206,7 +253,7 @@ def compact(self, name: str) -> int:
                 atomic_write_text(path, '', encoding='utf-8')
             else:
                 body = '\n'.join(
-                    json.dumps(r, ensure_ascii=False) for r in pending.values()
+                    json.dumps(r, ensure_ascii=False) for r in kept_records
                 ) + '\n'
                 atomic_write_text(path, body, encoding='utf-8')
             return total_lines - kept
@@ -237,6 +284,9 @@ async def aappend_pending(self, name: str, op_type: str, payload: dict) -> str:
     async def aappend_done(self, name: str, op_id: str) -> None:
+        """Async wrapper: run ``append_done(name, op_id)`` via ``asyncio.to_thread``."""
         await asyncio.to_thread(self.append_done, name, op_id)
 
+    async def aappend_attempt(self, name: str, op_id: str) -> None:
+        """Async wrapper: run ``append_attempt(name, op_id)`` via ``asyncio.to_thread``."""
+        await asyncio.to_thread(self.append_attempt, name, op_id)
+
     async def apending_ops(self, name: str) -> list[dict]:
+        """Async wrapper: run ``pending_ops(name)`` via ``asyncio.to_thread`` and return its result."""
         return await asyncio.to_thread(self.pending_ops, name)