fix(kb): address PR xorbitsai#202 review — harden replace_chunks and add ingestion lock

sqhyz55 · sqhyz55 · commit cbe7aca03ae7 · 2026-05-12T18:27:08.000+08:00
- Reject empty replace_scope to prevent unbounded deletion (comment 1)
- Use merge_insert instead of table.add for idempotent upsert (comment 2)
- Validate chunk_id presence before write to fail fast (comment 3)
- Fix docstring to match insert-before-delete order (comment 4)
- Cascade-delete orphaned embeddings rows after chunk replacement (comment 5)
- Allow replace_chunks with empty records to clean stale data (comment 6)
- Add per-document threading lock in process_document to prevent
  concurrent chunk replacement races (comment 7)
- Update tests for merge_insert and add ingestion lock tests
diff --git a/src/xagent/core/tools/core/RAG_tools/chunk/chunk_document.py b/src/xagent/core/tools/core/RAG_tools/chunk/chunk_document.py
@@ -88,11 +88,10 @@ def chunk_document(
     Chunk parsed paragraphs and write to chunks table.
 
     Concurrency note:
-        This function uses **last-write-wins** semantics. If two concurrent
-        calls target the same (collection, doc_id, parse_hash) with different
-        parameters, the last one to reach ``_write_chunks_to_db`` will replace
-        the other's output. Callers requiring mutual exclusion should hold a
-        collection-level lock before invoking this function.
+        This function uses **last-write-wins** semantics. Within a single
+        process, ``process_document`` serialises ingestion per (collection,
+        source_path) via ``_get_ingestion_lock``; direct callers of this
+        function, and other processes, are not covered by that lock.
 
     Args:
         collection: Collection name for data isolation
@@ -512,9 +511,6 @@ def _write_chunks_to_db(
             }
             rows.append(row)
 
-        if not rows:
-            return False
-
         vector_store = get_vector_index_store()
         vector_store.replace_chunks(
             rows,
@@ -527,6 +523,9 @@ def _write_chunks_to_db(
             is_admin=is_admin,
         )
 
+        if not rows:
+            return False
+
         logger.info(
             "Chunk records written to database: doc_id=%s, parse_hash=%s, config_hash=%s",
             doc_id,
diff --git a/src/xagent/core/tools/core/RAG_tools/pipelines/document_ingestion.py b/src/xagent/core/tools/core/RAG_tools/pipelines/document_ingestion.py
@@ -5,6 +5,7 @@
 import asyncio
 import logging
 import os
+import threading
 import time
 from contextlib import contextmanager
 from pathlib import Path
@@ -62,6 +63,24 @@
 
 logger = logging.getLogger(__name__)
 
+# Process-local ingestion locks keyed by (collection, source_path).
+# Serialise same-document ingestion across threads to avoid data loss in
+# the chunk replacement flow (see PR #202 comment 7); no cross-process effect.
+_ingestion_locks: Dict[tuple, threading.Lock] = {}
+_ingestion_locks_guard = threading.Lock()
+
+
+def _get_ingestion_lock(collection: str, source_path: str) -> threading.Lock:
+    """Return the lock shared by every caller ingesting (collection, source_path)."""
+    key = (collection, source_path)
+    # Single .get(): no KeyError if the registry is cleared between check and read.
+    lock = _ingestion_locks.get(key)
+    if lock is None:
+        with _ingestion_locks_guard:  # setdefault keeps a racing creator's lock
+            lock = _ingestion_locks.setdefault(key, threading.Lock())
+    return lock
+
+
 _SPREADSHEET_EXTENSIONS = {".xlsx", ".xls", ".csv"}
 _SPREADSHEET_CHUNK_SIZE_TOKENS = 512
 _SPREADSHEET_CHUNK_OVERLAP_TOKENS = 64
@@ -469,7 +488,33 @@ def process_document(
           inputs will reuse existing records when possible.
         - Downstream API layers should surface `result.failed_step` and
           `result.warnings` to callers for better observability.
+        - A per-document lock serialises concurrent calls for the same
+          (collection, source_path) to prevent chunk replacement races.
     """
+    lock = _get_ingestion_lock(collection, source_path)
+    with lock:
+        return _process_document_impl(
+            collection,
+            source_path,
+            config=config,
+            progress_manager=progress_manager,
+            user_id=user_id,
+            is_admin=is_admin,
+            file_id=file_id,
+        )
+
+
+def _process_document_impl(
+    collection: str,
+    source_path: str,
+    *,
+    config: Optional[IngestionConfig] = None,
+    progress_manager: Optional[ProgressManager] = None,
+    user_id: Optional[int] = None,
+    is_admin: bool = False,
+    file_id: Optional[str] = None,
+) -> IngestionResult:
+    """Internal implementation of process_document (runs under per-document lock)."""
     cfg = _apply_spreadsheet_ingestion_safeguards(
         coerce_ingestion_config(config),
         source_path,
diff --git a/src/xagent/core/tools/core/RAG_tools/storage/contracts.py b/src/xagent/core/tools/core/RAG_tools/storage/contracts.py
@@ -722,19 +722,31 @@ def replace_chunks(
         user_id: Optional[int] = None,
         is_admin: bool = False,
     ) -> None:
-        """Replace chunk records within a scope (delete old + insert new).
+        """Replace chunk records within a scope (insert new, then delete old).
 
-        Deletes all existing chunk rows matching *replace_scope* (and tenancy
-        filters), then inserts *records*. This guarantees that re-chunking with
-        different parameters does not leave stale rows from a previous
-        configuration (issue #199).
+        Inserts *records* first via idempotent merge_insert, then deletes rows
+        matching *replace_scope* that do not belong to the new generation.
+        This insert-before-delete order avoids data loss if the process crashes
+        between the two operations (worst case: brief duplicate data, not zero
+        data). Guarantees that re-chunking with different parameters does not
+        leave stale rows from a previous configuration (issue #199).
+
+        After updating the chunks table, cascade-deletes orphaned rows from
+        all ``embeddings_*`` tables whose chunk_id no longer belongs to the
+        new generation, preventing stale embeddings from appearing in search
+        results.
 
         Args:
-            records: New chunk records to insert after deletion.
-            replace_scope: Filter dict (e.g. collection, doc_id, parse_hash)
-                identifying rows to delete before inserting.
+            records: New chunk records to insert. Each record must contain a
+                ``chunk_id`` field.
+            replace_scope: Non-empty filter dict (e.g. collection, doc_id,
+                parse_hash) identifying the scope of rows to replace.
             user_id: Optional user ID for multi-tenancy scoped deletion.
             is_admin: Whether the caller can operate across tenants.
+
+        Raises:
+            ValueError: If *replace_scope* is empty or contains disallowed
+                keys, or if any record is missing ``chunk_id``.
         """
 
     @abstractmethod
diff --git a/src/xagent/core/tools/core/RAG_tools/storage/lancedb_stores.py b/src/xagent/core/tools/core/RAG_tools/storage/lancedb_stores.py
@@ -1460,26 +1460,42 @@ def replace_chunks(
         user_id: Optional[int] = None,
         is_admin: bool = False,
     ) -> None:
-        """Replace chunk records within a scope (delete old + insert new).
+        """Replace chunk records within a scope (insert new, then delete old).
 
-        Safety: inserts new records first, then deletes rows that do NOT belong
-        to the new generation. This avoids data loss if the process crashes
+        Inserts *records* first via idempotent merge_insert, then deletes rows
+        matching *replace_scope* that do not belong to the new generation.
+        This insert-before-delete order avoids data loss if the process crashes
         between the two operations (worst case: brief duplicate data, not zero
-        data).
+        data). Also cascade-deletes orphaned embedding rows from all
+        ``embeddings_*`` tables.
         """
         from ..LanceDB.schema_manager import _safe_close_table, ensure_chunks_table
 
+        if not replace_scope:
+            raise ValueError("replace_scope must not be empty")
+
         for key in replace_scope:
             if key not in self._REPLACE_CHUNKS_ALLOWED_KEYS:
                 raise ValueError(f"Invalid replace_scope column: {key}")
 
+        if records:
+            missing = [i for i, r in enumerate(records) if "chunk_id" not in r]
+            if missing:
+                raise ValueError(
+                    f"records at index {missing} missing required 'chunk_id' field"
+                )
+
         conn = self._get_connection()
         ensure_chunks_table(conn)
         table = conn.open_table("chunks")
         try:
-            # Step 1: Insert new records (safe — idempotent add)
+            # Step 1: Upsert new records (merge_insert is idempotent on retry)
             if records:
-                table.add(records)
+                table.merge_insert(
+                    ["collection", "doc_id", "parse_hash", "chunk_id"]
+                ).when_matched_update_all().when_not_matched_insert_all().execute(
+                    records
+                )
 
             # Step 2: Build delete filter targeting old generations
             scope_parts = [
@@ -1510,6 +1526,48 @@ def replace_chunks(
             _safe_close_table(table)
         self.invalidate_table_cache("chunks")
 
+        # Step 3: Cascade-delete orphaned embeddings for the same scope.
+        # After chunk replacement, old chunk_ids no longer exist in the chunks
+        # table but their embedding rows would still be searchable.
+        embed_scope_parts = [
+            f"{k} == '{escape_lancedb_string(str(v))}'"
+            for k, v in replace_scope.items()
+            if k in ("collection", "doc_id", "parse_hash")
+        ]
+        if not embed_scope_parts:
+            return
+
+        embed_base = " AND ".join(embed_scope_parts)
+        embed_user = UserPermissions.get_user_filter(user_id, is_admin)
+        if embed_base and embed_user:
+            embed_filter = f"({embed_base}) AND ({embed_user})"
+        elif embed_user:
+            embed_filter = embed_user
+        else:
+            embed_filter = embed_base
+
+        if records:
+            new_ids = [r["chunk_id"] for r in records]
+            id_list = ", ".join(f"'{escape_lancedb_string(cid)}'" for cid in new_ids)
+            embed_filter = f"({embed_filter}) AND chunk_id NOT IN ({id_list})"
+
+        for tname in self.list_table_names():
+            if not tname.startswith("embeddings_"):
+                continue
+            try:
+                etable = conn.open_table(tname)
+                try:
+                    etable.delete(embed_filter)
+                finally:
+                    _safe_close_table(etable)
+            except Exception:  # noqa: BLE001
+                logger.warning(
+                    "Failed to cascade-delete embeddings from %s",
+                    tname,
+                    exc_info=True,
+                )
+        self.invalidate_table_cache()
+
     def upsert_embeddings(self, model_tag: str, records: List[Dict[str, Any]]) -> None:
         """Upsert embedding records to LanceDB with fallback pattern.
 
diff --git a/tests/core/tools/core/RAG_tools/pipelines/test_document_ingestion.py b/tests/core/tools/core/RAG_tools/pipelines/test_document_ingestion.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import threading
+import time
 from typing import Dict, List, Union
 
 import pytest
@@ -972,3 +974,117 @@ def test_process_document_rejects_absolute_paths_outside_allowed_dir(
             or "permission" in message_lower
             or "denied" in message_lower
         ), f"Unexpected error message for {abs_path}: {result.message}"
+
+
+# ---------------------------------------------------------------------------
+# Ingestion lock tests (PR #202 comment 7)
+# ---------------------------------------------------------------------------
+
+
+def test_get_ingestion_lock_returns_same_instance() -> None:
+    """Repeated lookups of one (collection, source_path) key share a single Lock."""
+    document_ingestion._ingestion_locks.clear()
+    key = ("col", "/tmp/a.pdf")
+    first, second = (document_ingestion._get_ingestion_lock(*key) for _ in range(2))
+    assert first is second
+
+
+def test_get_ingestion_lock_different_keys() -> None:
+    """Every distinct (collection, source_path) pair gets its own lock."""
+    document_ingestion._ingestion_locks.clear()
+    keys = [("col", "/tmp/a.pdf"), ("col", "/tmp/b.pdf"), ("other", "/tmp/a.pdf")]
+    locks = [document_ingestion._get_ingestion_lock(*key) for key in keys]
+    # Compare identities across the whole set so all three pairs are
+    # checked (a-vs-b, a-vs-c and b-vs-c).
+    assert len({id(lock) for lock in locks}) == len(keys)
+
+
+def test_ingestion_lock_serialises_same_document(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Concurrent process_document calls on one document never overlap."""
+    document_ingestion._ingestion_locks.clear()
+
+    real_impl = document_ingestion._process_document_impl
+    events: List[str] = []
+
+    def _traced_impl(*args: object, **kwargs: object) -> IngestionResult:
+        events.append("enter")
+        # Hold the critical section long enough for an unserialised second
+        # caller to log its own "enter" before this "exit".
+        time.sleep(0.1)
+        events.append("exit")
+        return real_impl(*args, **kwargs)
+
+    _patch_pipeline_dependencies(monkeypatch)
+    monkeypatch.setattr(document_ingestion, "_process_document_impl", _traced_impl)
+
+    outcomes: List[IngestionResult] = [None, None]  # type: ignore[list-item]
+
+    def _ingest(slot: int) -> None:
+        outcomes[slot] = document_ingestion.process_document(
+            collection="demo",
+            source_path="/tmp/doc.pdf",
+            config=IngestionConfig(),
+        )
+
+    workers = [threading.Thread(target=_ingest, args=(slot,)) for slot in (0, 1)]
+    for worker in workers:
+        worker.start()
+    for worker in workers:
+        worker.join(timeout=10)
+
+    # Serialised runs log enter/exit pairs back to back; an interleaved
+    # [enter, enter, ...] sequence would mean the two calls overlapped.
+    assert len(events) == 4
+    assert events == ["enter", "exit", "enter", "exit"]
+
+
+def test_ingestion_lock_allows_different_documents_concurrently(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Ingesting two distinct documents proceeds in parallel.
+
+    Each document has its own lock, so the second call must reach the
+    implementation while the first is still parked inside it.
+    """
+    document_ingestion._ingestion_locks.clear()
+
+    reached_impl = threading.Event()
+    release = threading.Event()
+    real_impl = document_ingestion._process_document_impl
+
+    def _parked_impl(*args: object, **kwargs: object) -> IngestionResult:
+        # Signal arrival, then hold the per-document lock until released.
+        reached_impl.set()
+        release.wait(timeout=5)
+        return real_impl(*args, **kwargs)
+
+    _patch_pipeline_dependencies(monkeypatch)
+    monkeypatch.setattr(document_ingestion, "_process_document_impl", _parked_impl)
+
+    def _ingest(path: str) -> None:
+        document_ingestion.process_document(
+            collection="demo",
+            source_path=path,
+            config=IngestionConfig(),
+        )
+
+    first = threading.Thread(target=_ingest, args=("/tmp/a.pdf",))
+    second = threading.Thread(target=_ingest, args=("/tmp/b.pdf",))
+
+    first.start()
+    # The first call must be inside the implementation before the second starts.
+    assert reached_impl.wait(timeout=5)
+    reached_impl.clear()
+
+    second.start()
+    # A different document takes a different lock, so this wait should succeed
+    # even though the first call is still holding its own lock.
+    second_reached = reached_impl.wait(timeout=2)
+
+    # Unblock both calls before asserting so no thread is left parked.
+    release.set()
+    for worker in (first, second):
+        worker.join(timeout=5)
+    assert second_reached, "Different documents should not block each other"
diff --git a/tests/core/tools/core/RAG_tools/storage/test_lancedb_stores.py b/tests/core/tools/core/RAG_tools/storage/test_lancedb_stores.py