Skip to content

Commit def7aa8

Browse files
committed
fix(sanitization): Centralizing DB Filters
1 parent 563202a commit def7aa8

File tree

4 files changed

+316
-32
lines changed

4 files changed

+316
-32
lines changed

backend/onyx/background/indexing/run_docfetching.py

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
from onyx.file_store.document_batch_storage import get_document_batch_storage
5959
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
6060
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
61+
from onyx.indexing.postgres_sanitization import sanitize_document_for_postgres
62+
from onyx.indexing.postgres_sanitization import sanitize_hierarchy_nodes_for_postgres
6163
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
6264
from onyx.redis.redis_hierarchy import ensure_source_node_exists
6365
from onyx.redis.redis_hierarchy import get_node_id_from_raw_id
@@ -156,36 +158,7 @@ def strip_null_characters(doc_batch: list[Document]) -> list[Document]:
156158
logger.warning(
157159
f"doc {doc.id} too large, Document size: {sys.getsizeof(doc)}"
158160
)
159-
cleaned_doc = doc.model_copy()
160-
161-
# Postgres cannot handle NUL characters in text fields
162-
if "\x00" in cleaned_doc.id:
163-
logger.warning(f"NUL characters found in document ID: {cleaned_doc.id}")
164-
cleaned_doc.id = cleaned_doc.id.replace("\x00", "")
165-
166-
if cleaned_doc.title and "\x00" in cleaned_doc.title:
167-
logger.warning(
168-
f"NUL characters found in document title: {cleaned_doc.title}"
169-
)
170-
cleaned_doc.title = cleaned_doc.title.replace("\x00", "")
171-
172-
if "\x00" in cleaned_doc.semantic_identifier:
173-
logger.warning(
174-
f"NUL characters found in document semantic identifier: {cleaned_doc.semantic_identifier}"
175-
)
176-
cleaned_doc.semantic_identifier = cleaned_doc.semantic_identifier.replace(
177-
"\x00", ""
178-
)
179-
180-
for section in cleaned_doc.sections:
181-
if section.link is not None:
182-
section.link = section.link.replace("\x00", "")
183-
184-
# since text can be longer, just replace to avoid double scan
185-
if isinstance(section, TextSection) and section.text is not None:
186-
section.text = section.text.replace("\x00", "")
187-
188-
cleaned_batch.append(cleaned_doc)
161+
cleaned_batch.append(sanitize_document_for_postgres(doc))
189162

190163
return cleaned_batch
191164

@@ -602,10 +575,13 @@ def connector_document_extraction(
602575

603576
# Process hierarchy nodes batch - upsert to Postgres and cache in Redis
604577
if hierarchy_node_batch:
578+
hierarchy_node_batch_cleaned = (
579+
sanitize_hierarchy_nodes_for_postgres(hierarchy_node_batch)
580+
)
605581
with get_session_with_current_tenant() as db_session:
606582
upserted_nodes = upsert_hierarchy_nodes_batch(
607583
db_session=db_session,
608-
nodes=hierarchy_node_batch,
584+
nodes=hierarchy_node_batch_cleaned,
609585
source=db_connector.source,
610586
commit=True,
611587
is_connector_public=is_connector_public,
@@ -624,7 +600,7 @@ def connector_document_extraction(
624600
)
625601

626602
logger.debug(
627-
f"Persisted and cached {len(hierarchy_node_batch)} hierarchy nodes "
603+
f"Persisted and cached {len(hierarchy_node_batch_cleaned)} hierarchy nodes "
628604
f"for attempt={index_attempt_id}"
629605
)
630606

backend/onyx/indexing/indexing_pipeline.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from onyx.indexing.models import DocAwareChunk
5050
from onyx.indexing.models import IndexingBatchAdapter
5151
from onyx.indexing.models import UpdatableChunkData
52+
from onyx.indexing.postgres_sanitization import sanitize_documents_for_postgres
5253
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
5354
from onyx.llm.factory import get_default_llm_with_vision
5455
from onyx.llm.factory import get_llm_for_contextual_rag
@@ -228,6 +229,8 @@ def index_doc_batch_prepare(
228229
) -> DocumentBatchPrepareContext | None:
229230
"""Sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
230231
This precedes indexing it into the actual document index."""
232+
documents = sanitize_documents_for_postgres(documents)
233+
231234
# Create a trimmed list of docs that don't have a newer updated at
232235
# Shortcuts the time-consuming flow on connector index retries
233236
document_ids: list[str] = [document.id for document in documents]
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
from typing import Any
2+
3+
from onyx.access.models import ExternalAccess
4+
from onyx.connectors.models import BasicExpertInfo
5+
from onyx.connectors.models import Document
6+
from onyx.connectors.models import HierarchyNode
7+
8+
9+
def _sanitize_string(value: str) -> str:
10+
return value.replace("\x00", "")
11+
12+
13+
def _sanitize_json_like(value: Any) -> Any:
14+
if isinstance(value, str):
15+
return _sanitize_string(value)
16+
17+
if isinstance(value, list):
18+
return [_sanitize_json_like(item) for item in value]
19+
20+
if isinstance(value, tuple):
21+
return tuple(_sanitize_json_like(item) for item in value)
22+
23+
if isinstance(value, dict):
24+
sanitized: dict[Any, Any] = {}
25+
for key, nested_value in value.items():
26+
cleaned_key = _sanitize_string(key) if isinstance(key, str) else key
27+
sanitized[cleaned_key] = _sanitize_json_like(nested_value)
28+
return sanitized
29+
30+
return value
31+
32+
33+
def _sanitize_expert_info(expert: BasicExpertInfo) -> BasicExpertInfo:
    """Return a copy of *expert* with NUL bytes stripped from all text fields.

    Fields that are None stay None; the original object is not mutated.
    """
    text_fields = (
        "display_name",
        "first_name",
        "middle_initial",
        "last_name",
        "email",
    )
    updates = {}
    for field_name in text_fields:
        current = getattr(expert, field_name)
        updates[field_name] = (
            None if current is None else _sanitize_string(current)
        )
    return expert.model_copy(update=updates)
61+
62+
63+
def _sanitize_external_access(external_access: ExternalAccess) -> ExternalAccess:
    """Rebuild an ExternalAccess with NUL bytes stripped from every email
    and group id; the is_public flag is carried over unchanged."""
    cleaned_emails = set(map(_sanitize_string, external_access.external_user_emails))
    cleaned_group_ids = set(
        map(_sanitize_string, external_access.external_user_group_ids)
    )
    return ExternalAccess(
        external_user_emails=cleaned_emails,
        external_user_group_ids=cleaned_group_ids,
        is_public=external_access.is_public,
    )
74+
75+
76+
def sanitize_document_for_postgres(document: Document) -> Document:
    """Return a deep copy of *document* with NUL (0x00) bytes removed from
    every string field that is later persisted to Postgres.

    Postgres rejects NUL bytes in ``text`` columns, so ids, titles, section
    text/links, metadata, owner info, and external-access identifiers are all
    cleaned. The input document is never mutated.
    """
    cleaned_doc = document.model_copy(deep=True)

    cleaned_doc.id = _sanitize_string(cleaned_doc.id)
    cleaned_doc.semantic_identifier = _sanitize_string(cleaned_doc.semantic_identifier)
    if cleaned_doc.title is not None:
        cleaned_doc.title = _sanitize_string(cleaned_doc.title)
    if cleaned_doc.parent_hierarchy_raw_node_id is not None:
        cleaned_doc.parent_hierarchy_raw_node_id = _sanitize_string(
            cleaned_doc.parent_hierarchy_raw_node_id
        )

    # Delegate to the recursive JSON sanitizer instead of assuming every
    # metadata value is a str or list[str]: it cleans keys, nested strings,
    # and list items, and passes non-string scalars (ints, bools, None)
    # through unchanged rather than raising AttributeError on `.replace`.
    cleaned_doc.metadata = _sanitize_json_like(cleaned_doc.metadata)

    if cleaned_doc.doc_metadata is not None:
        cleaned_doc.doc_metadata = _sanitize_json_like(cleaned_doc.doc_metadata)

    if cleaned_doc.primary_owners is not None:
        cleaned_doc.primary_owners = [
            _sanitize_expert_info(expert) for expert in cleaned_doc.primary_owners
        ]
    if cleaned_doc.secondary_owners is not None:
        cleaned_doc.secondary_owners = [
            _sanitize_expert_info(expert) for expert in cleaned_doc.secondary_owners
        ]

    if cleaned_doc.external_access is not None:
        cleaned_doc.external_access = _sanitize_external_access(
            cleaned_doc.external_access
        )

    # Sections may carry text, a link, and/or an image file id — clean
    # whichever string attributes are present.
    for section in cleaned_doc.sections:
        if section.link is not None:
            section.link = _sanitize_string(section.link)
        if section.text is not None:
            section.text = _sanitize_string(section.text)
        if section.image_file_id is not None:
            section.image_file_id = _sanitize_string(section.image_file_id)

    return cleaned_doc
123+
124+
125+
def sanitize_documents_for_postgres(documents: list[Document]) -> list[Document]:
    """Sanitize each document in *documents* for Postgres storage."""
    return list(map(sanitize_document_for_postgres, documents))
127+
128+
129+
def sanitize_hierarchy_node_for_postgres(node: HierarchyNode) -> HierarchyNode:
    """Return a deep copy of *node* with NUL (0x00) bytes removed from all
    string fields; optional fields that are None stay None."""
    cleaned = node.model_copy(deep=True)

    cleaned.raw_node_id = _sanitize_string(cleaned.raw_node_id)
    cleaned.display_name = _sanitize_string(cleaned.display_name)
    cleaned.raw_parent_id = (
        None
        if cleaned.raw_parent_id is None
        else _sanitize_string(cleaned.raw_parent_id)
    )
    cleaned.link = None if cleaned.link is None else _sanitize_string(cleaned.link)

    if cleaned.external_access is not None:
        cleaned.external_access = _sanitize_external_access(cleaned.external_access)

    return cleaned
145+
146+
147+
def sanitize_hierarchy_nodes_for_postgres(
    nodes: list[HierarchyNode],
) -> list[HierarchyNode]:
    """Sanitize each hierarchy node in *nodes* for Postgres storage."""
    return list(map(sanitize_hierarchy_node_for_postgres, nodes))
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from onyx.access.models import ExternalAccess
2+
from onyx.connectors.models import BasicExpertInfo
3+
from onyx.connectors.models import Document
4+
from onyx.connectors.models import DocumentSource
5+
from onyx.connectors.models import HierarchyNode
6+
from onyx.connectors.models import IndexAttemptMetadata
7+
from onyx.connectors.models import TextSection
8+
from onyx.db.enums import HierarchyNodeType
9+
from onyx.indexing import indexing_pipeline
10+
from onyx.indexing.postgres_sanitization import sanitize_document_for_postgres
11+
from onyx.indexing.postgres_sanitization import sanitize_hierarchy_node_for_postgres
12+
13+
14+
def test_sanitize_document_for_postgres_removes_nul_bytes() -> None:
    """Every string field of a Document — including nested owners, metadata,
    and external access — comes back NUL-free, and the input is untouched."""
    dirty_doc = Document(
        id="doc\x00-id",
        source=DocumentSource.FILE,
        semantic_identifier="sem\x00-id",
        title="ti\x00tle",
        parent_hierarchy_raw_node_id="parent\x00-id",
        sections=[TextSection(link="lin\x00k", text="te\x00xt")],
        metadata={"ke\x00y": "va\x00lue", "list\x00key": ["a\x00", "b"]},
        doc_metadata={
            "j\x00son": {
                "in\x00ner": "va\x00l",
                "arr": ["x\x00", {"dee\x00p": "y\x00"}],
            }
        },
        primary_owners=[BasicExpertInfo(display_name="Ali\x00ce", email="a\x00@x.com")],
        secondary_owners=[BasicExpertInfo(first_name="Bo\x00b", last_name="Sm\x00ith")],
        external_access=ExternalAccess(
            external_user_emails={"user\x00@example.com"},
            external_user_group_ids={"gro\x00up-1"},
            is_public=False,
        ),
    )

    clean_doc = sanitize_document_for_postgres(dirty_doc)

    # Top-level string fields.
    assert clean_doc.id == "doc-id"
    assert clean_doc.semantic_identifier == "sem-id"
    assert clean_doc.title == "title"
    assert clean_doc.parent_hierarchy_raw_node_id == "parent-id"

    # Section text and links.
    first_section = clean_doc.sections[0]
    assert first_section.link == "link"
    assert first_section.text == "text"

    # Metadata: keys, string values, and list items are all cleaned.
    assert clean_doc.metadata == {"key": "value", "listkey": ["a", "b"]}
    assert clean_doc.doc_metadata == {
        "json": {"inner": "val", "arr": ["x", {"deep": "y"}]}
    }

    # Owner fields.
    assert clean_doc.primary_owners is not None
    primary = clean_doc.primary_owners[0]
    assert primary.display_name == "Alice"
    assert primary.email == "a@x.com"
    assert clean_doc.secondary_owners is not None
    secondary = clean_doc.secondary_owners[0]
    assert secondary.first_name == "Bob"
    assert secondary.last_name == "Smith"

    # External access identifiers.
    assert clean_doc.external_access is not None
    assert clean_doc.external_access.external_user_emails == {"user@example.com"}
    assert clean_doc.external_access.external_user_group_ids == {"group-1"}

    # Sanitization must be copy-on-write: the original stays dirty.
    assert dirty_doc.id == "doc\x00-id"
    assert dirty_doc.metadata == {"ke\x00y": "va\x00lue", "list\x00key": ["a\x00", "b"]}
63+
64+
65+
def test_sanitize_hierarchy_node_for_postgres_removes_nul_bytes() -> None:
    """HierarchyNode string fields and the nested ExternalAccess are
    NUL-stripped by sanitize_hierarchy_node_for_postgres."""
    dirty_node = HierarchyNode(
        raw_node_id="raw\x00-id",
        raw_parent_id="paren\x00t-id",
        display_name="fol\x00der",
        link="https://exa\x00mple.com",
        node_type=HierarchyNodeType.FOLDER,
        external_access=ExternalAccess(
            external_user_emails={"a\x00@example.com"},
            external_user_group_ids={"g\x00-1"},
            is_public=True,
        ),
    )

    clean_node = sanitize_hierarchy_node_for_postgres(dirty_node)

    assert clean_node.raw_node_id == "raw-id"
    assert clean_node.raw_parent_id == "parent-id"
    assert clean_node.display_name == "folder"
    assert clean_node.link == "https://example.com"

    access = clean_node.external_access
    assert access is not None
    assert access.external_user_emails == {"a@example.com"}
    assert access.external_user_group_ids == {"g-1"}
88+
89+
90+
def test_index_doc_batch_prepare_sanitizes_before_db_ops(monkeypatch: object) -> None:
    """End-to-end check that index_doc_batch_prepare sanitizes documents
    BEFORE any DB write: every value captured from the stubbed DB helpers
    must already be NUL-free.

    All DB-touching collaborators on the indexing_pipeline module are
    monkeypatched out, so no real session is needed — db_session is just
    object().
    """
    document = Document(
        id="doc\x00id",
        source=DocumentSource.FILE,
        semantic_identifier="sem\x00id",
        sections=[TextSection(text="content", link="li\x00nk")],
        metadata={"ke\x00y": "va\x00lue"},
    )

    # Collects the arguments the stubs receive so they can be asserted on.
    captured: dict[str, object] = {}

    # No existing documents -> every input doc is treated as updatable.
    def _get_documents_by_ids(db_session: object, document_ids: list[str]) -> list:
        _ = db_session, document_ids
        return []

    monkeypatch.setattr(
        indexing_pipeline, "get_documents_by_ids", _get_documents_by_ids
    )

    # Capture the documents handed to the Postgres upsert.
    def _capture_upsert_documents_in_db(**kwargs: object) -> None:
        captured["upsert_documents"] = kwargs["documents"]

    monkeypatch.setattr(
        indexing_pipeline, "_upsert_documents_in_db", _capture_upsert_documents_in_db
    )

    # NOTE(review): args[3] is presumably the list of document ids passed to
    # upsert_document_by_connector_credential_pair — confirm against that
    # function's positional signature if it changes.
    def _capture_doc_cc_pair(*args: object) -> None:
        captured["cc_pair_doc_ids"] = args[3]

    monkeypatch.setattr(
        indexing_pipeline,
        "upsert_document_by_connector_credential_pair",
        _capture_doc_cc_pair,
    )

    # Hierarchy linking is irrelevant here; report zero links made.
    def _noop_link_hierarchy_nodes_to_documents(
        db_session: object,
        document_ids: list[str],
        source: DocumentSource,
        commit: bool,
    ) -> int:
        _ = db_session, document_ids, source, commit
        return 0

    monkeypatch.setattr(
        indexing_pipeline,
        "link_hierarchy_nodes_to_documents",
        _noop_link_hierarchy_nodes_to_documents,
    )

    # ignore_time_skip forces the doc through even without updated_at info.
    context = indexing_pipeline.index_doc_batch_prepare(
        documents=[document],
        index_attempt_metadata=IndexAttemptMetadata(connector_id=1, credential_id=2),
        db_session=object(),  # type: ignore[arg-type]
        ignore_time_skip=True,
    )

    # The returned context must hold the sanitized document...
    assert context is not None
    assert context.updatable_docs[0].id == "docid"
    assert context.updatable_docs[0].semantic_identifier == "semid"
    assert context.updatable_docs[0].metadata == {"key": "value"}
    # ...and every id/document that reached a (stubbed) DB call was clean.
    assert captured["cc_pair_doc_ids"] == ["docid"]

    upsert_documents = captured["upsert_documents"]
    assert isinstance(upsert_documents, list)
    assert upsert_documents[0].id == "docid"

0 commit comments

Comments
 (0)