|
| 1 | +from onyx.access.models import ExternalAccess |
| 2 | +from onyx.connectors.models import BasicExpertInfo |
| 3 | +from onyx.connectors.models import Document |
| 4 | +from onyx.connectors.models import DocumentSource |
| 5 | +from onyx.connectors.models import HierarchyNode |
| 6 | +from onyx.connectors.models import IndexAttemptMetadata |
| 7 | +from onyx.connectors.models import TextSection |
| 8 | +from onyx.db.enums import HierarchyNodeType |
| 9 | +from onyx.indexing import indexing_pipeline |
| 10 | +from onyx.indexing.postgres_sanitization import sanitize_document_for_postgres |
| 11 | +from onyx.indexing.postgres_sanitization import sanitize_hierarchy_node_for_postgres |
| 12 | + |
| 13 | + |
def test_sanitize_document_for_postgres_removes_nul_bytes() -> None:
    """Check that sanitize_document_for_postgres strips NUL characters.

    A Document is built with a NUL (0x00) character embedded in its id,
    identifiers, title, section fields, metadata keys/values (including
    nested containers), owner info and external-access sets. The sanitized
    result must carry the same values with the NUL characters removed, and
    the input document must be left untouched.
    """
    access = ExternalAccess(
        external_user_emails={"user\x00@example.com"},
        external_user_group_ids={"gro\x00up-1"},
        is_public=False,
    )
    primary_owner = BasicExpertInfo(display_name="Ali\x00ce", email="a\x00@x.com")
    secondary_owner = BasicExpertInfo(first_name="Bo\x00b", last_name="Sm\x00ith")
    section = TextSection(link="lin\x00k", text="te\x00xt")

    dirty_doc = Document(
        id="doc\x00-id",
        source=DocumentSource.FILE,
        semantic_identifier="sem\x00-id",
        title="ti\x00tle",
        parent_hierarchy_raw_node_id="parent\x00-id",
        sections=[section],
        metadata={"ke\x00y": "va\x00lue", "list\x00key": ["a\x00", "b"]},
        doc_metadata={
            "j\x00son": {
                "in\x00ner": "va\x00l",
                "arr": ["x\x00", {"dee\x00p": "y\x00"}],
            }
        },
        primary_owners=[primary_owner],
        secondary_owners=[secondary_owner],
        external_access=access,
    )

    clean_doc = sanitize_document_for_postgres(dirty_doc)

    # Top-level string fields.
    assert clean_doc.id == "doc-id"
    assert clean_doc.semantic_identifier == "sem-id"
    assert clean_doc.title == "title"
    assert clean_doc.parent_hierarchy_raw_node_id == "parent-id"

    # Section contents.
    first_section = clean_doc.sections[0]
    assert first_section.link == "link"
    assert first_section.text == "text"

    # Flat and nested metadata: both keys and values are cleaned.
    assert clean_doc.metadata == {"key": "value", "listkey": ["a", "b"]}
    assert clean_doc.doc_metadata == {
        "json": {"inner": "val", "arr": ["x", {"deep": "y"}]}
    }

    # Owner information.
    clean_primary_owners = clean_doc.primary_owners
    assert clean_primary_owners is not None
    assert clean_primary_owners[0].display_name == "Alice"
    assert clean_primary_owners[0].email == "a@x.com"

    clean_secondary_owners = clean_doc.secondary_owners
    assert clean_secondary_owners is not None
    assert clean_secondary_owners[0].first_name == "Bob"
    assert clean_secondary_owners[0].last_name == "Smith"

    # External access sets.
    clean_access = clean_doc.external_access
    assert clean_access is not None
    assert clean_access.external_user_emails == {"user@example.com"}
    assert clean_access.external_user_group_ids == {"group-1"}

    # Ensure original document is not mutated
    assert dirty_doc.id == "doc\x00-id"
    assert dirty_doc.metadata == {"ke\x00y": "va\x00lue", "list\x00key": ["a\x00", "b"]}
| 63 | + |
| 64 | + |
def test_sanitize_hierarchy_node_for_postgres_removes_nul_bytes() -> None:
    """Check that sanitize_hierarchy_node_for_postgres strips NUL characters.

    A HierarchyNode is built with a NUL (0x00) character inside its ids,
    display name, link and external-access sets; the sanitized node must
    expose the same values with those characters removed.
    """
    access = ExternalAccess(
        external_user_emails={"a\x00@example.com"},
        external_user_group_ids={"g\x00-1"},
        is_public=True,
    )
    dirty_node = HierarchyNode(
        raw_node_id="raw\x00-id",
        raw_parent_id="paren\x00t-id",
        display_name="fol\x00der",
        link="https://exa\x00mple.com",
        node_type=HierarchyNodeType.FOLDER,
        external_access=access,
    )

    clean_node = sanitize_hierarchy_node_for_postgres(dirty_node)

    # Plain string fields.
    assert clean_node.raw_node_id == "raw-id"
    assert clean_node.raw_parent_id == "parent-id"
    assert clean_node.display_name == "folder"
    assert clean_node.link == "https://example.com"

    # External access sets.
    clean_access = clean_node.external_access
    assert clean_access is not None
    assert clean_access.external_user_emails == {"a@example.com"}
    assert clean_access.external_user_group_ids == {"g-1"}
| 88 | + |
| 89 | + |
def test_index_doc_batch_prepare_sanitizes_before_db_ops(monkeypatch: object) -> None:
    """Check that index_doc_batch_prepare hands sanitized documents to the DB layer.

    The DB-facing helpers referenced from ``indexing_pipeline`` are replaced
    with in-memory stubs that record what they receive. A document containing
    NUL (0x00) characters is then pushed through ``index_doc_batch_prepare``;
    both the returned context and the recorded stub arguments must contain
    the cleaned values.
    """
    # Values recorded by the capturing stubs below.
    recorded: dict[str, object] = {}

    # Stub parameter names/kinds are kept as-is in case the pipeline passes
    # arguments by keyword.
    def _get_documents_by_ids(db_session: object, document_ids: list[str]) -> list:
        del db_session, document_ids
        return []

    def _capture_upsert_documents_in_db(**kwargs: object) -> None:
        recorded["upsert_documents"] = kwargs["documents"]

    def _capture_doc_cc_pair(*args: object) -> None:
        # The fourth positional argument is stored as the document id list.
        recorded["cc_pair_doc_ids"] = args[3]

    def _noop_link_hierarchy_nodes_to_documents(
        db_session: object,
        document_ids: list[str],
        source: DocumentSource,
        commit: bool,
    ) -> int:
        del db_session, document_ids, source, commit
        return 0

    replacements = (
        ("get_documents_by_ids", _get_documents_by_ids),
        ("_upsert_documents_in_db", _capture_upsert_documents_in_db),
        ("upsert_document_by_connector_credential_pair", _capture_doc_cc_pair),
        ("link_hierarchy_nodes_to_documents", _noop_link_hierarchy_nodes_to_documents),
    )
    for attribute_name, stub in replacements:
        monkeypatch.setattr(indexing_pipeline, attribute_name, stub)

    dirty_doc = Document(
        id="doc\x00id",
        source=DocumentSource.FILE,
        semantic_identifier="sem\x00id",
        sections=[TextSection(text="content", link="li\x00nk")],
        metadata={"ke\x00y": "va\x00lue"},
    )

    context = indexing_pipeline.index_doc_batch_prepare(
        documents=[dirty_doc],
        index_attempt_metadata=IndexAttemptMetadata(connector_id=1, credential_id=2),
        db_session=object(),  # type: ignore[arg-type]
        ignore_time_skip=True,
    )

    # The returned context exposes the cleaned document.
    assert context is not None
    prepared_doc = context.updatable_docs[0]
    assert prepared_doc.id == "docid"
    assert prepared_doc.semantic_identifier == "semid"
    assert prepared_doc.metadata == {"key": "value"}

    # The stubs were given cleaned ids/documents.
    assert recorded["cc_pair_doc_ids"] == ["docid"]

    upserted = recorded["upsert_documents"]
    assert isinstance(upserted, list)
    assert upserted[0].id == "docid"
0 commit comments