|
1 | 1 | """ |
2 | | -Integration test for document vector operations |
| 2 | +Integration tests for document vector operations. |
3 | 3 |
|
4 | | -This test demonstrates the complete workflow from ES retrieval to clustering. |
5 | | -Note: This requires a running Elasticsearch instance. |
| 4 | +This module validates the embedding and clustering workflow using deterministic |
| 5 | +fixtures so the clustering assertions stay stable across environments. |
6 | 6 | """ |
7 | 7 | import os |
8 | 8 | import sys |
|
80 | 80 |
|
81 | 81 |
|
82 | 82 | class TestDocumentVectorIntegration: |
83 | | - """Integration tests for document vector operations""" |
84 | | - |
| 83 | + """Integration tests for document vector operations.""" |
| 84 | + |
85 | 85 | def test_complete_workflow(self): |
86 | | - """Test complete workflow: embedding calculation -> clustering""" |
87 | | - # Simulate document chunks with embeddings |
| 86 | + """Test complete workflow: embedding calculation -> clustering.""" |
88 | 87 | chunks_1 = [ |
89 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 1 chunk 1'}, |
90 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 1 chunk 2'}, |
91 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 1 chunk 3'} |
| 88 | + {"embedding": [1.0, 0.0], "content": "Document one chunk A"}, |
| 89 | + {"embedding": [0.9, 0.1], "content": "Document one chunk B"}, |
| 90 | + {"embedding": [0.95, 0.05], "content": "Document one chunk C"}, |
92 | 91 | ] |
93 | | - |
94 | 92 | chunks_2 = [ |
95 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 2 chunk 1'}, |
96 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 2 chunk 2'} |
| 93 | + {"embedding": [0.0, 1.0], "content": "Document two chunk A"}, |
| 94 | + {"embedding": [0.1, 0.9], "content": "Document two chunk B"}, |
97 | 95 | ] |
98 | | - |
99 | 96 | chunks_3 = [ |
100 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 3 chunk 1'}, |
101 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 3 chunk 2'}, |
102 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 3 chunk 3'}, |
103 | | - {'embedding': np.random.rand(128).tolist(), 'content': 'Content for doc 3 chunk 4'} |
| 97 | + {"embedding": [0.85, 0.15], "content": "Document three chunk A"}, |
| 98 | + {"embedding": [0.8, 0.2], "content": "Document three chunk B"}, |
| 99 | + {"embedding": [0.88, 0.12], "content": "Document three chunk C"}, |
| 100 | + {"embedding": [0.83, 0.17], "content": "Document three chunk D"}, |
104 | 101 | ] |
105 | | - |
106 | | - # Calculate document embeddings |
| 102 | + |
107 | 103 | doc_embedding_1 = calculate_document_embedding(chunks_1, use_weighted=True) |
108 | 104 | doc_embedding_2 = calculate_document_embedding(chunks_2, use_weighted=True) |
109 | 105 | doc_embedding_3 = calculate_document_embedding(chunks_3, use_weighted=True) |
110 | | - |
| 106 | + |
111 | 107 | assert doc_embedding_1 is not None |
112 | 108 | assert doc_embedding_2 is not None |
113 | 109 | assert doc_embedding_3 is not None |
114 | | - |
115 | | - # Create document embeddings dictionary |
| 110 | + |
116 | 111 | doc_embeddings = { |
117 | | - 'doc_001': doc_embedding_1, |
118 | | - 'doc_002': doc_embedding_2, |
119 | | - 'doc_003': doc_embedding_3 |
| 112 | + "doc_001": doc_embedding_1, |
| 113 | + "doc_002": doc_embedding_2, |
| 114 | + "doc_003": doc_embedding_3, |
120 | 115 | } |
121 | | - |
122 | | - # Determine optimal K |
| 116 | + |
123 | 117 | embeddings_array = np.array([doc_embedding_1, doc_embedding_2, doc_embedding_3]) |
124 | 118 | optimal_k = auto_determine_k(embeddings_array, min_k=2, max_k=3) |
125 | | - |
126 | | - assert 2 <= optimal_k <= 3 |
127 | | - |
128 | | - # Perform clustering |
| 119 | + |
| 120 | + assert optimal_k == 2 |
| 121 | + |
129 | 122 | clusters = kmeans_cluster_documents(doc_embeddings, k=optimal_k) |
130 | | - |
| 123 | + |
131 | 124 | assert len(clusters) == optimal_k |
132 | 125 | assert sum(len(docs) for docs in clusters.values()) == 3 |
133 | | - |
| 126 | + assert sorted(len(docs) for docs in clusters.values()) == [1, 2] |
| 127 | + |
| 128 | + cluster_sets = [set(docs) for docs in clusters.values()] |
| 129 | + assert {"doc_001", "doc_003"} in cluster_sets |
| 130 | + assert {"doc_002"} in cluster_sets |
| 131 | + |
134 | 132 | def test_large_dataset_clustering(self): |
135 | | - """Test clustering with larger simulated dataset""" |
136 | | - # Create simulated document embeddings |
137 | | - n_docs = 50 |
138 | | - doc_embeddings = { |
139 | | - f'doc_{i:03d}': np.random.rand(128) for i in range(n_docs) |
| 133 | + """Test clustering with a larger, deterministic simulated dataset.""" |
| 134 | + cluster_a = { |
| 135 | + f"doc_a_{i:03d}": np.array([1.0 + i * 0.002, 1.0 + i * 0.001, 0.2]) |
| 136 | + for i in range(20) |
| 137 | + } |
| 138 | + cluster_b = { |
| 139 | + f"doc_b_{i:03d}": np.array([5.0 + i * 0.002, 5.0 + i * 0.001, 0.4]) |
| 140 | + for i in range(15) |
| 141 | + } |
| 142 | + cluster_c = { |
| 143 | + f"doc_c_{i:03d}": np.array([9.0 + i * 0.002, 1.0 + i * 0.001, 0.6]) |
| 144 | + for i in range(15) |
140 | 145 | } |
141 | | - |
142 | | - # Auto-determine K |
| 146 | + doc_embeddings = {**cluster_a, **cluster_b, **cluster_c} |
| 147 | + n_docs = len(doc_embeddings) |
| 148 | + |
143 | 149 | embeddings_array = np.array(list(doc_embeddings.values())) |
144 | | - optimal_k = auto_determine_k(embeddings_array, min_k=3, max_k=15) |
145 | | - |
146 | | - assert 3 <= optimal_k <= 15 |
147 | | - |
148 | | - # Cluster documents |
149 | | - clusters = kmeans_cluster_documents(doc_embeddings, k=optimal_k) |
150 | | - |
151 | | - assert len(clusters) == optimal_k |
| 150 | + optimal_k = auto_determine_k(embeddings_array, min_k=3, max_k=6) |
| 151 | + |
| 152 | + assert 3 <= optimal_k <= 6 |
| 153 | + |
| 154 | + clusters = kmeans_cluster_documents(doc_embeddings, k=3) |
| 155 | + |
| 156 | + assert len(clusters) == 3 |
152 | 157 | assert sum(len(docs) for docs in clusters.values()) == n_docs |
153 | | - |
154 | | - # Verify cluster sizes are reasonable |
155 | | - cluster_sizes = [len(docs) for docs in clusters.values()] |
156 | | - assert min(cluster_sizes) >= 1 |
157 | | - # Allow for some imbalance in clustering results (realistic for random data) |
158 | | - assert max(cluster_sizes) <= n_docs * 0.7 # No single cluster dominates too much |
| 158 | + |
| 159 | + cluster_sizes = sorted(len(docs) for docs in clusters.values()) |
| 160 | + assert cluster_sizes == [15, 15, 20] |
159 | 161 |
|
160 | 162 |
|
161 | 163 | if __name__ == '__main__': |
|
0 commit comments