1313from qdrant_client .models import PointStruct , Filter , FieldCondition , MatchValue , SparseVector , VectorParams , MultiVectorConfig , MultiVectorComparator , Distance
1414
1515
16- from . import loaders
16+ from . import json_loader
1717from .embeddings import create_embeddings
1818
1919
@@ -29,11 +29,12 @@ def get_chunks_metadata(chunks: list[Document], item: str) -> Iterator[str]:
2929 logging .warning ('%d out of %d chunks are missing "%s" in metadata; using empty string' , missing , len (chunks ), item )
3030
3131
32- def point_exists (qdrant : Qdrant , collection_name : str , path : str , chunk_hash :str ) -> bool :
32+ def point_exists (qdrant : Qdrant , collection_name : str , loader_id : str , path : str , content_hash :str ) -> bool :
3333 filter = Filter (
3434 must = [
35+ FieldCondition (key = "loader_id" , match = MatchValue (value = loader_id )),
3536 FieldCondition (key = "path" , match = MatchValue (value = path )),
36- FieldCondition (key = "content_hash" , match = MatchValue (value = chunk_hash )),
37+ FieldCondition (key = "content_hash" , match = MatchValue (value = content_hash )),
3738 ]
3839 )
3940 result , _ = qdrant .client .scroll (
@@ -96,27 +97,23 @@ def payload(sample: dict[str, Any]) -> dict[str, str]:
9697 return {
9798 "content" : sample ["page_content" ],
9899 "path" : sample ["metadata" ]["source" ],
99- "content_hash" : sample ["chunk_hash" ],
100+ "content_hash" : sample ["metadata" ]["content_hash" ],
100101 "title" : sample ["metadata" ].get ("title" ,"" ),
101102 "uri" : sample ["metadata" ].get ("uri" ,"" ),
102103 "loader_id" : sample ["metadata" ]["loader_id" ],
103104 "document_id" : sample ["metadata" ].get ("document_id" , "" )
104105 }
105106
106107def index (user_config : dict [str , Any ], opt_config : dict [str , Any ]) -> None :
107- # TODO: enable list of file paths in loader and adapt user_config
108- # Load the documents from pdf
109- # all_documents = loaders.sync_pdf_loader(user_config["file_path"])
110- # TODO: use ifdt loader to load pdf in json, then:
111108 logging .info ('Loading documents' )
112- all_documents = loaders .json_loader (user_config ['imported_documents_file_path' ])
109+ all_documents = json_loader .json_loader (user_config ['imported_documents_file_path' ])
113110
114111 # Split documents into chunks
115112 logging .info ('Splitting documents into chunks' )
116113 text_splitter = RecursiveCharacterTextSplitter (
117114 chunk_size = opt_config ["chunk_size" ], chunk_overlap = opt_config ["chunk_overlap" ]
118115 )
119- chunks = text_splitter .split_documents (all_documents )
116+ chunks = text_splitter .split_documents (all_documents )[: 2 ]
120117
121118 collection_name = user_config ["collection_name" ]
122119
@@ -126,7 +123,6 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
126123 opt_config = opt_config
127124 )
128125
129-
130126 chunks_content = [chunk .page_content for chunk in chunks ]
131127 if len (opt_config ["multi_search" ]) > 0 and opt_config ["query_mode" ] == "multi" :
132128 chunks_metadata = {}
@@ -140,8 +136,6 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
140136 else :
141137 raise TypeError (f"dense_vecs must be np.ndarray, got { type (dense_vecs )} " )
142138
143- # TODO: hash if you want to monitore changes in metadata
144- chunk_hash = [hashlib .md5 (chunk .page_content .encode ()).hexdigest () for chunk in chunks ]
145139 # Todo: handle different vector lengths for batch encoding when using sparse vectors
146140
147141 logging .info ('Creating embeddings...' )
@@ -158,40 +152,38 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None:
158152 if isinstance (embeddings , dict ) and "dense_vecs" in embeddings :
159153 if opt_config ["search_mode" ] == "dense" :
160154 chunks_with_embeddings = [
161- dict (chunk ) | {"dense_vec" : dense , "chunk_hash" : c_hash }
162- for chunk , dense , c_hash in zip (chunks , embeddings ["dense_vecs" ], chunk_hash )
155+ dict (chunk ) | {"dense_vec" : dense }
156+ for chunk , dense in zip (chunks , embeddings ["dense_vecs" ])
163157 ]
164158 if opt_config ["search_mode" ] == "dense_sparse" :
165159 chunks_with_embeddings = [
166160 dict (chunk )
167- | {"dense_vec" : dense , "lexical_weights" : sparse , "chunk_hash" : c_hash }
168- for chunk , dense , sparse , c_hash in zip (
161+ | {"dense_vec" : dense , "lexical_weights" : sparse }
162+ for chunk , dense , sparse in zip (
169163 chunks ,
170164 list (embeddings ["dense_vecs" ]),
171165 list (embeddings ["lexical_weights" ]),
172- chunk_hash ,
173166 )
174167 ]
175168 if opt_config ["search_mode" ] == "dense_sparse_colbert" :
176169 chunks_with_embeddings = [
177170 dict (chunk )
178- | {"dense_vec" : dense , "lexical_weights" : sparse , "colbert_vecs" : colbert , "chunk_hash" : c_hash }
179- for chunk , dense , sparse , colbert , c_hash in zip (
171+ | {"dense_vec" : dense , "lexical_weights" : sparse , "colbert_vecs" : colbert }
172+ for chunk , dense , sparse , colbert in zip (
180173 chunks ,
181174 list (embeddings ["dense_vecs" ]),
182175 list (embeddings ["lexical_weights" ]),
183176 list (embeddings ['colbert_vecs' ]),
184- chunk_hash ,
185177 )
186178 ]
187179 else :
188180 chunks_with_embeddings = [
189- dict (chunk ) | {"dense_vec" : dense , "chunk_hash" : c_hash }
190- for chunk , dense , c_hash in zip (chunks , embeddings , chunk_hash )
181+ dict (chunk ) | {"dense_vec" : dense }
182+ for chunk , dense in zip (chunks , embeddings )
191183 ]
192184
193185 for sample in chunks_with_embeddings :
194- if not point_exists (qdrant , collection_name , sample ['metadata' ]['source' ], sample ['chunk_hash' ]):
186+ if not point_exists (qdrant , collection_name , sample ['metadata' ]['loader_id' ], sample ['metadata' ]['source' ], sample ['metadata' ]['content_hash' ]):
195187 if opt_config ["search_mode" ] == "dense_sparse" :
196188 insert_dense_sparse (qdrant , collection_name , sample )
197189 elif opt_config ["search_mode" ] == "dense_sparse_colbert" :
0 commit comments