@@ -111,6 +111,11 @@ def add_documents(
111111 ) -> List [str ]:
112112 """
113113 Add documents to the collection.
114+ Validates metadata against rag.txt requirements:
115+ - chunk_id
116+ - document_id
117+ - source
118+ - position
114119
115120 Args:
116121 texts: List of text content
@@ -123,10 +128,18 @@ def add_documents(
123128 if not texts :
124129 logger .warning ("No texts provided to add" )
125130 return []
131+
132+ # Validate metadata fields
133+ required_fields = {"chunk_id" , "document_id" , "source" , "position" }
134+ for idx , meta in enumerate (metadatas ):
135+ missing = required_fields - meta .keys ()
136+ if missing :
137+ logger .error (f"Metadata at index { idx } missing required fields: { missing } " )
138+ raise ValueError (f"Metadata missing required fields: { missing } " )
126139
127140 # Generate IDs if not provided
128141 if ids is None :
129- ids = [f"chunk_{ uuid .uuid4 ().hex [:12 ]} " for _ in texts ]
142+ ids = [meta . get ( "chunk_id" , f"chunk_{ uuid .uuid4 ().hex [:12 ]} " ) for meta in metadatas ]
130143
131144 # Generate embeddings
132145 embeddings = self .embedding_service .embed_texts (texts )
@@ -150,48 +163,54 @@ def add_documents(
150163
def query(
    self,
    query_text: Optional[str] = None,
    query_texts: Optional[List[str]] = None,
    n_results: int = 3,
    where: Optional[Dict[str, Any]] = None,
    include: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Query the collection for documents similar to one or more texts.

    Args:
        query_text: Single text to search for (legacy support). Only used
            when ``query_texts`` is None or empty.
        query_texts: List of texts to search for (preferred). Takes
            precedence over ``query_text`` when both are given.
        n_results: Number of results to request, forwarded to the
            collection query.
        where: Optional filter conditions, forwarded to the collection query.
        include: Optional list of fields to include. When None or empty,
            ``["documents", "metadatas", "distances"]`` is used.

    Returns:
        On success, the object returned by ``self.collection.query`` is
        passed through unchanged; it is NOT flattened here, so the caller
        is responsible for unpacking the per-query nesting.

        When no non-blank text is supplied, when the embedding service
        returns a falsy value, or when the collection query raises, a dict
        with the keys "documents", "metadatas", "distances" and "ids", each
        mapped to a flat empty list, is returned instead. Callers must
        therefore handle both shapes.

    Note:
        Input is rejected only when *every* text is empty or whitespace;
        in a mixed list the blank entries are embedded along with the rest.
        Exceptions raised by the embedding service are not caught here.
    """
    # Built once per call; at most one branch below ever returns it.
    empty_result: Dict[str, Any] = {
        "documents": [],
        "metadatas": [],
        "distances": [],
        "ids": [],
    }

    # Support both query_text and query_texts (the list form wins).
    if query_texts:
        texts = query_texts
    elif query_text:
        texts = [query_text]
    else:
        texts = []

    # Nothing to search for: no texts at all, or only blank ones.
    if not any(t and t.strip() for t in texts):
        return empty_result

    # Generate query embeddings
    query_embeddings = self.embedding_service.embed_texts(texts)

    if not query_embeddings:
        logger.error("Failed to generate query embeddings")
        return empty_result

    try:
        # Return the raw result structure; the caller flattens if needed.
        return self.collection.query(
            query_embeddings=query_embeddings,
            n_results=n_results,
            where=where,
            include=include if include else ["documents", "metadatas", "distances"],
        )
    except Exception as e:
        # logger.exception keeps the traceback that a plain error() call drops.
        logger.exception("Error querying ChromaDB: %s", e)
        return empty_result
195214
196215 def delete_by_document_id (self , document_id : str ) -> bool :
197216 """
@@ -240,9 +259,12 @@ def get_all_documents(self) -> List[Dict[str, Any]]:
240259 for metadata in results ["metadatas" ]:
241260 doc_id = metadata .get ("document_id" )
242261 if doc_id and doc_id not in documents :
262+ # Use 'source' field (set during ingestion) as filename
263+ source = metadata .get ("source" , "Unknown" )
243264 documents [doc_id ] = {
244265 "id" : doc_id ,
245- "filename" : metadata .get ("filename" , "Unknown" ),
266+ "filename" : source ,
267+ "source" : source ,
246268 "file_type" : metadata .get ("file_type" , "Unknown" ),
247269 "file_size" : metadata .get ("file_size" , 0 ),
248270 "chunk_count" : 0 ,
@@ -300,3 +322,6 @@ def get_chroma_service() -> ChromaService:
300322 _chroma_service = ChromaService ()
301323 return _chroma_service
302324
def get_chroma_client() -> ChromaService:
    """Return the ChromaService via its legacy accessor name.

    Kept as an alias: it delegates to get_chroma_service() and hands back
    whatever that call returns, unchanged.
    """
    service = get_chroma_service()
    return service
0 commit comments