44import shutil
55import tempfile
66from pathlib import Path as FilePath
7- from typing import List , cast
7+ from typing import List
88
99import uvicorn
1010from fastapi import APIRouter , FastAPI , File , Form , HTTPException , Path , UploadFile
2323
2424from .process .processors import register_all_processors
2525from .rag .retriever import RetrieverConfig
26+ from .type import MultimodalSample
2627from .utils import get_indexer , load_config , process_files_default
2728
2829UPLOAD_DIR : str = "./uploads"
3435logger = logging .getLogger (__name__ )
3536
3637
def _apply_uploaded_file_metadata(
    documents: "List[MultimodalSample]", file_id: str, filename: str
) -> None:
    """Bind processed chunks to the API file ID and persist the original filename.

    Each document is modified in place:

    * ``document_id`` is set to ``file_id``.
    * ``id`` becomes ``"<file_id>+<chunk>"`` where ``<chunk>`` is whatever
      follows the *last* ``"+"`` in the existing ``id``; if the existing
      ``id`` has no ``"+"`` (or nothing after it), ``id`` is just ``file_id``.
    * ``metadata.extra["filename"]`` is set to ``filename``.

    Args:
        documents: Processed chunks to relabel (mutated in place).
        file_id: Client-supplied identifier that replaces the generated one.
        filename: Original name of the uploaded file.
    """
    for doc in documents:
        # Split once from the right: only the trailing segment is the chunk
        # suffix, so a "+" inside the generated document ID must not be
        # mistaken for the separator.
        _, separator, chunk_id = doc.id.rpartition("+")

        doc.document_id = file_id
        doc.id = f"{file_id}+{chunk_id}" if separator and chunk_id else file_id

        doc.metadata.extra["filename"] = filename
49+
3750def make_router (config_path : str ) -> APIRouter :
3851 router = APIRouter ()
3952
@@ -109,10 +122,13 @@ async def upload_file(
109122 os .makedirs (os .path .dirname (file_storage_path ), exist_ok = True )
110123 shutil .copy2 (temp_file_path , file_storage_path )
111124
112- for doc in documents :
113- defDocId = doc .document_id
114- doc .document_id = fileId
115- doc .id = doc .id .replace (defDocId , fileId )
125+ # Process and index the file
126+ file_extension = FilePath (file .filename ).suffix .lower ()
127+ documents = process_files_default (
128+ temp_dir , COLLECTION_NAME , [file_extension ]
129+ )
130+
131+ _apply_uploaded_file_metadata (documents , fileId , file .filename )
116132
117133 # Get indexer and index the document
118134 try :
@@ -148,7 +164,12 @@ async def upload_files(
148164 Upload multiple files with custom IDs and index them.
149165 """
150166 try :
151- listIds = listIds [0 ].split ("," )
167+ listIds = [
168+ file_id .strip ()
169+ for ids in listIds
170+ for file_id in ids .split ("," )
171+ if file_id .strip ()
172+ ]
152173 # Check if IDs and files match in number
153174 if len (listIds ) != len (files ):
154175 raise HTTPException (
@@ -159,13 +180,15 @@ async def upload_files(
159180 with tempfile .TemporaryDirectory () as temp_dir :
160181 logging .info (f"Starting to process { len (files )} files with custom IDs" )
161182
162- temp_paths : List [FilePath ] = []
163- for file , file_id in zip (files , listIds ):
183+ uploaded_files : list [dict [str , str ]] = []
184+ file_info_by_temp_path = {}
185+ for index , (file , file_id ) in enumerate (zip (files , listIds )):
164186 if file .filename is None :
165187 raise HTTPException (
166188 status_code = 422 ,
167189 detail = f"File { file_id } does not have a filename" ,
168190 )
191+ filename = file .filename
169192
170193 # Check if file with this ID already exists
171194 file_storage_path = FilePath (UPLOAD_DIR ) / file_id
@@ -176,10 +199,19 @@ async def upload_files(
176199 )
177200
178201 # Save to temp directory
179- file_name = FilePath (temp_dir ) / f"{ file_id } _{ file .filename } "
180- with file_name .open ("wb" ) as buffer :
202+ temp_file_path = (
203+ FilePath (temp_dir ) / f"{ index } { FilePath (filename ).suffix } "
204+ )
205+ file_info = {
206+ "fileId" : file_id ,
207+ "filename" : filename ,
208+ "temp_path" : str (temp_file_path .resolve ()),
209+ }
210+ uploaded_files .append (file_info )
211+ file_info_by_temp_path [file_info ["temp_path" ]] = file_info
212+
213+ with temp_file_path .open ("wb" ) as buffer :
181214 shutil .copyfileobj (file .file , buffer )
182- temp_paths .append (file_name )
183215
184216 # Close the file
185217 await file .close ()
@@ -188,7 +220,8 @@ async def upload_files(
188220
189221 # Process the documents
190222 file_extensions = [
191- FilePath (cast (str , file .filename )).suffix .lower () for file in files
223+ FilePath (file_info ["temp_path" ]).suffix .lower ()
224+ for file_info in uploaded_files
192225 ]
193226 try :
194227 documents = process_files_default (
@@ -206,16 +239,34 @@ async def upload_files(
206239 ) from e
207240
208241 # Save permanent copies
209- for temp_path , file_id in zip ( temp_paths , listIds ) :
210- file_storage_path = FilePath (UPLOAD_DIR ) / file_id
211- shutil .copy2 (temp_path , file_storage_path )
242+ for file_info in uploaded_files :
243+ file_storage_path = FilePath (UPLOAD_DIR ) / file_info [ "fileId" ]
244+ shutil .copy2 (file_info [ " temp_path" ] , file_storage_path )
212245
213246 # Change the IDs to match the ones from the client
214247 modified_documents = []
215- for doc , docId in zip (documents , listIds ):
216- defDocId = doc .document_id
217- doc .document_id = docId
218- doc .id = doc .id .replace (defDocId , docId )
248+ text_by_file_id = {}
249+ chunks_by_file_id = {
250+ file_info ["fileId" ]: 0 for file_info in uploaded_files
251+ }
252+ for doc_index , doc in enumerate (documents ):
253+ doc_temp_path = str (FilePath (doc .metadata .file_path ).resolve ())
254+ file_info = file_info_by_temp_path .get (doc_temp_path )
255+ if file_info is None :
256+ if doc_index >= len (uploaded_files ):
257+ raise HTTPException (
258+ status_code = 500 ,
259+ detail = (
260+ "Could not match processed document "
261+ f"{ doc .metadata .file_path } to an uploaded file"
262+ ),
263+ )
264+ # Fallback for processors/tests that return file paths outside temp_dir.
265+ file_info = uploaded_files [doc_index ]
266+ doc_id = file_info ["fileId" ]
267+ _apply_uploaded_file_metadata ([doc ], doc_id , file_info ["filename" ])
268+ text_by_file_id .setdefault (doc_id , doc .text )
269+ chunks_by_file_id [doc_id ] += 1
219270 modified_documents .append (doc )
220271
221272 logging .info ("Indexing the files" )
@@ -232,10 +283,16 @@ async def upload_files(
232283
233284 return {
234285 "status" : "success" ,
235- "message" : f"Successfully processed and indexed { len (modified_documents )} documents " ,
286+ "message" : f"Successfully processed and indexed { len (uploaded_files )} files " ,
236287 "documents" : [
237- {"fileId" : doc .document_id , "text" : doc .text [:50 ] + "..." }
238- for doc in modified_documents
288+ {
289+ "fileId" : file_info ["fileId" ],
290+ "filename" : file_info ["filename" ],
291+ "text" : text_by_file_id .get (file_info ["fileId" ], "" )[:50 ]
292+ + "..." ,
293+ "chunks" : chunks_by_file_id [file_info ["fileId" ]],
294+ }
295+ for file_info in uploaded_files
239296 ],
240297 }
241298
@@ -284,9 +341,8 @@ async def update_file(
284341 temp_dir , COLLECTION_NAME , [file_extension ]
285342 )
286343
287- # Set the custom ID
288- for doc in documents :
289- doc .id = fileId
344+ # Set the custom ID and preserve the original upload filename
345+ _apply_uploaded_file_metadata (documents , fileId , file .filename )
290346
291347 # Get indexer and reindex the document
292348 try :
0 commit comments