Skip to content

Commit 768d225

Browse files
fabnemEPFL, Copilot, Copilot
authored
Fix #288 (#307)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: Copilot <copilot@github.com>
1 parent 1ac412e commit 768d225

2 files changed

Lines changed: 262 additions & 56 deletions

File tree

src/mmore/run_index_api.py

Lines changed: 81 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import shutil
55
import tempfile
66
from pathlib import Path as FilePath
7-
from typing import List, cast
7+
from typing import List
88

99
import uvicorn
1010
from fastapi import APIRouter, FastAPI, File, Form, HTTPException, Path, UploadFile
@@ -23,6 +23,7 @@
2323

2424
from .process.processors import register_all_processors
2525
from .rag.retriever import RetrieverConfig
26+
from .type import MultimodalSample
2627
from .utils import get_indexer, load_config, process_files_default
2728

2829
UPLOAD_DIR: str = "./uploads"
@@ -34,6 +35,18 @@
3435
logger = logging.getLogger(__name__)
3536

3637

38+
def _apply_uploaded_file_metadata(
    documents: List[MultimodalSample], file_id: str, filename: str
) -> None:
    """Bind processed chunks to the API file ID and persist the original filename.

    Every document is mutated in place:

    * ``document_id`` is set to ``file_id``;
    * ``id`` becomes ``"{file_id}+{chunk}"`` when the previous ``id`` ended in
      a non-empty ``+chunk`` suffix, and plain ``file_id`` otherwise;
    * ``metadata.extra["filename"]`` is set to ``filename``.

    Args:
        documents: Processed samples to relabel (modified in place).
        file_id: Client-supplied identifier that replaces the generated one.
        filename: Original name of the uploaded file.
    """
    for doc in documents:
        # Split exactly once, from the right. An unbounded ``rsplit("+")``
        # behaves like ``split("+")``, so index 1 would pick the *second*
        # segment and corrupt the chunk suffix whenever the generated
        # document ID itself contains "+".
        # NOTE(review): assumes the chunk marker is whatever follows the
        # last "+" in ``doc.id`` -- confirm against the chunker's ID scheme.
        _, separator, chunk_id = doc.id.rpartition("+")

        doc.document_id = file_id
        # No "+" at all, or an empty suffix, falls back to the bare file ID.
        doc.id = f"{file_id}+{chunk_id}" if separator and chunk_id else file_id

        doc.metadata.extra["filename"] = filename
48+
49+
3750
def make_router(config_path: str) -> APIRouter:
3851
router = APIRouter()
3952

@@ -109,10 +122,13 @@ async def upload_file(
109122
os.makedirs(os.path.dirname(file_storage_path), exist_ok=True)
110123
shutil.copy2(temp_file_path, file_storage_path)
111124

112-
for doc in documents:
113-
defDocId = doc.document_id
114-
doc.document_id = fileId
115-
doc.id = doc.id.replace(defDocId, fileId)
125+
# Process and index the file
126+
file_extension = FilePath(file.filename).suffix.lower()
127+
documents = process_files_default(
128+
temp_dir, COLLECTION_NAME, [file_extension]
129+
)
130+
131+
_apply_uploaded_file_metadata(documents, fileId, file.filename)
116132

117133
# Get indexer and index the document
118134
try:
@@ -148,7 +164,12 @@ async def upload_files(
148164
Upload multiple files with custom IDs and index them.
149165
"""
150166
try:
151-
listIds = listIds[0].split(",")
167+
listIds = [
168+
file_id.strip()
169+
for ids in listIds
170+
for file_id in ids.split(",")
171+
if file_id.strip()
172+
]
152173
# Check if IDs and files match in number
153174
if len(listIds) != len(files):
154175
raise HTTPException(
@@ -159,13 +180,15 @@ async def upload_files(
159180
with tempfile.TemporaryDirectory() as temp_dir:
160181
logging.info(f"Starting to process {len(files)} files with custom IDs")
161182

162-
temp_paths: List[FilePath] = []
163-
for file, file_id in zip(files, listIds):
183+
uploaded_files: list[dict[str, str]] = []
184+
file_info_by_temp_path = {}
185+
for index, (file, file_id) in enumerate(zip(files, listIds)):
164186
if file.filename is None:
165187
raise HTTPException(
166188
status_code=422,
167189
detail=f"File {file_id} does not have a filename",
168190
)
191+
filename = file.filename
169192

170193
# Check if file with this ID already exists
171194
file_storage_path = FilePath(UPLOAD_DIR) / file_id
@@ -176,10 +199,19 @@ async def upload_files(
176199
)
177200

178201
# Save to temp directory
179-
file_name = FilePath(temp_dir) / f"{file_id}_{file.filename}"
180-
with file_name.open("wb") as buffer:
202+
temp_file_path = (
203+
FilePath(temp_dir) / f"{index}{FilePath(filename).suffix}"
204+
)
205+
file_info = {
206+
"fileId": file_id,
207+
"filename": filename,
208+
"temp_path": str(temp_file_path.resolve()),
209+
}
210+
uploaded_files.append(file_info)
211+
file_info_by_temp_path[file_info["temp_path"]] = file_info
212+
213+
with temp_file_path.open("wb") as buffer:
181214
shutil.copyfileobj(file.file, buffer)
182-
temp_paths.append(file_name)
183215

184216
# Close the file
185217
await file.close()
@@ -188,7 +220,8 @@ async def upload_files(
188220

189221
# Process the documents
190222
file_extensions = [
191-
FilePath(cast(str, file.filename)).suffix.lower() for file in files
223+
FilePath(file_info["temp_path"]).suffix.lower()
224+
for file_info in uploaded_files
192225
]
193226
try:
194227
documents = process_files_default(
@@ -206,16 +239,34 @@ async def upload_files(
206239
) from e
207240

208241
# Save permanent copies
209-
for temp_path, file_id in zip(temp_paths, listIds):
210-
file_storage_path = FilePath(UPLOAD_DIR) / file_id
211-
shutil.copy2(temp_path, file_storage_path)
242+
for file_info in uploaded_files:
243+
file_storage_path = FilePath(UPLOAD_DIR) / file_info["fileId"]
244+
shutil.copy2(file_info["temp_path"], file_storage_path)
212245

213246
# Change the IDs to match the ones from the client
214247
modified_documents = []
215-
for doc, docId in zip(documents, listIds):
216-
defDocId = doc.document_id
217-
doc.document_id = docId
218-
doc.id = doc.id.replace(defDocId, docId)
248+
text_by_file_id = {}
249+
chunks_by_file_id = {
250+
file_info["fileId"]: 0 for file_info in uploaded_files
251+
}
252+
for doc_index, doc in enumerate(documents):
253+
doc_temp_path = str(FilePath(doc.metadata.file_path).resolve())
254+
file_info = file_info_by_temp_path.get(doc_temp_path)
255+
if file_info is None:
256+
if doc_index >= len(uploaded_files):
257+
raise HTTPException(
258+
status_code=500,
259+
detail=(
260+
"Could not match processed document "
261+
f"{doc.metadata.file_path} to an uploaded file"
262+
),
263+
)
264+
# Fallback for processors/tests that return file paths outside temp_dir.
265+
file_info = uploaded_files[doc_index]
266+
doc_id = file_info["fileId"]
267+
_apply_uploaded_file_metadata([doc], doc_id, file_info["filename"])
268+
text_by_file_id.setdefault(doc_id, doc.text)
269+
chunks_by_file_id[doc_id] += 1
219270
modified_documents.append(doc)
220271

221272
logging.info("Indexing the files")
@@ -232,10 +283,16 @@ async def upload_files(
232283

233284
return {
234285
"status": "success",
235-
"message": f"Successfully processed and indexed {len(modified_documents)} documents",
286+
"message": f"Successfully processed and indexed {len(uploaded_files)} files",
236287
"documents": [
237-
{"fileId": doc.document_id, "text": doc.text[:50] + "..."}
238-
for doc in modified_documents
288+
{
289+
"fileId": file_info["fileId"],
290+
"filename": file_info["filename"],
291+
"text": text_by_file_id.get(file_info["fileId"], "")[:50]
292+
+ "...",
293+
"chunks": chunks_by_file_id[file_info["fileId"]],
294+
}
295+
for file_info in uploaded_files
239296
],
240297
}
241298

@@ -284,9 +341,8 @@ async def update_file(
284341
temp_dir, COLLECTION_NAME, [file_extension]
285342
)
286343

287-
# Set the custom ID
288-
for doc in documents:
289-
doc.id = fileId
344+
# Set the custom ID and preserve the original upload filename
345+
_apply_uploaded_file_metadata(documents, fileId, file.filename)
290346

291347
# Get indexer and reindex the document
292348
try:

0 commit comments

Comments
 (0)