Skip to content

Commit d7e83b2

Browse files
committed
should work?
1 parent 293022e commit d7e83b2

2 files changed

Lines changed: 139 additions & 31 deletions

File tree

src/mmore/run_index_api.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@
3434
logger = logging.getLogger(__name__)
3535

3636

37+
def _apply_uploaded_file_metadata(documents, file_id: str, filename: str) -> None:
38+
"""Bind processed chunks to the API file ID and persist the original filename."""
39+
for doc in documents:
40+
default_doc_id = doc.document_id
41+
doc.document_id = file_id
42+
if default_doc_id and doc.id.startswith(default_doc_id):
43+
doc.id = f"{file_id}{doc.id[len(default_doc_id) :]}"
44+
else:
45+
doc.id = file_id
46+
47+
doc.metadata.extra["filename"] = filename
48+
49+
3750
def make_router(config_path: str) -> APIRouter:
3851
router = APIRouter()
3952

@@ -97,10 +110,7 @@ async def upload_file(
97110
temp_dir, COLLECTION_NAME, [file_extension]
98111
)
99112

100-
for doc in documents:
101-
defDocId = doc.document_id
102-
doc.document_id = fileId
103-
doc.id = doc.id.replace(defDocId, fileId)
113+
_apply_uploaded_file_metadata(documents, fileId, file.filename)
104114

105115
# Get indexer and index the document
106116
try:
@@ -257,9 +267,8 @@ async def update_file(
257267
temp_dir, COLLECTION_NAME, [file_extension]
258268
)
259269

260-
# Set the custom ID
261-
for doc in documents:
262-
doc.id = fileId
270+
# Set the custom ID and preserve the original upload filename
271+
_apply_uploaded_file_metadata(documents, fileId, file.filename)
263272

264273
# Get indexer and reindex the document
265274
try:

tests/test_live_retriever_api.py

Lines changed: 123 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,14 @@
1818

1919
from mmore.index.indexer import Indexer
2020
from mmore.rag.model import DenseModelConfig, SparseModelConfig
21-
from mmore.run_index_api import make_router as make_index_router
21+
from mmore.run_index_api import (
22+
_apply_uploaded_file_metadata,
23+
)
24+
from mmore.run_index_api import (
25+
make_router as make_index_router,
26+
)
2227
from mmore.run_retriever import make_router, save_results
23-
from mmore.type import MultimodalSample
28+
from mmore.type import DocumentMetadata, MultimodalSample
2429

2530
_COLLECTION = "my_docs"
2631

@@ -216,13 +221,16 @@ def test_save_results_writes_valid_json(tmp_path):
216221
docs = [
217222
Document(
218223
page_content="Paris is the capital.",
219-
metadata={
220-
"rank": 1,
221-
"similarity": 0.9,
222-
"id": "1",
223-
"page_numbers": [],
224-
"paragraph_numbers": [],
225-
},
224+
metadata=DocumentMetadata(
225+
file_path="paris.txt",
226+
extra={
227+
"rank": 1,
228+
"similarity": 0.9,
229+
"id": "1",
230+
"page_numbers": [],
231+
"paragraph_numbers": [],
232+
},
233+
).to_dict(),
226234
)
227235
]
228236
results = [docs]
@@ -246,25 +254,31 @@ def test_save_results_multiple_queries(tmp_path):
246254
[
247255
Document(
248256
page_content="doc A",
249-
metadata={
250-
"rank": 1,
251-
"similarity": 0.8,
252-
"id": "a",
253-
"page_numbers": [],
254-
"paragraph_numbers": [],
255-
},
257+
metadata=DocumentMetadata(
258+
file_path="doc-a.txt",
259+
extra={
260+
"rank": 1,
261+
"similarity": 0.8,
262+
"id": "a",
263+
"page_numbers": [],
264+
"paragraph_numbers": [],
265+
},
266+
).to_dict(),
256267
)
257268
],
258269
[
259270
Document(
260271
page_content="doc B",
261-
metadata={
262-
"rank": 1,
263-
"similarity": 0.7,
264-
"id": "b",
265-
"page_numbers": [],
266-
"paragraph_numbers": [],
267-
},
272+
metadata=DocumentMetadata(
273+
file_path="doc-b.txt",
274+
extra={
275+
"rank": 1,
276+
"similarity": 0.7,
277+
"id": "b",
278+
"page_numbers": [],
279+
"paragraph_numbers": [],
280+
},
281+
).to_dict(),
268282
)
269283
],
270284
]
@@ -294,10 +308,20 @@ def _fake_doc(file_path: str, document_id: str = "doc") -> MultimodalSample:
294308
document_id=document_id,
295309
text="Test document content.",
296310
modalities=[],
297-
metadata={"file_path": file_path},
311+
metadata=DocumentMetadata(file_path=file_path),
298312
)
299313

300314

315+
def test_apply_uploaded_file_metadata_preserves_chunk_suffix():
    """Re-keying a fake sample swaps its document-ID prefix but keeps the ``+0`` suffix."""
    sample = _fake_doc("/tmp/original-name.txt", document_id="default-doc")

    _apply_uploaded_file_metadata([sample], "client-doc", "original-name.txt")

    # The document ID is replaced outright; the chunk ID only loses its old prefix.
    assert sample.document_id == "client-doc"
    assert sample.id == "client-doc+0"
    # The upload's filename is stored under the "filename" key of metadata.extra.
    assert sample.metadata.extra["filename"] == "original-name.txt"
323+
324+
301325
@pytest.fixture(scope="module")
302326
def indexer_client(tmp_path_factory):
303327
"""Builds the indexer FastAPI app."""
@@ -392,6 +416,81 @@ def test_upload_file_success(indexer_client):
392416
assert Path(upload_dir, "new-doc").exists()
393417

394418

419+
def test_uploaded_file_has_filename_in_list_files(tmp_path):
    """Upload a file via the index API, then check ``/list_files`` reports its filename.

    The index router and the retriever router are both built from the same
    temporary config file; sparse-model loading, processor registration, the
    upload directory, the indexer factory and file processing are all patched.
    """
    upload_dir = tmp_path / "uploads"
    upload_dir.mkdir()
    db_path = str(tmp_path / "uploaded_list_files.db")
    config_file = tmp_path / "config.yaml"
    settings = {
        "db": {"uri": db_path, "name": "my_db"},
        "hybrid_search_weight": 0.5,
        "k": 2,
        "collection_name": _COLLECTION,
        "use_web": False,
        "reranker_model_name": None,
    }
    with open(config_file, "w") as handle:
        yaml.dump(settings, handle)

    with ExitStack() as stack:
        # The sparse model must be patched before the Indexer is constructed.
        stack.enter_context(
            patch(
                "mmore.index.indexer.SparseModel.from_config",
                return_value=FakeSparseEmbedding(),
            )
        )
        milvus_client = MilvusClient(db_path, enable_sparse=True)
        indexer = Indexer(
            dense_model_config=DenseModelConfig(model_name="debug"),
            sparse_model_config=SparseModelConfig(
                model_name="naver/splade-cocondenser-selfdistil"
            ),
            client=milvus_client,
        )
        stack.enter_context(patch("mmore.run_index_api.UPLOAD_DIR", str(upload_dir)))
        stack.enter_context(patch("mmore.run_index_api.register_all_processors"))
        stack.enter_context(
            patch("mmore.run_index_api.get_indexer", return_value=indexer)
        )

        index_app = FastAPI()
        index_app.include_router(make_index_router(str(config_file)))
        index_client = TestClient(index_app, raise_server_exceptions=False)

        # File processing is stubbed to yield one fake sample for the uploaded path.
        uploaded_path = str(upload_dir / "listed-doc.txt")
        stack.enter_context(
            patch(
                "mmore.run_index_api.process_files_default",
                return_value=[_fake_doc(uploaded_path)],
            )
        )
        upload_response = index_client.post(
            "/v1/files",
            data={"fileId": "listed-doc"},
            files={"file": ("listed-doc.txt", b"Hello list files", "text/plain")},
        )
        assert upload_response.status_code == 201

        # Patch the retriever's sparse model before its router is built.
        stack.enter_context(
            patch(
                "mmore.rag.retriever.SparseModel.from_config",
                return_value=FakeSparseEmbedding(),
            )
        )
        retriever_app = FastAPI()
        retriever_app.include_router(make_router(str(config_file)))
        retriever_client = TestClient(retriever_app)

        listing_response = retriever_client.get(
            "/list_files", params={"collection_name": _COLLECTION}
        )

        assert listing_response.status_code == 200
        filename_by_id = {
            entry["id"]: entry["filename"] for entry in listing_response.json()
        }
        assert filename_by_id["listed-doc"] == "listed-doc.txt"
        assert filename_by_id["listed-doc"] != "Unknown"
492+
493+
395494
def test_upload_duplicate_file_returns_400(indexer_client):
396495
tc, upload_dir, _ = indexer_client
397496
duplicate_id = "duplicate-doc"

0 commit comments

Comments
 (0)