Skip to content

Commit c327073

Browse files
authored
Merge pull request #2591 from danielaskdd/fix-upload-duplication
Fix: Content Duplicate Detection for Document Upload Now Trackable
2 parents 3c492bb + 9c77b86 commit c327073

File tree

2 files changed

+64
-7
lines changed

2 files changed

+64
-7
lines changed

lightrag/api/routers/document_routes.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2080,13 +2080,41 @@ async def upload_to_input_dir(
20802080
uploaded file is of a supported type, saves it in the specified input directory,
20812081
indexes it for retrieval, and returns a success status with relevant details.
20822082
2083+
**Duplicate Detection Behavior:**
2084+
2085+
This endpoint handles two types of duplicate scenarios differently:
2086+
2087+
1. **Filename Duplicate (Synchronous Detection)**:
2088+
- Detected immediately before file processing
2089+
- Returns `status="duplicated"` together with the existing document's track_id when one is recorded
2090+
- Two cases:
2091+
- If filename exists in document storage: returns existing track_id
2092+
- If filename exists in file system only: returns empty track_id ("")
2093+
2094+
2. **Content Duplicate (Asynchronous Detection)**:
2095+
- Detected during background processing after content extraction
2096+
- Returns `status="success"` with a new track_id immediately
2097+
- The duplicate is detected later when processing the file content
2098+
- Use `/documents/track_status/{track_id}` to check the final result:
2099+
- Document will have `status="FAILED"`
2100+
- `error_msg` has the form "Content already exists. Original doc_id: <doc_id>, Status: <original status>"
2101+
- `metadata.is_duplicate=true` with reference to original document
2102+
- `metadata.original_doc_id` points to the existing document
2103+
- `metadata.original_track_id` shows the original upload's track_id
2104+
2105+
**Why Different Behavior?**
2106+
- Filename check is fast (simple lookup), done synchronously
2107+
- Content extraction is expensive (PDF/DOCX parsing), done asynchronously
2108+
- This design prevents blocking the client during expensive operations
2109+
20832110
Args:
20842111
background_tasks: FastAPI BackgroundTasks for async processing
20852112
file (UploadFile): The file to be uploaded. It must have an allowed extension.
20862113
20872114
Returns:
20882115
InsertResponse: A response object containing the upload status and a message.
2089-
status can be "success", "duplicated", or error is thrown.
2116+
- status="success": File accepted and queued for processing
2117+
- status="duplicated": Filename already exists (track_id refers to the existing document, or is "" if the file exists only in the file system)
20902118
20912119
Raises:
20922120
HTTPException: If the file type is not supported (400) or other errors occur (500).

lightrag/lightrag.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1361,17 +1361,46 @@ async def apipeline_enqueue_documents(
13611361
# Exclude IDs of documents that are already enqueued
13621362
unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
13631363

1364-
# Log ignored document IDs (documents that were filtered out because they already exist)
1364+
# Handle duplicate documents - create trackable records with current track_id
13651365
ignored_ids = list(all_new_doc_ids - unique_new_doc_ids)
13661366
if ignored_ids:
1367+
duplicate_docs: dict[str, Any] = {}
13671368
for doc_id in ignored_ids:
13681369
file_path = new_docs.get(doc_id, {}).get("file_path", "unknown_source")
1369-
logger.warning(
1370-
f"Ignoring document ID (already exists): {doc_id} ({file_path})"
1370+
logger.warning(f"Duplicate document detected: {doc_id} ({file_path})")
1371+
1372+
# Get existing document info for reference
1373+
existing_doc = await self.doc_status.get_by_id(doc_id)
1374+
existing_status = (
1375+
existing_doc.get("status", "unknown") if existing_doc else "unknown"
1376+
)
1377+
existing_track_id = (
1378+
existing_doc.get("track_id", "") if existing_doc else ""
13711379
)
1372-
if len(ignored_ids) > 3:
1373-
logger.warning(
1374-
f"Total Ignoring {len(ignored_ids)} document IDs that already exist in storage"
1380+
1381+
# Create a new record with unique ID for this duplicate attempt
1382+
dup_record_id = compute_mdhash_id(f"{doc_id}-{track_id}", prefix="dup-")
1383+
duplicate_docs[dup_record_id] = {
1384+
"status": DocStatus.FAILED,
1385+
"content_summary": f"[DUPLICATE] Original document: {doc_id}",
1386+
"content_length": new_docs.get(doc_id, {}).get("content_length", 0),
1387+
"created_at": datetime.now(timezone.utc).isoformat(),
1388+
"updated_at": datetime.now(timezone.utc).isoformat(),
1389+
"file_path": file_path,
1390+
"track_id": track_id, # Use current track_id for tracking
1391+
"error_msg": f"Content already exists. Original doc_id: {doc_id}, Status: {existing_status}",
1392+
"metadata": {
1393+
"is_duplicate": True,
1394+
"original_doc_id": doc_id,
1395+
"original_track_id": existing_track_id,
1396+
},
1397+
}
1398+
1399+
# Store duplicate records in doc_status
1400+
if duplicate_docs:
1401+
await self.doc_status.upsert(duplicate_docs)
1402+
logger.info(
1403+
f"Created {len(duplicate_docs)} duplicate document records with track_id: {track_id}"
13751404
)
13761405

13771406
# Filter new_docs to only include documents with unique IDs

0 commit comments

Comments
 (0)