Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 0 additions & 47 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from api.db import FileType
from api.db.db_models import Task
from api.db.services import duplicate_name
from api.db.services.doc_metadata_service import DocMetadataService

Check failure on line 28 in api/apps/document_app.py

View workflow job for this annotation

GitHub Actions / ragflow_tests

ruff (F401)

api/apps/document_app.py:28:50: F401 `api.db.services.doc_metadata_service.DocMetadataService` imported but unused help: Remove unused import: `api.db.services.doc_metadata_service.DocMetadataService`
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
Expand Down Expand Up @@ -183,53 +183,6 @@
return server_error_response(e)


@manager.route("/metadata/update", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_ids")
async def metadata_update():
    """Batch-update metadata on a set of documents in one knowledge base.

    Request JSON:
        kb_id (str): knowledge-base ID (required).
        doc_ids (list[str]): IDs of the documents to touch (required).
        updates (list[dict]): {"key": ..., "value": ...} pairs to set.
        deletes (list[dict]): {"key": ...} entries naming keys to remove.

    Returns:
        JSON result with the number of rows updated and of matched docs,
        or an ARGUMENT_ERROR result when validation fails.
    """
    req = await get_request_json()
    kb_id = req.get("kb_id")
    document_ids = req.get("doc_ids")
    updates = req.get("updates", []) or []
    deletes = req.get("deletes", []) or []

    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)

    # @validate_request only guarantees doc_ids is present, not that it is a
    # list; a scalar here would crash batch_update_metadata / len() below.
    if not isinstance(document_ids, list):
        return get_json_result(data=False, message="doc_ids must be a list.", code=RetCode.ARGUMENT_ERROR)

    if not isinstance(updates, list) or not isinstance(deletes, list):
        return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)

    for upd in updates:
        if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd:
            return get_json_result(data=False, message="Each update requires key and value.", code=RetCode.ARGUMENT_ERROR)
    for d in deletes:
        if not isinstance(d, dict) or not d.get("key"):
            return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)

    updated = DocMetadataService.batch_update_metadata(kb_id, document_ids, updates, deletes)
    return get_json_result(data={"updated": updated, "matched_docs": len(document_ids)})


@manager.route("/update_metadata_setting", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id", "metadata")
async def update_metadata_setting():
    """Store a metadata configuration inside a document's parser config.

    Expects JSON with "doc_id" and "metadata"; responds with the refreshed
    document dict on success.
    """
    payload = await get_request_json()
    doc_id = payload["doc_id"]

    # Reject callers with no access to this document.
    if not DocumentService.accessible(doc_id, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

    found, doc = DocumentService.get_by_id(doc_id)
    if not found:
        return get_data_error_result(message="Document not found!")

    DocumentService.update_parser_config(doc.id, {"metadata": payload["metadata"]})

    # Re-read so the response reflects the persisted state.
    found, doc = DocumentService.get_by_id(doc.id)
    if not found:
        return get_data_error_result(message="Document not found!")

    return get_json_result(data=doc.to_dict())


@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
Expand Down
216 changes: 210 additions & 6 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,15 +264,15 @@ async def upload_document(dataset_id, tenant_id):
"""
from api.constants import FILE_NAME_LEN_LIMIT
from api.db.services.file_service import FileService

form = await request.form
files = await request.files

# Validation
if "file" not in files:
logging.error("No file part!")
return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR)

file_objs = files.getlist("file")
for file_obj in file_objs:
if file_obj is None or file_obj.filename is None or file_obj.filename == "":
Expand All @@ -288,7 +288,7 @@ async def upload_document(dataset_id, tenant_id):
if not e:
logging.error(f"Can't find the dataset with ID {dataset_id}!")
return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR)

# Permission Check
if not check_kb_team_permission(kb, tenant_id):
logging.error("No authorization.")
Expand All @@ -308,7 +308,7 @@ async def upload_document(dataset_id, tenant_id):
msg = "There seems to be an issue with your file format. please verify it is correct and not corrupted."
logging.error(msg)
return get_error_data_result(message=msg, code=RetCode.DATA_ERROR)

files = [f[0] for f in files] # remove the blob

# Check if we should return raw files without document key mapping
Expand Down Expand Up @@ -580,7 +580,7 @@ def _parse_doc_id_filter_with_metadata(req, kb_id):
- The metadata_condition uses operators like: =, !=, >, <, >=, <=, contains, not contains,
in, not in, start with, end with, empty, not empty.
- The metadata parameter performs exact matching where values are OR'd within the same key
and AND'd across different keys.
& AND'd across different keys.

Examples:
Simple metadata filter (exact match):
Expand Down Expand Up @@ -758,6 +758,8 @@ async def delete_documents(tenant_id, dataset_id):
except Exception as e:
logging.exception(e)
return get_error_data_result(message="Internal server error")


def _aggregate_filters(docs):
"""Aggregate filter options from a list of documents.

Expand Down Expand Up @@ -815,3 +817,205 @@ def _aggregate_filters(docs):
"run_status": run_status_counter,
"metadata": metadata_counter,
}

@manager.route("/datasets/<dataset_id>/documents/<document_id>/metadata/config", methods=["PUT"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def update_metadata_config(tenant_id, dataset_id, document_id):
    """
    Update document metadata configuration.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Metadata configuration.
        required: true
        schema:
          type: object
          properties:
            metadata:
              type: object
              description: Metadata configuration JSON.
    responses:
      200:
        description: Document updated successfully.
    """
    # Dataset must exist and belong to the caller's tenant.
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message="You don't own the dataset.")

    # The document must live inside that dataset.
    matches = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not matches:
        return get_error_data_result(message=f"Document {document_id} not found in dataset {dataset_id}")
    document = matches[0]

    body = await get_request_json()
    if "metadata" not in body:
        return get_error_argument_result(message="metadata is required")

    # Persist the metadata block inside the document's parser config.
    try:
        DocumentService.update_parser_config(document.id, {"metadata": body["metadata"]})
    except Exception as exc:
        logging.error("error when update_parser_config", exc_info=exc)
        return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(exc))

    # Re-read so the response reflects what was actually stored.
    try:
        ok, document = DocumentService.get_by_id(document.id)
        if not ok:
            return get_data_error_result(message="Document not found!")
    except Exception as exc:
        return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(exc))

    return get_result(data=document.to_dict())


@manager.route("/datasets/<dataset_id>/documents/metadatas", methods=["PATCH"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def update_metadata(tenant_id, dataset_id):
    """
    Update document metadata in batch.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Metadata update request.
        required: true
        schema:
          type: object
          properties:
            selector:
              type: object
              description: Document selector.
              properties:
                document_ids:
                  type: array
                  items:
                    type: string
                  description: List of document IDs to update.
                metadata_condition:
                  type: object
                  description: Filter documents by existing metadata.
            updates:
              type: array
              items:
                type: object
                properties:
                  key:
                    type: string
                  value:
                    type: any
              description: List of metadata key-value pairs to update.
            deletes:
              type: array
              items:
                type: object
                properties:
                  key:
                    type: string
              description: List of metadata keys to delete.
    responses:
      200:
        description: Metadata updated successfully.
    """
    # Verify ownership of dataset
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")

    # Get request body
    req = await get_request_json()
    selector = req.get("selector", {}) or {}
    updates = req.get("updates", []) or []
    deletes = req.get("deletes", []) or []

    # Validate selector
    if not isinstance(selector, dict):
        return get_error_data_result(message="selector must be an object.")
    if not isinstance(updates, list) or not isinstance(deletes, list):
        return get_error_data_result(message="updates and deletes must be lists.")

    # Validate metadata_condition
    metadata_condition = selector.get("metadata_condition", {}) or {}
    if metadata_condition and not isinstance(metadata_condition, dict):
        return get_error_data_result(message="metadata_condition must be an object.")

    # Validate document_ids
    document_ids = selector.get("document_ids", []) or []
    if document_ids and not isinstance(document_ids, list):
        return get_error_data_result(message="document_ids must be a list.")

    # Validate updates
    for upd in updates:
        if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd:
            return get_error_data_result(message="Each update requires key and value.")

    # Validate deletes
    for d in deletes:
        if not isinstance(d, dict) or not d.get("key"):
            return get_error_data_result(message="Each delete requires key.")

    # Initialize target document IDs
    target_doc_ids = set()

    # If document_ids provided, validate they belong to the dataset
    if document_ids:
        kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
        invalid_ids = set(document_ids) - set(kb_doc_ids)
        if invalid_ids:
            return get_error_data_result(
                message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}"
            )
        target_doc_ids = set(document_ids)

    # Apply metadata_condition filtering if provided
    if metadata_condition:
        metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
        filtered_ids = set(
            meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
        )
        # BUGFIX: when the selector carries only a metadata_condition (no
        # explicit document_ids), target_doc_ids starts empty, and a plain
        # intersection would always yield an empty match. Use the filtered
        # set directly in that case; otherwise restrict the explicit IDs.
        target_doc_ids = target_doc_ids & filtered_ids if document_ids else filtered_ids
        if metadata_condition.get("conditions") and not target_doc_ids:
            return get_result(data={"updated": 0, "matched_docs": 0})

    # Convert to list and perform update
    target_doc_ids = list(target_doc_ids)
    updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
    return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
51 changes: 0 additions & 51 deletions api/apps/sdk/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,57 +157,6 @@ async def download_doc(document_id):
)


@manager.route("/datasets/<dataset_id>/metadata/update", methods=["POST"])  # noqa: F821
@token_required
async def metadata_batch_update(dataset_id, tenant_id):
    """Batch-update document metadata in a dataset (SDK endpoint).

    Request JSON:
        selector (dict): optional; may carry "document_ids" (list[str]) and/or
            "metadata_condition" (dict) to choose the target documents.
        updates (list[dict]): {"key": ..., "value": ...} pairs to set.
        deletes (list[dict]): {"key": ...} entries naming keys to remove.

    Returns:
        JSON result with {"updated": <rows>, "matched_docs": <count>}.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

    req = await get_request_json()
    selector = req.get("selector", {}) or {}
    updates = req.get("updates", []) or []
    deletes = req.get("deletes", []) or []

    if not isinstance(selector, dict):
        return get_error_data_result(message="selector must be an object.")
    if not isinstance(updates, list) or not isinstance(deletes, list):
        return get_error_data_result(message="updates and deletes must be lists.")

    metadata_condition = selector.get("metadata_condition", {}) or {}
    if metadata_condition and not isinstance(metadata_condition, dict):
        return get_error_data_result(message="metadata_condition must be an object.")

    document_ids = selector.get("document_ids", []) or []
    if document_ids and not isinstance(document_ids, list):
        return get_error_data_result(message="document_ids must be a list.")

    for upd in updates:
        if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd:
            return get_error_data_result(message="Each update requires key and value.")
    for d in deletes:
        if not isinstance(d, dict) or not d.get("key"):
            return get_error_data_result(message="Each delete requires key.")

    # BUGFIX: target_doc_ids was previously only assigned inside the
    # `if document_ids:` branch, so a metadata-condition-only request raised
    # UnboundLocalError further down. Initialize it up front.
    target_doc_ids = set()

    if document_ids:
        kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
        invalid_ids = set(document_ids) - set(kb_doc_ids)
        if invalid_ids:
            return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}")
        target_doc_ids = set(document_ids)

    if metadata_condition:
        metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
        filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
        # BUGFIX: intersecting with an empty target set discarded every match
        # when no explicit document_ids were given; in that case the filtered
        # set itself is the target.
        target_doc_ids = target_doc_ids & filtered_ids if document_ids else filtered_ids
        if metadata_condition.get("conditions") and not target_doc_ids:
            return get_result(data={"updated": 0, "matched_docs": 0})

    target_doc_ids = list(target_doc_ids)
    updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
    return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})


DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"

Expand Down
10 changes: 10 additions & 0 deletions test/testcases/test_http_api/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,16 @@ def metadata_batch_update(auth, dataset_id, payload=None):
return res.json()


def update_documents_metadata(auth, dataset_id, payload=None):
    """New unified API for updating document metadata.

    Issues a PATCH to /api/v1/datasets/{dataset_id}/documents/metadatas and
    returns the decoded JSON response body.
    """
    endpoint = f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}/documents/metadatas"
    response = requests.patch(url=endpoint, headers=HEADERS, auth=auth, json=payload)
    return response.json()


# CHAT COMPLETIONS AND RELATED QUESTIONS
def related_questions(auth, payload=None):
url = f"{HOST_ADDRESS}/api/{VERSION}/sessions/related_questions"
Expand Down
Loading
Loading