Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 0 additions & 28 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from api.db import FileType
from api.db.db_models import Task
from api.db.services import duplicate_name
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
Expand Down Expand Up @@ -183,33 +182,6 @@ async def create():
return server_error_response(e)


@manager.route("/metadata/update", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_ids")
async def metadata_update():
    """Batch-update metadata on a set of documents within one knowledge base.

    Expects a JSON body with ``kb_id``, ``doc_ids`` and optional
    ``updates``/``deletes`` lists; returns the number of documents updated
    plus the matched-document count.
    """
    payload = await get_request_json()
    kb_id = payload.get("kb_id")
    document_ids = payload.get("doc_ids")
    update_items = payload.get("updates", []) or []
    delete_items = payload.get("deletes", []) or []

    # kb_id is mandatory; doc_ids presence is enforced by @validate_request.
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)

    if not (isinstance(update_items, list) and isinstance(delete_items, list)):
        return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR)

    # Each update entry must carry a non-empty key and an explicit value field.
    for item in update_items:
        if not isinstance(item, dict) or not item.get("key") or "value" not in item:
            return get_json_result(data=False, message="Each update requires key and value.", code=RetCode.ARGUMENT_ERROR)

    # Each delete entry must carry a non-empty key.
    for item in delete_items:
        if not isinstance(item, dict) or not item.get("key"):
            return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR)

    changed = DocMetadataService.batch_update_metadata(kb_id, document_ids, update_items, delete_items)
    return get_json_result(data={"updated": changed, "matched_docs": len(document_ids)})


@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
Expand Down
128 changes: 128 additions & 0 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,3 +891,131 @@ async def update_metadata_config(tenant_id, dataset_id, document_id):
return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e))

return get_result(data=doc.to_dict())


@manager.route("/datasets/<dataset_id>/documents/metadatas", methods=["PATCH"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def update_metadata(tenant_id, dataset_id):
    """
    Update document metadata in batch.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Metadata update request.
        required: true
        schema:
          type: object
          properties:
            selector:
              type: object
              description: Document selector.
              properties:
                document_ids:
                  type: array
                  items:
                    type: string
                  description: List of document IDs to update.
                metadata_condition:
                  type: object
                  description: Filter documents by existing metadata.
            updates:
              type: array
              items:
                type: object
                properties:
                  key:
                    type: string
                  value:
                    type: any
              description: List of metadata key-value pairs to update.
            deletes:
              type: array
              items:
                type: object
                properties:
                  key:
                    type: string
              description: List of metadata keys to delete.
    responses:
      200:
        description: Metadata updated successfully.
    """
    # Verify ownership of dataset
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")

    # Get request body
    req = await get_request_json()
    selector = req.get("selector", {}) or {}
    updates = req.get("updates", []) or []
    deletes = req.get("deletes", []) or []

    # Validate selector
    if not isinstance(selector, dict):
        return get_error_data_result(message="selector must be an object.")
    if not isinstance(updates, list) or not isinstance(deletes, list):
        return get_error_data_result(message="updates and deletes must be lists.")

    # Validate metadata_condition
    metadata_condition = selector.get("metadata_condition", {}) or {}
    if metadata_condition and not isinstance(metadata_condition, dict):
        return get_error_data_result(message="metadata_condition must be an object.")

    # Validate document_ids
    document_ids = selector.get("document_ids", []) or []
    if document_ids and not isinstance(document_ids, list):
        return get_error_data_result(message="document_ids must be a list.")

    # Validate updates: each entry needs a non-empty key and an explicit value.
    for upd in updates:
        if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd:
            return get_error_data_result(message="Each update requires key and value.")

    # Validate deletes: each entry needs a non-empty key.
    for d in deletes:
        if not isinstance(d, dict) or not d.get("key"):
            return get_error_data_result(message="Each delete requires key.")

    # Initialize target document IDs
    target_doc_ids = set()

    # If document_ids provided, validate they belong to the dataset
    if document_ids:
        kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
        invalid_ids = set(document_ids) - set(kb_doc_ids)
        if invalid_ids:
            return get_error_data_result(
                message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}"
            )
        target_doc_ids = set(document_ids)

    # Apply metadata_condition filtering if provided
    if metadata_condition:
        metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
        filtered_ids = set(
            meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
        )
        # FIX: when the selector carries only a metadata_condition (no explicit
        # document_ids), target_doc_ids starts empty and a plain intersection
        # would always produce an empty match. Use the filtered set directly in
        # that case; intersect only when explicit IDs narrowed the target set.
        target_doc_ids = target_doc_ids & filtered_ids if target_doc_ids else filtered_ids
        if metadata_condition.get("conditions") and not target_doc_ids:
            return get_result(data={"updated": 0, "matched_docs": 0})

    # Convert to list and perform update
    target_doc_ids = list(target_doc_ids)
    updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
    return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
51 changes: 0 additions & 51 deletions api/apps/sdk/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,57 +157,6 @@ async def download_doc(document_id):
)


@manager.route("/datasets/<dataset_id>/metadata/update", methods=["POST"])  # noqa: F821
@token_required
async def metadata_batch_update(dataset_id, tenant_id):
    """Batch-update document metadata for a dataset via the token-based SDK API.

    Body: optional ``selector`` (``document_ids`` and/or ``metadata_condition``)
    plus ``updates``/``deletes`` lists. Returns the count of updated documents
    and the number of matched documents.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

    req = await get_request_json()
    selector = req.get("selector", {}) or {}
    updates = req.get("updates", []) or []
    deletes = req.get("deletes", []) or []

    if not isinstance(selector, dict):
        return get_error_data_result(message="selector must be an object.")
    if not isinstance(updates, list) or not isinstance(deletes, list):
        return get_error_data_result(message="updates and deletes must be lists.")

    metadata_condition = selector.get("metadata_condition", {}) or {}
    if metadata_condition and not isinstance(metadata_condition, dict):
        return get_error_data_result(message="metadata_condition must be an object.")

    document_ids = selector.get("document_ids", []) or []
    if document_ids and not isinstance(document_ids, list):
        return get_error_data_result(message="document_ids must be a list.")

    # Each update entry needs a non-empty key and an explicit value field.
    for upd in updates:
        if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd:
            return get_error_data_result(message="Each update requires key and value.")
    # Each delete entry needs a non-empty key.
    for d in deletes:
        if not isinstance(d, dict) or not d.get("key"):
            return get_error_data_result(message="Each delete requires key.")

    # FIX: initialize unconditionally. Previously target_doc_ids was only bound
    # inside `if document_ids:`, so a selector without document_ids raised
    # NameError further down. (Also removed the dead `set(kb_doc_ids)`
    # assignment that was immediately overwritten.)
    target_doc_ids = set()

    if document_ids:
        kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
        invalid_ids = set(document_ids) - set(kb_doc_ids)
        if invalid_ids:
            return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}")
        target_doc_ids = set(document_ids)

    if metadata_condition:
        metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id])
        filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
        # FIX: with a metadata-condition-only selector, intersecting with the
        # (empty) target set would always yield zero matches; use the filtered
        # set as the initial target when no explicit IDs were provided.
        target_doc_ids = target_doc_ids & filtered_ids if target_doc_ids else filtered_ids
        if metadata_condition.get("conditions") and not target_doc_ids:
            return get_result(data={"updated": 0, "matched_docs": 0})

    target_doc_ids = list(target_doc_ids)
    updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
    return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})


DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"

Expand Down
2 changes: 1 addition & 1 deletion docker/.env
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ GO_ADMIN_PORT=9383
API_PROXY_SCHEME=python # use pure python server deployment

# The RAGFlow Docker image to download. v0.22+ doesn't include embedding models.
RAGFLOW_IMAGE=infiniflow/ragflow:v0.25.0
RAGFLOW_IMAGE=infiniflow/ragflow:latest

# If you cannot download the RAGFlow Docker image:
# RAGFLOW_IMAGE=swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:v0.25.0
Expand Down
10 changes: 10 additions & 0 deletions test/testcases/test_http_api/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,16 @@ def metadata_batch_update(auth, dataset_id, payload=None):
return res.json()


def update_documents_metadata(auth, dataset_id, payload=None):
    """New unified API for updating document metadata.

    Issues a PATCH to /api/v1/datasets/{dataset_id}/documents/metadatas and
    returns the decoded JSON response body.
    """
    endpoint = f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}/documents/metadatas"
    response = requests.patch(url=endpoint, headers=HEADERS, auth=auth, json=payload)
    return response.json()


# CHAT COMPLETIONS AND RELATED QUESTIONS
def related_questions(auth, payload=None):
url = f"{HOST_ADDRESS}/api/{VERSION}/sessions/related_questions"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -388,95 +388,6 @@ def test_download_and_download_doc_errors(self, monkeypatch):
res = _run(module.download_doc("doc-1"))
assert res["filename"] == "doc.txt"

def test_metadata_batch_update(self, monkeypatch):
    """Walk every validation and success branch of the SDK metadata_batch_update endpoint.

    Each step re-patches get_request_json with a new body and asserts the
    endpoint's response; later steps build on earlier monkeypatches, so
    ordering matters.
    """
    module = _load_doc_module(monkeypatch)
    # Pass conditions through unchanged so the filter sees the raw dict.
    monkeypatch.setattr(module, "convert_conditions", lambda cond: cond)
    # 1) Dataset not accessible -> ownership error.
    monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
    monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"selector": {}}))
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert "don't own the dataset" in res["message"]

    # 2) selector must be a dict.
    monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True)
    monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"selector": [1]}))
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert res["message"] == "selector must be an object."

    # 3) updates/deletes must be lists.
    monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"selector": {}, "updates": {"k": "v"}, "deletes": []}))
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert res["message"] == "updates and deletes must be lists."

    # 4) metadata_condition must be a dict when present.
    monkeypatch.setattr(
        module,
        "get_request_json",
        lambda: _AwaitableValue({"selector": {"metadata_condition": [1]}, "updates": [], "deletes": []}),
    )
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert res["message"] == "metadata_condition must be an object."

    # 5) document_ids must be a list when present.
    monkeypatch.setattr(
        module,
        "get_request_json",
        lambda: _AwaitableValue({"selector": {"document_ids": "doc-1"}, "updates": [], "deletes": []}),
    )
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert res["message"] == "document_ids must be a list."

    # 6) An update entry with an empty key is rejected.
    monkeypatch.setattr(
        module,
        "get_request_json",
        lambda: _AwaitableValue({"selector": {}, "updates": [{"key": ""}], "deletes": []}),
    )
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert "Each update requires key and value." in res["message"]

    # 7) A delete entry without a key is rejected.
    monkeypatch.setattr(
        module,
        "get_request_json",
        lambda: _AwaitableValue({"selector": {}, "updates": [], "deletes": [{"x": "y"}]}),
    )
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert "Each delete requires key." in res["message"]

    # 8) document_ids outside the dataset are reported.
    monkeypatch.setattr(
        module,
        "get_request_json",
        lambda: _AwaitableValue(
            {
                "selector": {"document_ids": ["bad"], "metadata_condition": {"conditions": []}},
                "updates": [{"key": "k", "value": "v"}],
                "deletes": [],
            }
        ),
    )
    monkeypatch.setattr(module.KnowledgebaseService, "list_documents_by_ids", lambda _ids: ["doc-1"])
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert "do not belong to dataset" in res["message"]

    # 9) Condition filter matches nothing -> 0 updated / 0 matched.
    monkeypatch.setattr(
        module,
        "get_request_json",
        lambda: _AwaitableValue(
            {
                "selector": {"document_ids": ["doc-1"], "metadata_condition": {"conditions": [{"f": "x"}]}},
                "updates": [{"key": "k", "value": "v"}],
                "deletes": [],
            }
        ),
    )
    monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: [])
    monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [])
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert res["code"] == 0
    assert res["data"]["updated"] == 0
    assert res["data"]["matched_docs"] == 0

    # 10) Filter matches doc-1 and the batch update succeeds.
    monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: ["doc-1"])
    monkeypatch.setattr(module.DocMetadataService, "batch_update_metadata", lambda *_args, **_kwargs: 1)
    res = _run(module.metadata_batch_update.__wrapped__("ds-1", "tenant-1"))
    assert res["code"] == 0
    assert res["data"]["updated"] == 1
    assert res["data"]["matched_docs"] == 1


def test_parse_branches(self, monkeypatch):
module = _load_doc_module(monkeypatch)
Expand Down
Loading
Loading