Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 0 additions & 41 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,26 +226,6 @@ async def metadata_update():
return get_json_result(data={"updated": updated, "matched_docs": len(document_ids)})


@manager.route("/update_metadata_setting", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id", "metadata")
async def update_metadata_setting():
    """Replace the `metadata` section of a document's parser config.

    Expects a JSON body with `doc_id` and `metadata`; returns the refreshed
    document as a dict, or an auth/not-found error result.
    """
    payload = await get_request_json()
    doc_id = payload["doc_id"]

    # Reject callers without access to this document.
    if not DocumentService.accessible(doc_id, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

    found, document = DocumentService.get_by_id(doc_id)
    if not found:
        return get_data_error_result(message="Document not found!")

    DocumentService.update_parser_config(document.id, {"metadata": payload["metadata"]})

    # Re-read so the response reflects the persisted state.
    found, document = DocumentService.get_by_id(document.id)
    if not found:
        return get_data_error_result(message="Document not found!")

    return get_json_result(data=document.to_dict())


@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
Expand Down Expand Up @@ -335,27 +315,6 @@ async def change_status():
return get_json_result(data=result)


@manager.route("/rm", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id")
async def rm():
    """Delete one or more documents by id.

    `doc_id` in the JSON body may be a single id string or a list of ids.
    Fails fast with an auth error if any id is not deletable by the caller.
    """
    payload = await get_request_json()
    ids = payload["doc_id"]
    ids = [ids] if isinstance(ids, str) else ids

    # Every id must pass the deletion-permission check before anything is removed.
    for one_id in ids:
        if DocumentService.accessible4deletion(one_id, current_user.id):
            continue
        return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

    errors = await thread_pool_exec(FileService.delete_docs, ids, current_user.id)
    if errors:
        return get_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)

    return get_json_result(data=True)


@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
Expand Down
163 changes: 158 additions & 5 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,17 @@
from api.db import VALID_FILE_TYPES
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.document_service import DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.common.check_team_permission import check_kb_team_permission
from api.utils.api_utils import get_data_error_result, get_error_data_result, get_result, get_json_result, \
server_error_response, add_tenant_id_to_kwargs, get_request_json
server_error_response, add_tenant_id_to_kwargs, get_request_json, get_error_argument_result, check_duplicate_ids
from api.utils.validation_utils import (
UpdateDocumentReq, format_validation_error_message,
UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq,
)
from common.constants import RetCode
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import thread_pool_exec

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821
@login_required
Expand Down Expand Up @@ -260,9 +263,7 @@ async def upload_document(dataset_id, tenant_id):
description: Processing status.
"""
from api.constants import FILE_NAME_LEN_LIMIT
from api.common.check_team_permission import check_kb_team_permission
from api.db.services.file_service import FileService
from common.misc_utils import thread_pool_exec

form = await request.form
files = await request.files
Expand Down Expand Up @@ -573,7 +574,7 @@ def _parse_doc_id_filter_with_metadata(req, kb_id):
- The metadata_condition uses operators like: =, !=, >, <, >=, <=, contains, not contains,
in, not in, start with, end with, empty, not empty.
- The metadata parameter performs exact matching where values are OR'd within the same key
and AND'd across different keys.
& AND'd across different keys.

Examples:
Simple metadata filter (exact match):
Expand Down Expand Up @@ -668,6 +669,90 @@ def _parse_doc_id_filter_with_metadata(req, kb_id):
return RetCode.SUCCESS, "", list(doc_ids_filter) if doc_ids_filter is not None else [], return_empty_metadata


@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def delete_documents(tenant_id, dataset_id):
    """
    Delete documents from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset containing the documents.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Document deletion parameters.
        required: true
        schema:
          type: object
          properties:
            ids:
              type: array or null
              items:
                type: string
              description: |
                Specifies the documents to delete:
                - An array of IDs, only the specified documents will be deleted.
            delete_all:
              type: boolean
              default: false
              description: Whether to delete all documents in the dataset.
    responses:
      200:
        description: Successful operation.
        schema:
          type: object
    """
    req, err = await validate_and_parse_json_request(request, DeleteDocumentReq)
    if err is not None or req is None:
        return get_error_argument_result(err)

    try:
        # Validate dataset exists and user has permission
        if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
            return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

        # Get documents to delete: either an explicit id list or everything
        # in the dataset, but never both.
        doc_ids = req.get("ids") or []
        delete_all = req.get("delete_all", False)
        if not delete_all and len(doc_ids) == 0:
            return get_error_data_result(message=f"should either provide doc ids or set delete_all(true), dataset: {dataset_id}. ")

        if len(doc_ids) > 0 and delete_all:
            return get_error_data_result(message=f"should not provide both doc ids and delete_all(true), dataset: {dataset_id}. ")
        if delete_all:
            doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]

        # Make sure each id is unique. Always proceed with the de-duplicated
        # list; the previous branching was inverted and kept the duplicated
        # list exactly when duplicates were detected.
        unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document")
        if duplicate_messages:
            logging.warning(f"duplicate_messages:{duplicate_messages}")
        doc_ids = unique_doc_ids

        logging.info(f"delete_documents: deleting {len(doc_ids)} document(s) from dataset {dataset_id}")

        # Delete documents using existing FileService.delete_docs
        errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id)

        if errors:
            # NOTE(review): delete_docs accumulates errors while still
            # committing successful deletions, so some ids may already be
            # deleted when this error is returned — confirm whether callers
            # need per-id results to reconcile state.
            logging.warning(f"delete_documents: errors deleting from dataset {dataset_id}: {errors}")
            return get_error_data_result(message=str(errors))

        return get_result(data={"deleted": len(doc_ids)})
    except Exception as e:
        logging.exception(e)
        return get_error_data_result(message="Internal server error")
Comment on lines +750 to +760
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Partial-failure semantics and observability.

FileService.delete_docs iterates sequentially and accumulates errors into a single string while still committing successful deletions. When errors is truthy the endpoint returns a generic error, but some documents may already be deleted — the client sees a failure without any indication of which IDs succeeded. Consider returning structured per-id results (e.g., {"deleted": [...], "failed": [...]}) so callers can reconcile state, and add an logging.info for the deletion flow as required by the repo guideline to add logging for new flows.

As per coding guidelines: "Add logging for new flows".


def _aggregate_filters(docs):
"""Aggregate filter options from a list of documents.

Expand Down Expand Up @@ -725,3 +810,71 @@ def _aggregate_filters(docs):
"run_status": run_status_counter,
"metadata": metadata_counter,
}

@manager.route("/datasets/<dataset_id>/documents/<document_id>/metadata/config", methods=["PUT"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def update_metadata_config(tenant_id, dataset_id, document_id):
    """
    Update document metadata configuration.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Metadata configuration.
        required: true
        schema:
          type: object
          properties:
            metadata:
              type: object
              description: Metadata configuration JSON.
    responses:
      200:
        description: Document updated successfully.
    """
    # Verify dataset access with the same helper the other handlers in this
    # module use, so team-shared datasets are not rejected with "don't own".
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

    # Verify document exists in the dataset
    docs = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not docs:
        return get_error_data_result(
            message=f"Document {document_id} not found in dataset {dataset_id}"
        )
    doc = docs[0]

    # Get request body; guard against an empty or non-object JSON body before
    # the key lookup (a None body would raise TypeError on `in`).
    req = await get_request_json()
    if not isinstance(req, dict) or "metadata" not in req:
        return get_error_argument_result(message="metadata is required")

    # Update parser config with metadata
    try:
        DocumentService.update_parser_config(doc.id, {"metadata": req["metadata"]})
    except Exception as e:
        logging.exception(e)
        return get_error_data_result(message="Failed to update metadata configuration")

    # Get updated document so the response reflects the persisted state.
    e, doc = DocumentService.get_by_id(doc.id)
    if not e:
        return get_data_error_result(message="Document not found!")

    logging.info(f"update_metadata_config: updated metadata config for document {doc.id} in dataset {dataset_id}")
    return get_result(data=doc.to_dict())
Comment on lines +820 to +887
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Add logging and error handling to the new metadata-config flow.

This new handler has no logging calls and no try/except around DocumentService.update_parser_config, which can raise (e.g., LookupError from the underlying service). An unhandled exception here will surface as a 500 with no structured log context. Please add info/warning/error logs along the happy path and failure branches, and wrap the service call in a try/except consistent with update_document / delete_documents in this file.

As per coding guidelines: "Add logging for new flows".

Loading
Loading