Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 0 additions & 21 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,27 +374,6 @@ async def change_status():
return get_json_result(data=result)


@manager.route("/rm", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")
async def rm():
req = await get_request_json()
doc_ids = req["doc_id"]
if isinstance(doc_ids, str):
doc_ids = [doc_ids]

for doc_id in doc_ids:
if not DocumentService.accessible4deletion(doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)

errors = await thread_pool_exec(FileService.delete_docs, doc_ids, current_user.id)

if errors:
return get_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)

return get_json_result(data=True)


@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
Expand Down
94 changes: 90 additions & 4 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,17 @@
from api.db import VALID_FILE_TYPES
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.document_service import DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.common.check_team_permission import check_kb_team_permission
from api.utils.api_utils import get_data_error_result, get_error_data_result, get_result, get_json_result, \
server_error_response, add_tenant_id_to_kwargs, get_request_json
server_error_response, add_tenant_id_to_kwargs, get_request_json, get_error_argument_result, check_duplicate_ids
from api.utils.validation_utils import (
UpdateDocumentReq, format_validation_error_message,
UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq,
)
from common.constants import RetCode
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import thread_pool_exec

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821
@login_required
Expand Down Expand Up @@ -260,9 +263,7 @@ async def upload_document(dataset_id, tenant_id):
description: Processing status.
"""
from api.constants import FILE_NAME_LEN_LIMIT
from api.common.check_team_permission import check_kb_team_permission
from api.db.services.file_service import FileService
from common.misc_utils import thread_pool_exec

form = await request.form
files = await request.files
Expand Down Expand Up @@ -660,3 +661,88 @@ def _parse_doc_id_filter_with_metadata(req, kb_id):
return RetCode.SUCCESS, "", [], return_empty_metadata

return RetCode.SUCCESS, "", list(doc_ids_filter) if doc_ids_filter is not None else [], return_empty_metadata


@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def delete_documents(tenant_id, dataset_id):
"""
Delete documents from a dataset.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: path
name: dataset_id
type: string
required: true
description: ID of the dataset containing the documents.
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
- in: body
name: body
description: Document deletion parameters.
required: true
schema:
type: object
properties:
ids:
type: array or null
items:
type: string
description: |
Specifies the documents to delete:
- An array of IDs, only the specified documents will be deleted.
delete_all:
type: boolean
default: false
description: Whether to delete all documents in the dataset.
responses:
200:
description: Successful operation.
schema:
type: object
"""
req, err = await validate_and_parse_json_request(request, DeleteDocumentReq)
if err is not None or req is None:
return get_error_argument_result(err)

try:
# Validate dataset exists and user has permission
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

# Get documents to delete
doc_ids = req.get("ids") or []
delete_all = req.get("delete_all", False)
if not delete_all and len(doc_ids) == 0:
return get_error_data_result(message=f"should either provide doc ids or set delete_all(true), dataset: {dataset_id}. ")

if len(doc_ids) > 0 and delete_all:
return get_error_data_result(message=f"should not provide both doc ids and delete_all(true), dataset: {dataset_id}. ")
if delete_all:
doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]

# make sure each id is unique
unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document")
if duplicate_messages:
logging.warning(f"duplicate_messages:{duplicate_messages}")
else:
doc_ids = unique_doc_ids

# Delete documents using existing FileService.delete_docs
errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id)
Comment on lines +727 to +746
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Preflight document IDs before deleting.

This route checks access to dataset_id, but then passes arbitrary doc_ids to FileService.delete_docs, which deletes by document ID and resolves the document tenant internally. That allows a request scoped to one dataset to delete a document from another dataset if its ID is supplied. Also, duplicate IDs are only logged rather than removed, so a request containing duplicate IDs can delete some documents and then fail on the repeated IDs, leaving a partial deletion.

Reject duplicates and verify every requested ID belongs to dataset_id before calling FileService.delete_docs.

🛡️ Proposed fix
         if delete_all:
             doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]
 
         # make sure each id is unique
         unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document")
         if duplicate_messages:
-            logging.warning(f"duplicate_messages:{duplicate_messages}")
-        else:
-            doc_ids = unique_doc_ids
+            logging.warning(f"duplicate_messages:{duplicate_messages}")
+            return get_error_data_result(message="; ".join(duplicate_messages), code=RetCode.ARGUMENT_ERROR)
+        doc_ids = unique_doc_ids
+
+        dataset_doc_ids = set(KnowledgebaseService.list_documents_by_ids([dataset_id]))
+        missing_doc_ids = [doc_id for doc_id in doc_ids if doc_id not in dataset_doc_ids]
+        if missing_doc_ids:
+            return get_error_data_result(message=f"Document not found: {missing_doc_ids[0]}")
 
         # Delete documents using existing FileService.delete_docs
         errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 721-740: reject
duplicate IDs and verify ownership before calling FileService.delete_docs: if
check_duplicate_ids(doc_ids, "document") returns any duplicates, return an error
instead of only logging; then preflight all doc_ids by fetching their records
(e.g., via DocumentService.query or a DocumentService.get_by_ids helper) and
ensure each returned document belongs to the requested dataset_id (and tenant_id
if applicable); if any requested id is missing or belongs to a different
dataset/tenant, return an error listing offending ids; only after duplicates are
absent and all ids are verified to belong to dataset_id call
FileService.delete_docs(doc_ids, tenant_id).


if errors:
return get_error_data_result(message=str(errors))

return get_result(data={"deleted": len(doc_ids)})
Comment on lines +746 to +751
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Partial-failure reporting loses successful deletions.

FileService.delete_docs iterates IDs and concatenates exception messages into a single string without stopping, so on partial failure it has already deleted some docs. Here any non-empty errors causes a generic error response and len(doc_ids) is never returned, so the caller cannot tell how many documents were actually removed. Consider (a) having delete_docs return (deleted_ids, errors) or (b) at minimum, including the attempted/failed counts in the error payload so clients can reconcile state.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 746 - 751,
FileService.delete_docs currently swallows partial successes by concatenating
exception messages and returning a non-empty errors string so the API never
reports how many docs were actually deleted; change FileService.delete_docs to
return structured results (e.g., (deleted_ids, errors) or (deleted_count,
errors_list)) and update the callsite in document_api.py (where
thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id) is invoked) to
unpack that tuple, return get_result with the deleted count when any docs were
removed, and when returning get_error_data_result include both the failed error
details and the attempted/failed counts so clients can reconcile partial
failures instead of only receiving a generic error.

except Exception as e:
logging.exception(e)
return get_error_data_result(message="Internal server error")
119 changes: 2 additions & 117 deletions api/apps/sdk/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,19 @@
from pydantic import BaseModel, Field, validator
from quart import request, send_file

from api.db.db_models import APIToken, Document, File, Task
from api.db.db_models import APIToken, Document, Task
from api.db.joint_services.tenant_model_service import get_model_config_by_id, get_model_config_by_type_and_name, get_tenant_default_model_by_type
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required
from api.utils.image_utils import store_chunk_image
from common import settings
from common.constants import FileSource, LLMType, ParserType, RetCode, TaskStatus
from common.constants import LLMType, ParserType, RetCode, TaskStatus
from common.metadata_utils import convert_conditions, meta_filter
from common.misc_utils import thread_pool_exec
from common.string_utils import is_content_empty, remove_redundant_spaces
Expand Down Expand Up @@ -209,120 +208,6 @@ async def metadata_batch_update(dataset_id, tenant_id):
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})


@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
@token_required
async def delete(tenant_id, dataset_id):
"""
Delete documents from a dataset.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: path
name: dataset_id
type: string
required: true
description: ID of the dataset.
- in: body
name: body
description: Document deletion parameters.
required: true
schema:
type: object
properties:
ids:
type: array
items:
type: string
description: |
List of document IDs to delete.
If omitted, `null`, or an empty array is provided, no documents will be deleted.
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
responses:
200:
description: Documents deleted successfully.
schema:
type: object
"""
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
req = await get_request_json()
if not req:
return get_result()

doc_ids = req.get("ids")
if not doc_ids:
if req.get("delete_all") is True:
doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]
if not doc_ids:
return get_result()
else:
return get_result()

doc_list = doc_ids

unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_list, "document")
doc_list = unique_doc_ids

root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, tenant_id)
errors = ""
not_found = []
success_count = 0
for doc_id in doc_list:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
not_found.append(doc_id)
continue
tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id:
return get_error_data_result(message="Tenant not found!")

b, n = File2DocumentService.get_storage_address(doc_id=doc_id)

if not DocumentService.remove_document(doc, tenant_id):
return get_error_data_result(message="Database error (Document removal)!")

f2d = File2DocumentService.get_by_document_id(doc_id)
FileService.filter_delete(
[
File.source_type == FileSource.KNOWLEDGEBASE,
File.id == f2d[0].file_id,
]
)
File2DocumentService.delete_by_document_id(doc_id)

settings.STORAGE_IMPL.rm(b, n)
success_count += 1
except Exception as e:
errors += str(e)

if not_found:
return get_result(message=f"Documents not found: {not_found}", code=RetCode.DATA_ERROR)

if errors:
return get_result(message=errors, code=RetCode.SERVER_ERROR)

if duplicate_messages:
if success_count > 0:
return get_result(
message=f"Partially deleted {success_count} datasets with {len(duplicate_messages)} errors",
data={"success_count": success_count, "errors": duplicate_messages},
)
else:
return get_error_data_result(message=";".join(duplicate_messages))

return get_result()


DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"

Expand Down
3 changes: 3 additions & 0 deletions api/utils/validation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,9 @@ def validate_ids(cls, v_list: list[str] | None) -> list[str] | None:
class DeleteDatasetReq(DeleteReq): ...


class DeleteDocumentReq(DeleteReq): ...


class BaseListReq(BaseModel):
model_config = ConfigDict(extra="forbid")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ class TestAuthorization:
@pytest.mark.parametrize(
"invalid_auth, expected_code, expected_message",
[
(None, 0, "`Authorization` can't be empty"),
(None, 401, "<Unauthorized '401: Unauthorized'>"),
(
RAGFlowHttpApiAuth(INVALID_API_TOKEN),
109,
"Authentication error: API key is invalid!",
401,
"<Unauthorized '401: Unauthorized'>",
),
],
)
Expand All @@ -45,19 +45,19 @@ class TestDocumentsDeletion:
@pytest.mark.parametrize(
"payload, expected_code, expected_message, remaining",
[
(None, 0, "", 3),
({"ids": []}, 0, "", 3),
({"ids": ["invalid_id"]}, 102, "Documents not found: ['invalid_id']", 3),
({}, 102, "should either provide doc ids or set delete_all(true), dataset", 3),
({"ids": []}, 102, "should either provide doc ids or set delete_all(true), dataset", 3),
({"ids": ["invalid_id"]}, 101, "Field: <ids> - Message: <Invalid UUID1 format> - Value: <['invalid_id']>", 3),
(
{"ids": ["\n!?。;!?\"'"]},
102,
"""Documents not found: [\'\\n!?。;!?"\\\'\']""",
101,
"Field: <ids> - Message: <Invalid UUID1 format> - Value:",
3,
),
(
"not json",
100,
"AttributeError(\"'str' object has no attribute 'get'\")",
101,
"Invalid request payload: expected object, got str",
3,
),
(lambda r: {"ids": r[:1]}, 0, "", 2),
Expand All @@ -79,7 +79,7 @@ def test_basic_scenarios(
res = delete_documents(HttpApiAuth, dataset_id, payload)
assert res["code"] == expected_code
if res["code"] != 0:
assert res["message"] == expected_message
assert expected_message in res["message"]

res = list_documents(HttpApiAuth, dataset_id)
assert len(res["data"]["docs"]) == remaining
Expand Down Expand Up @@ -117,12 +117,12 @@ def test_delete_partial_invalid_id(self, HttpApiAuth, add_documents_func, payloa
if callable(payload):
payload = payload(document_ids)
res = delete_documents(HttpApiAuth, dataset_id, payload)
assert res["code"] == 102
assert res["message"] == "Documents not found: ['invalid_id']"
assert res["code"] == 101
assert "Field: <ids> - Message: <Invalid UUID1 format> - Value" in res["message"]

res = list_documents(HttpApiAuth, dataset_id)
assert len(res["data"]["docs"]) == 0
assert res["data"]["total"] == 0
assert len(res["data"]["docs"]) == 3
assert res["data"]["total"] == 3

@pytest.mark.p2
def test_repeated_deletion(self, HttpApiAuth, add_documents_func):
Expand All @@ -132,19 +132,18 @@ def test_repeated_deletion(self, HttpApiAuth, add_documents_func):

res = delete_documents(HttpApiAuth, dataset_id, {"ids": document_ids})
assert res["code"] == 102
assert "Documents not found" in res["message"]
assert "Document not found" in res["message"]

@pytest.mark.p2
def test_duplicate_deletion(self, HttpApiAuth, add_documents_func):
dataset_id, document_ids = add_documents_func
res = delete_documents(HttpApiAuth, dataset_id, {"ids": document_ids + document_ids})
assert res["code"] == 0
assert "Duplicate document ids" in res["data"]["errors"][0]
assert res["data"]["success_count"] == 3
assert res["code"] == 101, res
assert "Field: <ids> - Message: <Duplicate ids:" in res["message"]

res = list_documents(HttpApiAuth, dataset_id)
assert len(res["data"]["docs"]) == 0
assert res["data"]["total"] == 0
assert len(res["data"]["docs"]) == 3
assert res["data"]["total"] == 3


@pytest.mark.p3
Expand Down
Loading
Loading