-
Notifications
You must be signed in to change notification settings - Fork 8.9k
Refactor: Consolidation WEB API & HTTP API for document delete api #14254
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,14 +27,17 @@ | |
| from api.db import VALID_FILE_TYPES | ||
| from api.db.services.doc_metadata_service import DocMetadataService | ||
| from api.db.services.document_service import DocumentService | ||
| from api.db.services.file_service import FileService | ||
| from api.db.services.knowledgebase_service import KnowledgebaseService | ||
| from api.common.check_team_permission import check_kb_team_permission | ||
| from api.utils.api_utils import get_data_error_result, get_error_data_result, get_result, get_json_result, \ | ||
| server_error_response, add_tenant_id_to_kwargs, get_request_json | ||
| server_error_response, add_tenant_id_to_kwargs, get_request_json, get_error_argument_result, check_duplicate_ids | ||
| from api.utils.validation_utils import ( | ||
| UpdateDocumentReq, format_validation_error_message, | ||
| UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq, | ||
| ) | ||
| from common.constants import RetCode | ||
| from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema | ||
| from common.misc_utils import thread_pool_exec | ||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821 | ||
| @login_required | ||
|
|
@@ -260,9 +263,7 @@ async def upload_document(dataset_id, tenant_id): | |
| description: Processing status. | ||
| """ | ||
| from api.constants import FILE_NAME_LEN_LIMIT | ||
| from api.common.check_team_permission import check_kb_team_permission | ||
| from api.db.services.file_service import FileService | ||
| from common.misc_utils import thread_pool_exec | ||
|
|
||
| form = await request.form | ||
| files = await request.files | ||
|
|
@@ -660,3 +661,88 @@ def _parse_doc_id_filter_with_metadata(req, kb_id): | |
| return RetCode.SUCCESS, "", [], return_empty_metadata | ||
|
|
||
| return RetCode.SUCCESS, "", list(doc_ids_filter) if doc_ids_filter is not None else [], return_empty_metadata | ||
|
|
||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821 | ||
| @login_required | ||
| @add_tenant_id_to_kwargs | ||
| async def delete_documents(tenant_id, dataset_id): | ||
| """ | ||
| Delete documents from a dataset. | ||
| --- | ||
| tags: | ||
| - Documents | ||
| security: | ||
| - ApiKeyAuth: [] | ||
| parameters: | ||
| - in: path | ||
| name: dataset_id | ||
| type: string | ||
| required: true | ||
| description: ID of the dataset containing the documents. | ||
| - in: header | ||
| name: Authorization | ||
| type: string | ||
| required: true | ||
| description: Bearer token for authentication. | ||
| - in: body | ||
| name: body | ||
| description: Document deletion parameters. | ||
| required: true | ||
| schema: | ||
| type: object | ||
| properties: | ||
| ids: | ||
| type: array or null | ||
| items: | ||
| type: string | ||
| description: | | ||
| Specifies the documents to delete: | ||
| - An array of IDs, only the specified documents will be deleted. | ||
| delete_all: | ||
| type: boolean | ||
| default: false | ||
| description: Whether to delete all documents in the dataset. | ||
| responses: | ||
| 200: | ||
| description: Successful operation. | ||
| schema: | ||
| type: object | ||
| """ | ||
| req, err = await validate_and_parse_json_request(request, DeleteDocumentReq) | ||
| if err is not None or req is None: | ||
| return get_error_argument_result(err) | ||
|
|
||
| try: | ||
| # Validate dataset exists and user has permission | ||
| if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): | ||
| return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") | ||
|
|
||
| # Get documents to delete | ||
| doc_ids = req.get("ids") or [] | ||
| delete_all = req.get("delete_all", False) | ||
| if not delete_all and len(doc_ids) == 0: | ||
| return get_error_data_result(message=f"should either provide doc ids or set delete_all(true), dataset: {dataset_id}. ") | ||
|
|
||
| if len(doc_ids) > 0 and delete_all: | ||
| return get_error_data_result(message=f"should not provide both doc ids and delete_all(true), dataset: {dataset_id}. ") | ||
| if delete_all: | ||
| doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)] | ||
|
|
||
| # make sure each id is unique | ||
| unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document") | ||
| if duplicate_messages: | ||
| logging.warning(f"duplicate_messages:{duplicate_messages}") | ||
| else: | ||
| doc_ids = unique_doc_ids | ||
|
|
||
| # Delete documents using existing FileService.delete_docs | ||
| errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id) | ||
|
|
||
| if errors: | ||
| return get_error_data_result(message=str(errors)) | ||
|
|
||
| return get_result(data={"deleted": len(doc_ids)}) | ||
|
Comment on lines
+746
to
+751
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Partial-failure reporting loses successful deletions.
🤖 Prompt for AI Agents |
||
| except Exception as e: | ||
| logging.exception(e) | ||
| return get_error_data_result(message="Internal server error") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Preflight document IDs before deleting.
This route checks access to `dataset_id`, but then passes arbitrary `doc_ids` to `FileService.delete_docs`, which deletes by document ID and resolves the document tenant internally. That allows a request scoped to one dataset to delete a document from another dataset if its ID is supplied. Also, duplicate IDs are only logged, so duplicate requests can partially delete and then fail.

Reject duplicates and verify every requested ID belongs to `dataset_id` before calling `FileService.delete_docs`.

🛡️ Proposed fix
```diff
         if delete_all:
             doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]

         # make sure each id is unique
         unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document")
         if duplicate_messages:
-            logging.warning(f"duplicate_messages:{duplicate_messages}")
-        else:
-            doc_ids = unique_doc_ids
+            logging.warning(f"duplicate_messages:{duplicate_messages}")
+            return get_error_data_result(message="; ".join(duplicate_messages), code=RetCode.ARGUMENT_ERROR)
+        doc_ids = unique_doc_ids
+
+        dataset_doc_ids = set(KnowledgebaseService.list_documents_by_ids([dataset_id]))
+        missing_doc_ids = [doc_id for doc_id in doc_ids if doc_id not in dataset_doc_ids]
+        if missing_doc_ids:
+            return get_error_data_result(message=f"Document not found: {missing_doc_ids[0]}")

         # Delete documents using existing FileService.delete_docs
         errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id)
```
🤖 Prompt for AI Agents