Skip to content

Commit 3d8a82c

Browse files
authored
Refactor: Consolidation WEB API & HTTP API for document delete api (infiniflow#14254)
### What problem does this PR solve? Before consolidation Web API: POST /v1/document/rm Http API - DELETE /api/v1/datasets/<dataset_id>/documents After consolidation, Restful API -- DELETE /api/v1/datasets/<dataset_id>/documents ### Type of change - [x] Refactoring
1 parent 6baf74a commit 3d8a82c

15 files changed

Lines changed: 178 additions & 278 deletions

File tree

api/apps/document_app.py

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -319,27 +319,6 @@ async def change_status():
319319
return get_json_result(data=result)
320320

321321

322-
@manager.route("/rm", methods=["POST"]) # noqa: F821
323-
@login_required
324-
@validate_request("doc_id")
325-
async def rm():
326-
req = await get_request_json()
327-
doc_ids = req["doc_id"]
328-
if isinstance(doc_ids, str):
329-
doc_ids = [doc_ids]
330-
331-
for doc_id in doc_ids:
332-
if not DocumentService.accessible4deletion(doc_id, current_user.id):
333-
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
334-
335-
errors = await thread_pool_exec(FileService.delete_docs, doc_ids, current_user.id)
336-
337-
if errors:
338-
return get_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
339-
340-
return get_json_result(data=True)
341-
342-
343322
@manager.route("/run", methods=["POST"]) # noqa: F821
344323
@login_required
345324
@validate_request("doc_ids", "run")

api/apps/restful_apis/document_api.py

Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,17 @@
2727
from api.db import VALID_FILE_TYPES
2828
from api.db.services.doc_metadata_service import DocMetadataService
2929
from api.db.services.document_service import DocumentService
30+
from api.db.services.file_service import FileService
3031
from api.db.services.knowledgebase_service import KnowledgebaseService
32+
from api.common.check_team_permission import check_kb_team_permission
3133
from api.utils.api_utils import get_data_error_result, get_error_data_result, get_result, get_json_result, \
32-
server_error_response, add_tenant_id_to_kwargs, get_request_json
34+
server_error_response, add_tenant_id_to_kwargs, get_request_json, get_error_argument_result, check_duplicate_ids
3335
from api.utils.validation_utils import (
34-
UpdateDocumentReq, format_validation_error_message,
36+
UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq,
3537
)
3638
from common.constants import RetCode
3739
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
40+
from common.misc_utils import thread_pool_exec
3841

3942
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821
4043
@login_required
@@ -260,9 +263,7 @@ async def upload_document(dataset_id, tenant_id):
260263
description: Processing status.
261264
"""
262265
from api.constants import FILE_NAME_LEN_LIMIT
263-
from api.common.check_team_permission import check_kb_team_permission
264266
from api.db.services.file_service import FileService
265-
from common.misc_utils import thread_pool_exec
266267

267268
form = await request.form
268269
files = await request.files
@@ -674,6 +675,89 @@ def _parse_doc_id_filter_with_metadata(req, kb_id):
674675
return RetCode.SUCCESS, "", list(doc_ids_filter) if doc_ids_filter is not None else [], return_empty_metadata
675676

676677

678+
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
679+
@login_required
680+
@add_tenant_id_to_kwargs
681+
async def delete_documents(tenant_id, dataset_id):
682+
"""
683+
Delete documents from a dataset.
684+
---
685+
tags:
686+
- Documents
687+
security:
688+
- ApiKeyAuth: []
689+
parameters:
690+
- in: path
691+
name: dataset_id
692+
type: string
693+
required: true
694+
description: ID of the dataset containing the documents.
695+
- in: header
696+
name: Authorization
697+
type: string
698+
required: true
699+
description: Bearer token for authentication.
700+
- in: body
701+
name: body
702+
description: Document deletion parameters.
703+
required: true
704+
schema:
705+
type: object
706+
properties:
707+
ids:
708+
type: array or null
709+
items:
710+
type: string
711+
description: |
712+
Specifies the documents to delete:
713+
- An array of IDs, only the specified documents will be deleted.
714+
delete_all:
715+
type: boolean
716+
default: false
717+
description: Whether to delete all documents in the dataset.
718+
responses:
719+
200:
720+
description: Successful operation.
721+
schema:
722+
type: object
723+
"""
724+
req, err = await validate_and_parse_json_request(request, DeleteDocumentReq)
725+
if err is not None or req is None:
726+
return get_error_argument_result(err)
727+
728+
try:
729+
# Validate dataset exists and user has permission
730+
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
731+
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
732+
733+
# Get documents to delete
734+
doc_ids = req.get("ids") or []
735+
delete_all = req.get("delete_all", False)
736+
if not delete_all and len(doc_ids) == 0:
737+
return get_error_data_result(message=f"should either provide doc ids or set delete_all(true), dataset: {dataset_id}. ")
738+
739+
if len(doc_ids) > 0 and delete_all:
740+
return get_error_data_result(message=f"should not provide both doc ids and delete_all(true), dataset: {dataset_id}. ")
741+
if delete_all:
742+
doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]
743+
744+
# make sure each id is unique
745+
unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document")
746+
if duplicate_messages:
747+
logging.warning(f"duplicate_messages:{duplicate_messages}")
748+
else:
749+
doc_ids = unique_doc_ids
750+
751+
# Delete documents using existing FileService.delete_docs
752+
errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id)
753+
754+
if errors:
755+
return get_error_data_result(message=str(errors))
756+
757+
return get_result(data={"deleted": len(doc_ids)})
758+
except Exception as e:
759+
logging.exception(e)
760+
return get_error_data_result(message="Internal server error")
677761
def _aggregate_filters(docs):
678762
"""Aggregate filter options from a list of documents.
679763

api/apps/sdk/doc.py

Lines changed: 2 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,19 @@
2121
from pydantic import BaseModel, Field, validator
2222
from quart import request, send_file
2323

24-
from api.db.db_models import APIToken, Document, File, Task
24+
from api.db.db_models import APIToken, Document, Task
2525
from api.db.joint_services.tenant_model_service import get_model_config_by_id, get_model_config_by_type_and_name, get_tenant_default_model_by_type
2626
from api.db.services.doc_metadata_service import DocMetadataService
2727
from api.db.services.document_service import DocumentService
2828
from api.db.services.file2document_service import File2DocumentService
29-
from api.db.services.file_service import FileService
3029
from api.db.services.knowledgebase_service import KnowledgebaseService
3130
from api.db.services.llm_service import LLMBundle
3231
from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
3332
from api.db.services.tenant_llm_service import TenantLLMService
3433
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required
3534
from api.utils.image_utils import store_chunk_image
3635
from common import settings
37-
from common.constants import FileSource, LLMType, ParserType, RetCode, TaskStatus
36+
from common.constants import LLMType, ParserType, RetCode, TaskStatus
3837
from common.metadata_utils import convert_conditions, meta_filter
3938
from common.misc_utils import thread_pool_exec
4039
from common.string_utils import is_content_empty, remove_redundant_spaces
@@ -209,120 +208,6 @@ async def metadata_batch_update(dataset_id, tenant_id):
209208
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})
210209

211210

212-
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
213-
@token_required
214-
async def delete(tenant_id, dataset_id):
215-
"""
216-
Delete documents from a dataset.
217-
---
218-
tags:
219-
- Documents
220-
security:
221-
- ApiKeyAuth: []
222-
parameters:
223-
- in: path
224-
name: dataset_id
225-
type: string
226-
required: true
227-
description: ID of the dataset.
228-
- in: body
229-
name: body
230-
description: Document deletion parameters.
231-
required: true
232-
schema:
233-
type: object
234-
properties:
235-
ids:
236-
type: array
237-
items:
238-
type: string
239-
description: |
240-
List of document IDs to delete.
241-
If omitted, `null`, or an empty array is provided, no documents will be deleted.
242-
- in: header
243-
name: Authorization
244-
type: string
245-
required: true
246-
description: Bearer token for authentication.
247-
responses:
248-
200:
249-
description: Documents deleted successfully.
250-
schema:
251-
type: object
252-
"""
253-
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
254-
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
255-
req = await get_request_json()
256-
if not req:
257-
return get_result()
258-
259-
doc_ids = req.get("ids")
260-
if not doc_ids:
261-
if req.get("delete_all") is True:
262-
doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)]
263-
if not doc_ids:
264-
return get_result()
265-
else:
266-
return get_result()
267-
268-
doc_list = doc_ids
269-
270-
unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_list, "document")
271-
doc_list = unique_doc_ids
272-
273-
root_folder = FileService.get_root_folder(tenant_id)
274-
pf_id = root_folder["id"]
275-
FileService.init_knowledgebase_docs(pf_id, tenant_id)
276-
errors = ""
277-
not_found = []
278-
success_count = 0
279-
for doc_id in doc_list:
280-
try:
281-
e, doc = DocumentService.get_by_id(doc_id)
282-
if not e:
283-
not_found.append(doc_id)
284-
continue
285-
tenant_id = DocumentService.get_tenant_id(doc_id)
286-
if not tenant_id:
287-
return get_error_data_result(message="Tenant not found!")
288-
289-
b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
290-
291-
if not DocumentService.remove_document(doc, tenant_id):
292-
return get_error_data_result(message="Database error (Document removal)!")
293-
294-
f2d = File2DocumentService.get_by_document_id(doc_id)
295-
FileService.filter_delete(
296-
[
297-
File.source_type == FileSource.KNOWLEDGEBASE,
298-
File.id == f2d[0].file_id,
299-
]
300-
)
301-
File2DocumentService.delete_by_document_id(doc_id)
302-
303-
settings.STORAGE_IMPL.rm(b, n)
304-
success_count += 1
305-
except Exception as e:
306-
errors += str(e)
307-
308-
if not_found:
309-
return get_result(message=f"Documents not found: {not_found}", code=RetCode.DATA_ERROR)
310-
311-
if errors:
312-
return get_result(message=errors, code=RetCode.SERVER_ERROR)
313-
314-
if duplicate_messages:
315-
if success_count > 0:
316-
return get_result(
317-
message=f"Partially deleted {success_count} datasets with {len(duplicate_messages)} errors",
318-
data={"success_count": success_count, "errors": duplicate_messages},
319-
)
320-
else:
321-
return get_error_data_result(message=";".join(duplicate_messages))
322-
323-
return get_result()
324-
325-
326211
DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
327212
DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"
328213

api/utils/validation_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,9 @@ def validate_ids(cls, v_list: list[str] | None) -> list[str] | None:
818818
class DeleteDatasetReq(DeleteReq): ...
819819

820820

821+
class DeleteDocumentReq(DeleteReq): ...
822+
823+
821824
class BaseListReq(BaseModel):
822825
model_config = ConfigDict(extra="forbid")
823826

test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ class TestAuthorization:
2626
@pytest.mark.parametrize(
2727
"invalid_auth, expected_code, expected_message",
2828
[
29-
(None, 0, "`Authorization` can't be empty"),
29+
(None, 401, "<Unauthorized '401: Unauthorized'>"),
3030
(
3131
RAGFlowHttpApiAuth(INVALID_API_TOKEN),
32-
109,
33-
"Authentication error: API key is invalid!",
32+
401,
33+
"<Unauthorized '401: Unauthorized'>",
3434
),
3535
],
3636
)
@@ -45,19 +45,19 @@ class TestDocumentsDeletion:
4545
@pytest.mark.parametrize(
4646
"payload, expected_code, expected_message, remaining",
4747
[
48-
(None, 0, "", 3),
49-
({"ids": []}, 0, "", 3),
50-
({"ids": ["invalid_id"]}, 102, "Documents not found: ['invalid_id']", 3),
48+
({}, 102, "should either provide doc ids or set delete_all(true), dataset", 3),
49+
({"ids": []}, 102, "should either provide doc ids or set delete_all(true), dataset", 3),
50+
({"ids": ["invalid_id"]}, 101, "Field: <ids> - Message: <Invalid UUID1 format> - Value: <['invalid_id']>", 3),
5151
(
5252
{"ids": ["\n!?。;!?\"'"]},
53-
102,
54-
"""Documents not found: [\'\\n!?。;!?"\\\'\']""",
53+
101,
54+
"Field: <ids> - Message: <Invalid UUID1 format> - Value:",
5555
3,
5656
),
5757
(
5858
"not json",
59-
100,
60-
"AttributeError(\"'str' object has no attribute 'get'\")",
59+
101,
60+
"Invalid request payload: expected object, got str",
6161
3,
6262
),
6363
(lambda r: {"ids": r[:1]}, 0, "", 2),
@@ -79,7 +79,7 @@ def test_basic_scenarios(
7979
res = delete_documents(HttpApiAuth, dataset_id, payload)
8080
assert res["code"] == expected_code
8181
if res["code"] != 0:
82-
assert res["message"] == expected_message
82+
assert expected_message in res["message"]
8383

8484
res = list_documents(HttpApiAuth, dataset_id)
8585
assert len(res["data"]["docs"]) == remaining
@@ -117,12 +117,12 @@ def test_delete_partial_invalid_id(self, HttpApiAuth, add_documents_func, payloa
117117
if callable(payload):
118118
payload = payload(document_ids)
119119
res = delete_documents(HttpApiAuth, dataset_id, payload)
120-
assert res["code"] == 102
121-
assert res["message"] == "Documents not found: ['invalid_id']"
120+
assert res["code"] == 101
121+
assert "Field: <ids> - Message: <Invalid UUID1 format> - Value" in res["message"]
122122

123123
res = list_documents(HttpApiAuth, dataset_id)
124-
assert len(res["data"]["docs"]) == 0
125-
assert res["data"]["total"] == 0
124+
assert len(res["data"]["docs"]) == 3
125+
assert res["data"]["total"] == 3
126126

127127
@pytest.mark.p2
128128
def test_repeated_deletion(self, HttpApiAuth, add_documents_func):
@@ -132,19 +132,18 @@ def test_repeated_deletion(self, HttpApiAuth, add_documents_func):
132132

133133
res = delete_documents(HttpApiAuth, dataset_id, {"ids": document_ids})
134134
assert res["code"] == 102
135-
assert "Documents not found" in res["message"]
135+
assert "Document not found" in res["message"]
136136

137137
@pytest.mark.p2
138138
def test_duplicate_deletion(self, HttpApiAuth, add_documents_func):
139139
dataset_id, document_ids = add_documents_func
140140
res = delete_documents(HttpApiAuth, dataset_id, {"ids": document_ids + document_ids})
141-
assert res["code"] == 0
142-
assert "Duplicate document ids" in res["data"]["errors"][0]
143-
assert res["data"]["success_count"] == 3
141+
assert res["code"] == 101, res
142+
assert "Field: <ids> - Message: <Duplicate ids:" in res["message"]
144143

145144
res = list_documents(HttpApiAuth, dataset_id)
146-
assert len(res["data"]["docs"]) == 0
147-
assert res["data"]["total"] == 0
145+
assert len(res["data"]["docs"]) == 3
146+
assert res["data"]["total"] == 3
148147

149148

150149
@pytest.mark.p3

0 commit comments

Comments
 (0)