-
Notifications
You must be signed in to change notification settings - Fork 8.9k
WIP Migrate doc update metadata config #14273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
3dc56d6
a3a9eb6
e95be20
d5ef7e3
98df570
a8575e2
e15c1c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,14 +27,17 @@ | |
| from api.db import VALID_FILE_TYPES | ||
| from api.db.services.doc_metadata_service import DocMetadataService | ||
| from api.db.services.document_service import DocumentService | ||
| from api.db.services.file_service import FileService | ||
| from api.db.services.knowledgebase_service import KnowledgebaseService | ||
| from api.common.check_team_permission import check_kb_team_permission | ||
| from api.utils.api_utils import get_data_error_result, get_error_data_result, get_result, get_json_result, \ | ||
| server_error_response, add_tenant_id_to_kwargs, get_request_json | ||
| server_error_response, add_tenant_id_to_kwargs, get_request_json, get_error_argument_result, check_duplicate_ids | ||
| from api.utils.validation_utils import ( | ||
| UpdateDocumentReq, format_validation_error_message, | ||
| UpdateDocumentReq, format_validation_error_message, validate_and_parse_json_request, DeleteDocumentReq, | ||
| ) | ||
| from common.constants import RetCode | ||
| from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema | ||
| from common.misc_utils import thread_pool_exec | ||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821 | ||
| @login_required | ||
|
|
@@ -260,9 +263,7 @@ async def upload_document(dataset_id, tenant_id): | |
| description: Processing status. | ||
| """ | ||
| from api.constants import FILE_NAME_LEN_LIMIT | ||
| from api.common.check_team_permission import check_kb_team_permission | ||
| from api.db.services.file_service import FileService | ||
| from common.misc_utils import thread_pool_exec | ||
|
|
||
| form = await request.form | ||
| files = await request.files | ||
|
|
@@ -573,7 +574,7 @@ def _parse_doc_id_filter_with_metadata(req, kb_id): | |
| - The metadata_condition uses operators like: =, !=, >, <, >=, <=, contains, not contains, | ||
| in, not in, start with, end with, empty, not empty. | ||
| - The metadata parameter performs exact matching where values are OR'd within the same key | ||
| and AND'd across different keys. | ||
| & AND'd across different keys. | ||
|
|
||
| Examples: | ||
| Simple metadata filter (exact match): | ||
|
|
@@ -668,6 +669,90 @@ def _parse_doc_id_filter_with_metadata(req, kb_id): | |
| return RetCode.SUCCESS, "", list(doc_ids_filter) if doc_ids_filter is not None else [], return_empty_metadata | ||
|
|
||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821 | ||
| @login_required | ||
| @add_tenant_id_to_kwargs | ||
| async def delete_documents(tenant_id, dataset_id): | ||
| """ | ||
| Delete documents from a dataset. | ||
| --- | ||
| tags: | ||
| - Documents | ||
| security: | ||
| - ApiKeyAuth: [] | ||
| parameters: | ||
| - in: path | ||
| name: dataset_id | ||
| type: string | ||
| required: true | ||
| description: ID of the dataset containing the documents. | ||
| - in: header | ||
| name: Authorization | ||
| type: string | ||
| required: true | ||
| description: Bearer token for authentication. | ||
| - in: body | ||
| name: body | ||
| description: Document deletion parameters. | ||
| required: true | ||
| schema: | ||
| type: object | ||
| properties: | ||
| ids: | ||
| type: array or null | ||
| items: | ||
| type: string | ||
| description: | | ||
| Specifies the documents to delete: | ||
| - An array of IDs, only the specified documents will be deleted. | ||
| delete_all: | ||
| type: boolean | ||
| default: false | ||
| description: Whether to delete all documents in the dataset. | ||
| responses: | ||
| 200: | ||
| description: Successful operation. | ||
| schema: | ||
| type: object | ||
| """ | ||
| req, err = await validate_and_parse_json_request(request, DeleteDocumentReq) | ||
| if err is not None or req is None: | ||
| return get_error_argument_result(err) | ||
|
|
||
| try: | ||
| # Validate dataset exists and user has permission | ||
| if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): | ||
| return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") | ||
|
|
||
| # Get documents to delete | ||
| doc_ids = req.get("ids") or [] | ||
| delete_all = req.get("delete_all", False) | ||
| if not delete_all and len(doc_ids) == 0: | ||
| return get_error_data_result(message=f"should either provide doc ids or set delete_all(true), dataset: {dataset_id}. ") | ||
|
|
||
| if len(doc_ids) > 0 and delete_all: | ||
| return get_error_data_result(message=f"should not provide both doc ids and delete_all(true), dataset: {dataset_id}. ") | ||
| if delete_all: | ||
| doc_ids = [doc.id for doc in DocumentService.query(kb_id=dataset_id)] | ||
|
|
||
| # make sure each id is unique | ||
| unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document") | ||
| if duplicate_messages: | ||
| logging.warning(f"duplicate_messages:{duplicate_messages}") | ||
| else: | ||
| doc_ids = unique_doc_ids | ||
|
|
||
| # Delete documents using existing FileService.delete_docs | ||
| errors = await thread_pool_exec(FileService.delete_docs, doc_ids, tenant_id) | ||
|
|
||
| if errors: | ||
| return get_error_data_result(message=str(errors)) | ||
|
|
||
| return get_result(data={"deleted": len(doc_ids)}) | ||
| except Exception as e: | ||
| logging.exception(e) | ||
| return get_error_data_result(message="Internal server error") | ||
|
Comment on lines
+750
to
+760
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Partial-failure semantics and observability.
As per coding guidelines: "Add logging for new flows". |
||
|
|
||
| def _aggregate_filters(docs): | ||
| """Aggregate filter options from a list of documents. | ||
|
|
||
|
|
@@ -725,3 +810,71 @@ def _aggregate_filters(docs): | |
| "run_status": run_status_counter, | ||
| "metadata": metadata_counter, | ||
| } | ||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents/<document_id>/metadata/config",methods=["PUT"]) # noqa: F821 | ||
| @login_required | ||
| @add_tenant_id_to_kwargs | ||
| async def update_metadata_config(tenant_id, dataset_id, document_id): | ||
| """ | ||
| Update document metadata configuration. | ||
| --- | ||
| tags: | ||
| - Documents | ||
| security: | ||
| - ApiKeyAuth: [] | ||
| parameters: | ||
| - in: path | ||
| name: dataset_id | ||
| type: string | ||
| required: true | ||
| description: ID of the dataset. | ||
| - in: path | ||
| name: document_id | ||
| type: string | ||
| required: true | ||
| description: ID of the document. | ||
| - in: header | ||
| name: Authorization | ||
| type: string | ||
| required: true | ||
| description: Bearer token for authentication. | ||
| - in: body | ||
| name: body | ||
| description: Metadata configuration. | ||
| required: true | ||
| schema: | ||
| type: object | ||
| properties: | ||
| metadata: | ||
| type: object | ||
| description: Metadata configuration JSON. | ||
| responses: | ||
| 200: | ||
| description: Document updated successfully. | ||
| """ | ||
| # Verify ownership and existence of dataset | ||
| if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): | ||
| return get_error_data_result(message="You don't own the dataset.") | ||
|
|
||
| # Verify document exists in the dataset | ||
| doc = DocumentService.query(id=document_id, kb_id=dataset_id) | ||
| if not doc: | ||
| return get_error_data_result( | ||
| message=f"Document {document_id} not found in dataset {dataset_id}" | ||
| ) | ||
| doc = doc[0] | ||
|
|
||
| # Get request body | ||
| req = await get_request_json() | ||
| if "metadata" not in req: | ||
| return get_error_argument_result(message="metadata is required") | ||
|
Comment on lines
+862
to
+877
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Permission check inconsistency and missing null guard on request body. Two concerns in this handler:
🛠️ Suggested changes- # Verify ownership and existence of dataset
- if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
- return get_error_data_result(message="You don't own the dataset.")
+ # Verify ownership and existence of dataset
+ if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
+ return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
@@
- # Get request body
- req = await get_request_json()
- if "metadata" not in req:
- return get_error_argument_result(message="metadata is required")
+ # Get request body
+ req = await get_request_json() or {}
+ if not isinstance(req, dict) or "metadata" not in req:
+ return get_error_argument_result(message="metadata is required")#!/bin/bash
# Confirm which permission helper is used across restful_apis handlers.
rg -nP "KnowledgebaseService\.(accessible|query)\(" --type=py -g '!**/test/**'🤖 Prompt for AI Agents |
||
|
|
||
| # Update parser config with metadata | ||
| DocumentService.update_parser_config(doc.id, {"metadata": req["metadata"]}) | ||
|
|
||
| # Get updated document | ||
| e, doc = DocumentService.get_by_id(doc.id) | ||
| if not e: | ||
| return get_data_error_result(message="Document not found!") | ||
|
|
||
| return get_result(data=doc.to_dict()) | ||
|
Comment on lines
+820
to
+887
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major Add logging and error handling to the new metadata-config flow. This new handler has no | ||
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Inverted dedup logic: duplicates are kept, uniques are discarded.
When
`check_duplicate_ids` reports duplicates, the code logs a warning and proceeds with the original (duplicated) `doc_ids`; only when there are no duplicates does it assign `unique_doc_ids`. The branches are reversed — on duplicates you should be using `unique_doc_ids`, not the unfiltered list. In practice this is masked today because
`DeleteDocumentReq.validate_ids` already rejects duplicates with a 101 error before reaching this block, making this code effectively dead. Either fix the inversion or drop the redundant check. 🛠️ Proposed fix
🤖 Prompt for AI Agents