-
Notifications
You must be signed in to change notification settings - Fork 8.9k
Refactor: Doc batch change status #14337
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
679ef4d
e3b95fa
384c6a3
0b24737
a575993
a6c2283
f75c909
00d70f9
e9ffa1d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -264,15 +264,15 @@ async def upload_document(dataset_id, tenant_id): | |
| """ | ||
| from api.constants import FILE_NAME_LEN_LIMIT | ||
| from api.db.services.file_service import FileService | ||
|
|
||
| form = await request.form | ||
| files = await request.files | ||
|
|
||
| # Validation | ||
| if "file" not in files: | ||
| logging.error("No file part!") | ||
| return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR) | ||
|
|
||
| file_objs = files.getlist("file") | ||
| for file_obj in file_objs: | ||
| if file_obj is None or file_obj.filename is None or file_obj.filename == "": | ||
|
|
@@ -288,7 +288,7 @@ async def upload_document(dataset_id, tenant_id): | |
| if not e: | ||
| logging.error(f"Can't find the dataset with ID {dataset_id}!") | ||
| return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR) | ||
|
|
||
| # Permission Check | ||
| if not check_kb_team_permission(kb, tenant_id): | ||
| logging.error("No authorization.") | ||
|
|
@@ -308,7 +308,7 @@ async def upload_document(dataset_id, tenant_id): | |
| msg = "There seems to be an issue with your file format. please verify it is correct and not corrupted." | ||
| logging.error(msg) | ||
| return get_error_data_result(message=msg, code=RetCode.DATA_ERROR) | ||
|
|
||
| files = [f[0] for f in files] # remove the blob | ||
|
|
||
| # Check if we should return raw files without document key mapping | ||
|
|
@@ -580,7 +580,7 @@ def _parse_doc_id_filter_with_metadata(req, kb_id): | |
| - The metadata_condition uses operators like: =, !=, >, <, >=, <=, contains, not contains, | ||
| in, not in, start with, end with, empty, not empty. | ||
| - The metadata parameter performs exact matching where values are OR'd within the same key | ||
| & AND'd across different keys. | ||
| and AND'd across different keys. | ||
|
|
||
| Examples: | ||
| Simple metadata filter (exact match): | ||
|
|
@@ -758,8 +758,6 @@ async def delete_documents(tenant_id, dataset_id): | |
| except Exception as e: | ||
| logging.exception(e) | ||
| return get_error_data_result(message="Internal server error") | ||
|
|
||
|
|
||
| def _aggregate_filters(docs): | ||
| """Aggregate filter options from a list of documents. | ||
|
|
||
|
|
@@ -1019,3 +1017,122 @@ async def update_metadata(tenant_id, dataset_id): | |
| target_doc_ids = list(target_doc_ids) | ||
| updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes) | ||
| return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)}) | ||
|
|
||
|
|
||
| @manager.route("/datasets/<dataset_id>/documents/batch-update-status", methods=["POST"]) # noqa: F821 | ||
| @login_required | ||
| @add_tenant_id_to_kwargs | ||
| async def batch_update_document_status(tenant_id, dataset_id): | ||
| """ | ||
| Batch update status of documents within a dataset. | ||
| --- | ||
| tags: | ||
| - Documents | ||
| security: | ||
| - ApiKeyAuth: [] | ||
| parameters: | ||
| - in: path | ||
| name: dataset_id | ||
| type: string | ||
| required: true | ||
| description: ID of the dataset. | ||
| - in: header | ||
| name: Authorization | ||
| type: string | ||
| required: true | ||
| description: Bearer token for authentication. | ||
| - in: body | ||
| name: body | ||
| description: Document status update parameters. | ||
| required: true | ||
| schema: | ||
| type: object | ||
| required: | ||
| - doc_ids | ||
| - status | ||
| properties: | ||
| doc_ids: | ||
| type: array | ||
| items: | ||
| type: string | ||
| description: List of document IDs to update. | ||
| status: | ||
| type: string | ||
| enum: ["0", "1"] | ||
| description: New status (0 = disabled, 1 = enabled). | ||
| responses: | ||
| 200: | ||
| description: Document statuses updated successfully. | ||
| """ | ||
| from common import settings | ||
| from rag.nlp import search | ||
|
|
||
| req = await get_request_json() | ||
| doc_ids = req.get("doc_ids", []) | ||
| status = str(req.get("status", -1)) | ||
|
|
||
| if status not in ["0", "1"]: | ||
| return get_error_argument_result(message=f'"Status" must be either 0 or 1:{status}!') | ||
|
|
||
| # Verify dataset ownership | ||
| if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): | ||
| return get_error_data_result(message="You don't own the dataset.") | ||
|
Comment on lines
+1077
to
+1079
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Verify which access check is used across dataset-scoped routes in the restful_apis module
rg -nP -C2 'KnowledgebaseService\.(accessible|query)\(' api/apps/restful_apis/Repository: infiniflow/ragflow Length of output: 6975 🏁 Script executed: sed -n '1050,1090p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 1614 🏁 Script executed: sed -n '1030,1055p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 871 🏁 Script executed: sed -n '1010,1035p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 1235 🏁 Script executed: sed -n '400,450p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 2564 🏁 Script executed: sed -n '850,900p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 2175 🏁 Script executed: sed -n '900,950p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 1838 🏁 Script executed: sed -n '948,980p' api/apps/restful_apis/document_api.py | cat -nRepository: infiniflow/ragflow Length of output: 1649 Align access check with similar batch operations for consistency. The endpoints 🤖 Prompt for AI Agents |
||
|
|
||
| e, kb = KnowledgebaseService.get_by_id(dataset_id) | ||
| if not e: | ||
| return get_error_data_result(message="Can't find this dataset!") | ||
|
|
||
| result = {} | ||
| has_error = False | ||
| for doc_id in doc_ids: | ||
| try: | ||
| e, doc = DocumentService.get_by_id(doc_id) | ||
| if not e: | ||
| result[doc_id] = {"error": "Document not found"} | ||
| has_error = True | ||
| continue | ||
|
|
||
| if doc.kb_id != dataset_id: | ||
| logging.warning(f"Document {doc.kb_id} not in dataset {dataset_id}") | ||
| result[doc_id] = {"error": "Document not found in this dataset."} | ||
| has_error = True | ||
| continue | ||
|
Comment on lines
+1096
to
+1099
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fix misleading log message. The warning logs `doc.kb_id` where the document's own ID belongs, so the message misidentifies which document was skipped. 🪵 Proposed fix- if doc.kb_id != dataset_id:
- logging.warning(f"Document {doc.kb_id} not in dataset {dataset_id}")
+ if doc.kb_id != dataset_id:
+ logging.warning(f"Document {doc_id} (kb_id={doc.kb_id}) not in dataset {dataset_id}")
result[doc_id] = {"error": "Document not found in this dataset."}
has_error = True
continueAs per coding guidelines, 🤖 Prompt for AI Agents |
||
|
|
||
| current_status = str(doc.status) | ||
| if current_status == status: | ||
| result[doc_id] = {"status": status} | ||
| continue | ||
| if not DocumentService.update_by_id(doc_id, {"status": str(status)}): | ||
| result[doc_id] = {"error": "Database error (Document update)!"} | ||
| has_error = True | ||
| continue | ||
|
|
||
| status_int = int(status) | ||
| if getattr(doc, "chunk_num", 0) > 0: | ||
| try: | ||
| ok = settings.docStoreConn.update( | ||
| {"doc_id": doc_id}, | ||
| {"available_int": status_int}, | ||
| search.index_name(kb.tenant_id), | ||
| doc.kb_id, | ||
| ) | ||
| except Exception as exc: | ||
| msg = str(exc) | ||
| if "3022" in msg: | ||
| result[doc_id] = {"error": "Document store table missing."} | ||
| else: | ||
| result[doc_id] = {"error": f"Document store update failed: {msg}"} | ||
| has_error = True | ||
| continue | ||
| if not ok: | ||
| result[doc_id] = {"error": "Database error (docStore update)!"} | ||
| has_error = True | ||
| continue | ||
|
Comment on lines
+1105
to
+1130
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid committing DB status before doc store update succeeds. For chunked docs, Line 1107 updates the DB first. If 🔁 Proposed rollback guard- if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
+ db_updated = False
+ if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
result[doc_id] = {"error": "Database error (Document update)!"}
has_error = True
continue
+ db_updated = True
status_int = int(status)
if getattr(doc, "chunk_num", 0) > 0:
try:
ok = settings.docStoreConn.update(
@@
except Exception as exc:
+ if db_updated:
+ DocumentService.update_by_id(doc_id, {"status": current_status})
msg = str(exc)
if "3022" in msg:
result[doc_id] = {"error": "Document store table missing."}
else:
result[doc_id] = {"error": f"Document store update failed: {msg}"}
@@
if not ok:
+ if db_updated:
+ DocumentService.update_by_id(doc_id, {"status": current_status})
result[doc_id] = {"error": "Database error (docStore update)!"}
has_error = True
continue🤖 Prompt for AI Agents |
||
| result[doc_id] = {"status": status} | ||
| except Exception as e: | ||
| result[doc_id] = {"error": f"Internal server error: {str(e)}"} | ||
| has_error = True | ||
|
Comment on lines
+1132
to
+1134
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Log unexpected per-document failures. The broad handler returns an internal-error result but drops the traceback, making this new endpoint hard to diagnose in production. 🪵 Proposed logging except Exception as e:
+ logging.exception("Failed to batch update document status for document %s", doc_id)
result[doc_id] = {"error": f"Internal server error: {str(e)}"}
has_error = TrueAs per coding guidelines, 🤖 Prompt for AI Agents |
||
|
|
||
| if has_error: | ||
| return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR) | ||
| return get_json_result(data=result) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Validate
`doc_ids` before iterating. `doc_ids` defaults to `[]` and is not checked to be a list of non-empty string IDs. A malformed JSON body like `"doc_ids": "abc"` would iterate over its characters, while a missing field returns success with no work done. 🛡️ Proposed validation
🤖 Prompt for AI Agents