Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 0 additions & 70 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,76 +201,6 @@ def thumbnails():
return server_error_response(e)


@manager.route("/change_status", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "status")
async def change_status():
req = await get_request_json()
doc_ids = req.get("doc_ids", [])
status = str(req.get("status", ""))

if status not in ["0", "1"]:
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=RetCode.ARGUMENT_ERROR)

result = {}
has_error = False
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
result[doc_id] = {"error": "No authorization."}
has_error = True
continue

try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
result[doc_id] = {"error": "No authorization."}
has_error = True
continue
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
result[doc_id] = {"error": "Can't find this dataset!"}
has_error = True
continue
current_status = str(doc.status)
if current_status == status:
result[doc_id] = {"status": status}
continue
if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
result[doc_id] = {"error": "Database error (Document update)!"}
has_error = True
continue

status_int = int(status)
if getattr(doc, "chunk_num", 0) > 0:
try:
ok = settings.docStoreConn.update(
{"doc_id": doc_id},
{"available_int": status_int},
search.index_name(kb.tenant_id),
doc.kb_id,
)
except Exception as exc:
msg = str(exc)
if "3022" in msg:
result[doc_id] = {"error": "Document store table missing."}
else:
result[doc_id] = {"error": f"Document store update failed: {msg}"}
has_error = True
continue
if not ok:
result[doc_id] = {"error": "Database error (docStore update)!"}
has_error = True
continue
result[doc_id] = {"status": status}
except Exception as e:
result[doc_id] = {"error": f"Internal server error: {str(e)}"}
has_error = True

if has_error:
return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
return get_json_result(data=result)


@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
Expand Down
133 changes: 125 additions & 8 deletions api/apps/restful_apis/document_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,15 +264,15 @@ async def upload_document(dataset_id, tenant_id):
"""
from api.constants import FILE_NAME_LEN_LIMIT
from api.db.services.file_service import FileService

form = await request.form
files = await request.files

# Validation
if "file" not in files:
logging.error("No file part!")
return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR)

file_objs = files.getlist("file")
for file_obj in file_objs:
if file_obj is None or file_obj.filename is None or file_obj.filename == "":
Expand All @@ -288,7 +288,7 @@ async def upload_document(dataset_id, tenant_id):
if not e:
logging.error(f"Can't find the dataset with ID {dataset_id}!")
return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR)

# Permission Check
if not check_kb_team_permission(kb, tenant_id):
logging.error("No authorization.")
Expand All @@ -308,7 +308,7 @@ async def upload_document(dataset_id, tenant_id):
msg = "There seems to be an issue with your file format. please verify it is correct and not corrupted."
logging.error(msg)
return get_error_data_result(message=msg, code=RetCode.DATA_ERROR)

files = [f[0] for f in files] # remove the blob

# Check if we should return raw files without document key mapping
Expand Down Expand Up @@ -580,7 +580,7 @@ def _parse_doc_id_filter_with_metadata(req, kb_id):
- The metadata_condition uses operators like: =, !=, >, <, >=, <=, contains, not contains,
in, not in, start with, end with, empty, not empty.
- The metadata parameter performs exact matching where values are OR'd within the same key
& AND'd across different keys.
and AND'd across different keys.

Examples:
Simple metadata filter (exact match):
Expand Down Expand Up @@ -758,8 +758,6 @@ async def delete_documents(tenant_id, dataset_id):
except Exception as e:
logging.exception(e)
return get_error_data_result(message="Internal server error")


def _aggregate_filters(docs):
"""Aggregate filter options from a list of documents.

Expand Down Expand Up @@ -1019,3 +1017,122 @@ async def update_metadata(tenant_id, dataset_id):
target_doc_ids = list(target_doc_ids)
updated = DocMetadataService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes)
return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)})


@manager.route("/datasets/<dataset_id>/documents/batch-update-status", methods=["POST"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def batch_update_document_status(tenant_id, dataset_id):
"""
Batch update status of documents within a dataset.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: path
name: dataset_id
type: string
required: true
description: ID of the dataset.
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
- in: body
name: body
description: Document status update parameters.
required: true
schema:
type: object
required:
- doc_ids
- status
properties:
doc_ids:
type: array
items:
type: string
description: List of document IDs to update.
status:
type: string
enum: ["0", "1"]
description: New status (0 = disabled, 1 = enabled).
responses:
200:
description: Document statuses updated successfully.
"""
from common import settings
from rag.nlp import search

req = await get_request_json()
doc_ids = req.get("doc_ids", [])
status = str(req.get("status", -1))

if status not in ["0", "1"]:
return get_error_argument_result(message=f'"Status" must be either 0 or 1:{status}!')

Comment on lines +1070 to +1076
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Validate doc_ids before iterating.

doc_ids defaults to [] and is not checked for list/non-empty string IDs. A malformed JSON body like "doc_ids": "abc" would iterate characters, while a missing field returns success with no work.

🛡️ Proposed validation
     req = await get_request_json()
-    doc_ids = req.get("doc_ids", [])
+    if not isinstance(req, dict):
+        return get_error_argument_result(message="Request body must be a JSON object.")
+
+    doc_ids = req.get("doc_ids")
     status = str(req.get("status", -1))
 
+    if not isinstance(doc_ids, list) or not doc_ids:
+        return get_error_argument_result(message='"doc_ids" must be a non-empty list.')
+    if any(not isinstance(doc_id, str) or not doc_id for doc_id in doc_ids):
+        return get_error_argument_result(message='"doc_ids" must contain non-empty document IDs.')
+
     if status not in ["0", "1"]:
         return get_error_argument_result(message=f'"Status" must be either 0 or 1:{status}!')
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 1072 - 1078, The handler
reads req = await get_request_json() and sets doc_ids = req.get("doc_ids", [])
but doesn't validate it; update the logic around doc_ids (before iterating) to
ensure it's either a list of non-empty string IDs or a single non-empty string
(accept and normalize by wrapping into a list), reject other types and empty
lists/IDs by returning get_error_argument_result with a clear message; keep the
existing status validation and use the same error helper
(get_error_argument_result) for consistency.

# Verify dataset ownership
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(message="You don't own the dataset.")
Comment on lines +1077 to +1079
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify which access check is used across dataset-scoped routes in the restful_apis module
rg -nP -C2 'KnowledgebaseService\.(accessible|query)\(' api/apps/restful_apis/

Repository: infiniflow/ragflow

Length of output: 6975


🏁 Script executed:

sed -n '1050,1090p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 1614


🏁 Script executed:

sed -n '1030,1055p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 871


🏁 Script executed:

sed -n '1010,1035p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 1235


🏁 Script executed:

sed -n '400,450p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 2564


🏁 Script executed:

sed -n '850,900p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 2175


🏁 Script executed:

sed -n '900,950p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 1838


🏁 Script executed:

sed -n '948,980p' api/apps/restful_apis/document_api.py | cat -n

Repository: infiniflow/ragflow

Length of output: 1649


Align access check with similar batch operations for consistency.

The endpoints update_metadata (line 958) and delete_documents (line 432) use KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id), granting team members access to batch operations. This endpoint uses the stricter KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id), restricting to the owner. If batch status updates should be accessible to team members like delete/update operations, switch to accessible; otherwise, consider adding a comment explaining the intentionally stricter access requirement.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 1077 - 1079, The access
check for batch status updates uses KnowledgebaseService.query(id=dataset_id,
tenant_id=tenant_id) which restricts to the owner, but similar batch endpoints
(update_metadata and delete_documents) use
KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id) to allow
team members; change the check to use
KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id) so team
members can perform batch status updates, or if stricter owner-only behavior is
intended, add a clarifying comment above this block explaining why query is
required instead of accessible.


e, kb = KnowledgebaseService.get_by_id(dataset_id)
if not e:
return get_error_data_result(message="Can't find this dataset!")

result = {}
has_error = False
for doc_id in doc_ids:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
result[doc_id] = {"error": "Document not found"}
has_error = True
continue

if doc.kb_id != dataset_id:
logging.warning(f"Document {doc.kb_id} not in dataset {dataset_id}")
result[doc_id] = {"error": "Document not found in this dataset."}
has_error = True
continue
Comment on lines +1096 to +1099
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fix misleading log message.

The warning logs doc.kb_id labeled as "Document ... not in dataset", which prints the document's kb_id rather than the document id. This makes partial-failure diagnosis harder for the new endpoint.

🪵 Proposed fix
-            if doc.kb_id != dataset_id:
-                logging.warning(f"Document {doc.kb_id} not in dataset {dataset_id}")
+            if doc.kb_id != dataset_id:
+                logging.warning(f"Document {doc_id} (kb_id={doc.kb_id}) not in dataset {dataset_id}")
                 result[doc_id] = {"error": "Document not found in this dataset."}
                 has_error = True
                 continue

As per coding guidelines, **/*.py: Add logging for new flows.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 1096 - 1099, The warning
uses doc.kb_id instead of the actual document identifier, causing misleading
logs; update the log in the error branch (where variables doc_id and dataset_id
are used, alongside result and has_error) to log the actual document id (doc_id)
and dataset id, and optionally include doc.kb_id for extra context so the
message reads something like "Document {doc_id} (kb_id={doc.kb_id}) not in
dataset {dataset_id}" while leaving the rest of the error handling (setting
result[doc_id], has_error=True, continue) unchanged.


current_status = str(doc.status)
if current_status == status:
result[doc_id] = {"status": status}
continue
if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
result[doc_id] = {"error": "Database error (Document update)!"}
has_error = True
continue

status_int = int(status)
if getattr(doc, "chunk_num", 0) > 0:
try:
ok = settings.docStoreConn.update(
{"doc_id": doc_id},
{"available_int": status_int},
search.index_name(kb.tenant_id),
doc.kb_id,
)
except Exception as exc:
msg = str(exc)
if "3022" in msg:
result[doc_id] = {"error": "Document store table missing."}
else:
result[doc_id] = {"error": f"Document store update failed: {msg}"}
has_error = True
continue
if not ok:
result[doc_id] = {"error": "Database error (docStore update)!"}
has_error = True
continue
Comment on lines +1105 to +1130
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid committing DB status before doc store update succeeds.

For chunked docs, Line 1107 updates the DB first. If docStoreConn.update fails afterward, the API returns partial failure but leaves Document.status changed while search availability remains stale.

🔁 Proposed rollback guard
-            if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
+            db_updated = False
+            if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
                 result[doc_id] = {"error": "Database error (Document update)!"}
                 has_error = True
                 continue
+            db_updated = True
 
             status_int = int(status)
             if getattr(doc, "chunk_num", 0) > 0:
                 try:
                     ok = settings.docStoreConn.update(
@@
                 except Exception as exc:
+                    if db_updated:
+                        DocumentService.update_by_id(doc_id, {"status": current_status})
                     msg = str(exc)
                     if "3022" in msg:
                         result[doc_id] = {"error": "Document store table missing."}
                     else:
                         result[doc_id] = {"error": f"Document store update failed: {msg}"}
@@
                 if not ok:
+                    if db_updated:
+                        DocumentService.update_by_id(doc_id, {"status": current_status})
                     result[doc_id] = {"error": "Database error (docStore update)!"}
                     has_error = True
                     continue
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 1107 - 1132, The code
updates Document.status via DocumentService.update_by_id before updating the
external doc store (settings.docStoreConn.update), which can leave DB state
inconsistent if the doc store update fails; modify the flow so that for chunked
documents (getattr(doc, "chunk_num", 0) > 0) you perform the
settings.docStoreConn.update first and only call
DocumentService.update_by_id({"status": str(status)}) after the doc store update
returns truthy, or if you must update DB first then catch failures from
settings.docStoreConn.update and call DocumentService.update_by_id(doc_id,
{"status": previous_status}) to roll back the change (use the current stored
status from `doc`), and ensure error handling sets result[doc_id] and has_error
accordingly; reference functions/objects: DocumentService.update_by_id,
settings.docStoreConn.update, search.index_name(kb.tenant_id), and doc.kb_id.

result[doc_id] = {"status": status}
except Exception as e:
result[doc_id] = {"error": f"Internal server error: {str(e)}"}
has_error = True
Comment on lines +1132 to +1134
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Log unexpected per-document failures.

The broad handler returns an internal-error result but drops the traceback, making this new endpoint hard to diagnose in production.

🪵 Proposed logging
         except Exception as e:
+            logging.exception("Failed to batch update document status for document %s", doc_id)
             result[doc_id] = {"error": f"Internal server error: {str(e)}"}
             has_error = True

As per coding guidelines, **/*.py: Add logging for new flows.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@api/apps/restful_apis/document_api.py` around lines 1134 - 1136, The except
block that sets result[doc_id] and has_error swallows the traceback; add
structured logging of the unexpected per-document failure by calling the
module-level logger (e.g., logger.exception or logger.error with traceback)
inside the except for the function that processes documents (reference variables
doc_id, result, has_error) so the error message includes the doc_id and full
exception/traceback; ensure a logger is defined/imported in this module if
missing.


if has_error:
return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
return get_json_result(data=result)
12 changes: 10 additions & 2 deletions test/testcases/test_web_api/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,8 +424,16 @@ def document_update_metadata_setting(auth, dataset_id, doc_id, payload=None, *,
return res.json()


def document_change_status(auth, payload=None, *, headers=HEADERS, data=None):
res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/change_status", headers=headers, auth=auth, json=payload, data=data)
def document_change_status(auth, dataset_id, payload=None, *, headers=HEADERS, data=None):
    """POST a batch status update for documents in the given dataset.

    Args:
        auth: Authentication credentials.
        dataset_id: ID of the dataset whose documents are updated.
        payload: JSON body containing ``doc_ids`` and ``status``.
        headers: HTTP headers to send (defaults to the module-wide HEADERS).
        data: Optional form body forwarded verbatim to ``requests.post``.

    Returns:
        The decoded JSON response from the server.
    """
    endpoint = f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/documents/batch-update-status"
    response = requests.post(url=endpoint, headers=headers, auth=auth, json=payload, data=data)
    return response.json()


Expand Down
Loading
Loading