diff --git a/admin/client/ragflow_client.py b/admin/client/ragflow_client.py index b35dde2642c..b9f04783ced 100644 --- a/admin/client/ragflow_client.py +++ b/admin/client/ragflow_client.py @@ -1612,7 +1612,7 @@ def set_metadata(self, command_dict): print(f"no document found for {doc_id}") return - dataset_id = docs[0].get("kb_id") + dataset_id = docs[0].get("dataset_id") if not dataset_id: print(f"Dataset ID not found for document: {doc_id}") return @@ -1753,7 +1753,7 @@ def _wait_parse_done(self, dataset_name: str, dataset_id: str): return False all_done = True for doc in docs: - if doc.get("run") != "3": + if doc.get("run") != "DONE": print(f"Document {doc["name"]} is not done, status: {doc.get("run")}") all_done = False break @@ -1764,8 +1764,13 @@ def _wait_parse_done(self, dataset_name: str, dataset_id: str): time.sleep(0.5) def _list_documents(self, dataset_name: str, dataset_id: str): - response = self.http_client.request("POST", f"/document/list?id={dataset_id}", use_api_base=False, - auth_kind="web") + # Use the new RESTful API: GET /api/v1/datasets//documents + response = self.http_client.request( + "GET", + f"/datasets/{dataset_id}/documents", + use_api_base=True, + auth_kind="web" + ) res_json = response.json() if response.status_code != 200: print( diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 5660724bbee..9a9cafb9b1c 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -44,7 +44,6 @@ from common import settings from common.constants import SANDBOX_ARTIFACT_BUCKET, VALID_TASK_STATUS, ParserType, RetCode, TaskStatus from common.file_utils import get_project_base_directory -from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema from common.misc_utils import get_uuid, thread_pool_exec from deepdoc.parser.html_parser import RAGFlowHtmlParser from rag.nlp import search @@ -185,139 +184,6 @@ async def create(): return server_error_response(e) -@manager.route("/list", methods=["POST"]) # 
noqa: F821 -@login_required -async def list_docs(): - kb_id = request.args.get("id") - if not kb_id: - return get_json_result(data=False, message='Dataset ID is required for listing files.', code=RetCode.ARGUMENT_ERROR) - tenants = UserTenantService.query(user_id=current_user.id) - for tenant in tenants: - if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id): - break - else: - return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR) - keywords = request.args.get("keywords", "") - - page_number = int(request.args.get("page", 0)) - items_per_page = int(request.args.get("page_size", 0)) - orderby = request.args.get("orderby", "create_time") - if request.args.get("desc", "true").lower() == "false": - desc = False - else: - desc = True - create_time_from = int(request.args.get("create_time_from", 0)) - create_time_to = int(request.args.get("create_time_to", 0)) - - req = await get_request_json() - - return_empty_metadata = req.get("return_empty_metadata", False) - if isinstance(return_empty_metadata, str): - return_empty_metadata = return_empty_metadata.lower() == "true" - - run_status = req.get("run_status", []) - if run_status: - invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS} - if invalid_status: - return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(invalid_status)}") - - types = req.get("types", []) - if types: - invalid_types = {t for t in types if t not in VALID_FILE_TYPES} - if invalid_types: - return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}") - - suffix = req.get("suffix", []) - metadata_condition = req.get("metadata_condition", {}) or {} - metadata = req.get("metadata", {}) or {} - if isinstance(metadata, dict) and metadata.get("empty_metadata"): - return_empty_metadata = True - metadata = {k: v for k, v in metadata.items() if k 
!= "empty_metadata"} - if return_empty_metadata: - metadata_condition = {} - metadata = {} - else: - if metadata_condition and not isinstance(metadata_condition, dict): - return get_data_error_result(message="metadata_condition must be an object.") - if metadata and not isinstance(metadata, dict): - return get_data_error_result(message="metadata must be an object.") - - doc_ids_filter = None - metas = None - if metadata_condition or metadata: - metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id]) - - if metadata_condition: - doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) - if metadata_condition.get("conditions") and not doc_ids_filter: - return get_json_result(data={"total": 0, "docs": []}) - - if metadata: - metadata_doc_ids = None - for key, values in metadata.items(): - if not values: - continue - if not isinstance(values, list): - values = [values] - values = [str(v) for v in values if v is not None and str(v).strip()] - if not values: - continue - key_doc_ids = set() - for value in values: - key_doc_ids.update(metas.get(key, {}).get(value, [])) - if metadata_doc_ids is None: - metadata_doc_ids = key_doc_ids - else: - metadata_doc_ids &= key_doc_ids - if not metadata_doc_ids: - return get_json_result(data={"total": 0, "docs": []}) - if metadata_doc_ids is not None: - if doc_ids_filter is None: - doc_ids_filter = metadata_doc_ids - else: - doc_ids_filter &= metadata_doc_ids - if not doc_ids_filter: - return get_json_result(data={"total": 0, "docs": []}) - - if doc_ids_filter is not None: - doc_ids_filter = list(doc_ids_filter) - - try: - docs, tol = DocumentService.get_by_kb_id( - kb_id, - page_number, - items_per_page, - orderby, - desc, - keywords, - run_status, - types, - suffix, - doc_ids_filter, - return_empty_metadata=return_empty_metadata, - ) - - if create_time_from or create_time_to: - filtered_docs = [] - for doc in docs: - doc_create_time = doc.get("create_time", 0) - if 
(create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to): - filtered_docs.append(doc) - docs = filtered_docs - - for doc_item in docs: - if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): - doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}" - if doc_item.get("source_type"): - doc_item["source_type"] = doc_item["source_type"].split("/")[0] - if doc_item["parser_config"].get("metadata"): - doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"]) - - return get_json_result(data={"total": tol, "docs": docs}) - except Exception as e: - return server_error_response(e) - - @manager.route("/filter", methods=["POST"]) # noqa: F821 @login_required async def get_filter(): diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 598bf6ffb7a..b2e749f3e51 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -14,23 +14,27 @@ # limitations under the License. 
@manager.route("/datasets/<dataset_id>/documents", methods=["GET"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
def list_docs(dataset_id, tenant_id):
    """
    List documents in a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: query
        name: id
        type: string
        required: false
        description: Filter by document ID.
      - in: query
        name: name
        type: string
        required: false
        description: Filter by exact document name.
      - in: query
        name: keywords
        type: string
        required: false
        description: Keywords to search for.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: query
        name: orderby
        type: string
        required: false
        default: "create_time"
        description: Field to order by.
      - in: query
        name: desc
        type: boolean
        required: false
        default: true
        description: Order in descending.
      - in: query
        name: create_time_from
        type: integer
        required: false
        default: 0
        description: Unix timestamp for filtering documents created after this time. 0 means no filter.
      - in: query
        name: create_time_to
        type: integer
        required: false
        default: 0
        description: Unix timestamp for filtering documents created before this time. 0 means no filter.
      - in: query
        name: suffix
        type: array
        items:
          type: string
        required: false
        description: Filter by file suffix (e.g., ["pdf", "txt", "docx"]).
      - in: query
        name: types
        type: array
        items:
          type: string
        required: false
        description: Filter by document type.
      - in: query
        name: run
        type: array
        items:
          type: string
        required: false
        description: Filter by document run status. Supports both numeric ("0", "1", "2", "3", "4") and text formats ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL").
      - in: query
        name: metadata
        type: string
        required: false
        description: JSON object of metadata key/value pairs for exact matching.
      - in: query
        name: metadata_condition
        type: string
        required: false
        description: JSON object with optional "logic" and a "conditions" list.
      - in: query
        name: return_empty_metadata
        type: boolean
        required: false
        default: false
        description: When true, metadata and metadata_condition are ignored.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of documents.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of documents.
            docs:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        logging.error(f"You don't own the dataset {dataset_id}. ")
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")

    err_code, err_msg, docs, total = _get_docs_with_request(request, dataset_id)
    if err_code != RetCode.SUCCESS:
        return get_data_error_result(code=err_code, message=err_msg)

    renamed_doc_list = [map_doc_keys(doc) for doc in docs]
    for doc_item in renamed_doc_list:
        # Stored thumbnails are referenced by key; inline base64 images are passed through untouched.
        thumbnail = doc_item.get("thumbnail")
        if thumbnail and not thumbnail.startswith(IMG_BASE64_PREFIX):
            doc_item["thumbnail"] = f"/v1/document/image/{dataset_id}-{thumbnail}"
        if doc_item.get("source_type"):
            # Only the part before the first "/" is exposed to the client.
            doc_item["source_type"] = doc_item["source_type"].split("/")[0]
        # parser_config may be missing or None; treat that the same as "no metadata schema".
        parser_config = doc_item.get("parser_config") or {}
        if parser_config.get("metadata"):
            parser_config["metadata"] = turn2jsonschema(parser_config["metadata"])

    return get_json_result(data={"total": total, "docs": renamed_doc_list})


def _get_docs_with_request(req, dataset_id: str):
    """Read list/filter parameters from the request and fetch matching documents.

    Args:
        req: The request object; only ``req.args`` (query string) is read.
            Recognised keys: ``page`` (default 1), ``page_size`` (default 30),
            ``orderby`` (default "create_time"), ``desc`` (anything but
            "false" means descending), ``keywords``, ``suffix`` (repeatable),
            ``types`` (repeatable), ``run`` (repeatable, text or numeric),
            ``id``, ``name``, ``create_time_from``, ``create_time_to``, plus the
            metadata keys handled by ``_parse_doc_id_filter_with_metadata``.
        dataset_id: ID of the dataset whose documents are listed.

    Returns:
        A tuple ``(err_code, err_message, docs, total)``. On success
        ``err_code`` is ``RetCode.SUCCESS`` and ``err_message`` is ``""``;
        on a validation failure ``docs`` is ``[]`` and ``total`` is ``0``.

    Raises:
        ValueError: If ``page``, ``page_size``, ``create_time_from`` or
            ``create_time_to`` is not an integer literal (left to the caller's
            error handling, as before).
    """
    q = req.args

    page = int(q.get("page", 1))
    page_size = int(q.get("page_size", 30))

    orderby = q.get("orderby", "create_time")
    desc = str(q.get("desc", "true")).strip().lower() != "false"
    keywords = q.get("keywords", "")

    # filters - align with OpenAPI parameter names
    suffix = q.getlist("suffix")

    types = q.getlist("types")
    if types:
        invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
        if invalid_types:
            msg = f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}"
            return RetCode.DATA_ERROR, msg, [], 0

    # map run status (text or numeric) - align with API parameter
    run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"}
    valid_numeric_status = set(run_status_text_to_numeric.values())
    run_status_converted = [run_status_text_to_numeric.get(v, v) for v in q.getlist("run")]
    invalid_status = {s for s in run_status_converted if s not in valid_numeric_status}
    if invalid_status:
        # sorted() keeps the message stable when several values are invalid.
        msg = f"Invalid filter run status conditions: {', '.join(sorted(invalid_status))}"
        return RetCode.DATA_ERROR, msg, [], 0

    err_code, err_message, doc_ids_filter, return_empty_metadata = _parse_doc_id_filter_with_metadata(q, dataset_id)
    if err_code != RetCode.SUCCESS:
        return err_code, err_message, [], 0

    doc_name = q.get("name")
    doc_id = q.get("id")
    if doc_id and not DocumentService.query(id=doc_id, kb_id=dataset_id):
        return RetCode.DATA_ERROR, f"You don't own the document {doc_id}.", [], 0
    if doc_name and not DocumentService.query(name=doc_name, kb_id=dataset_id):
        return RetCode.DATA_ERROR, f"You don't own the document {doc_name}.", [], 0

    # A metadata filter that matched nothing must yield an empty page. Passing
    # the empty list on would be indistinguishable from "no filter" for a
    # truthiness check downstream, and every document would come back.
    if doc_ids_filter is not None and not doc_ids_filter:
        return RetCode.SUCCESS, "", [], 0

    docs, total = DocumentService.get_by_kb_id(
        dataset_id,
        page,
        page_size,
        orderby,
        desc,
        keywords,
        run_status_converted,
        types,
        suffix,
        doc_id=doc_id,
        name=doc_name,
        doc_ids_filter=doc_ids_filter,
        return_empty_metadata=return_empty_metadata,
    )

    # time range filter (0 means no bound)
    # NOTE(review): this trims the rows already fetched; `total` is returned
    # unchanged, so it can exceed the number of docs in range - confirm intended.
    create_time_from = int(q.get("create_time_from", 0))
    create_time_to = int(q.get("create_time_to", 0))
    if create_time_from or create_time_to:
        docs = [
            d
            for d in docs
            if (create_time_from == 0 or d.get("create_time", 0) >= create_time_from)
            and (create_time_to == 0 or d.get("create_time", 0) <= create_time_to)
        ]

    return RetCode.SUCCESS, "", docs, total


def _parse_doc_id_filter_with_metadata(req, kb_id):
    """Turn the request's metadata parameters into a document-ID filter.

    Args:
        req: Mapping of request parameters (the query-string args). Keys read:
            - ``return_empty_metadata`` (bool or "true"/"false" string): when
              true, ``metadata`` and ``metadata_condition`` are discarded.
            - ``metadata_condition`` (JSON object string): optional ``logic``
              (default "and") and a ``conditions`` list, evaluated by
              ``meta_filter`` after ``convert_conditions``.
            - ``metadata`` (JSON object string): key -> value or list of
              values. Values are OR'd within a key and AND'd across keys. The
              special key ``empty_metadata`` (truthy) switches
              ``return_empty_metadata`` on.
        kb_id: Dataset ID whose flattened metadata is looked up.

    Returns:
        A tuple ``(err_code, err_message, doc_ids, return_empty_metadata)``:
        - ``err_code``: ``RetCode.SUCCESS`` or ``RetCode.DATA_ERROR``.
        - ``err_message``: ``""`` on success, otherwise the reason.
        - ``doc_ids``: ``None`` when no metadata filter applies; a list of
          matching document IDs otherwise. An EMPTY list means the filter
          matched nothing and the caller must return no documents. On error
          the value is ``[]`` and must be ignored.
        - ``return_empty_metadata``: the normalised boolean flag.

    Note:
        When both ``metadata`` and ``metadata_condition`` are given, their
        results are intersected (AND).
    """
    return_empty_metadata = req.get("return_empty_metadata", False)
    if isinstance(return_empty_metadata, str):
        return_empty_metadata = return_empty_metadata.lower() == "true"

    # `or "{}"` makes an empty query value (?metadata_condition=) behave like an
    # absent one instead of failing JSON parsing.
    try:
        metadata_condition = json.loads(req.get("metadata_condition") or "{}")
    except json.JSONDecodeError:
        msg = f'metadata_condition must be valid JSON: {req.get("metadata_condition")}.'
        return RetCode.DATA_ERROR, msg, [], return_empty_metadata
    try:
        metadata = json.loads(req.get("metadata") or "{}")
    except json.JSONDecodeError:
        logging.error(msg=f'metadata must be valid JSON: {req.get("metadata")}.')
        return RetCode.DATA_ERROR, "metadata must be valid JSON.", [], return_empty_metadata

    if isinstance(metadata, dict) and metadata.get("empty_metadata"):
        return_empty_metadata = True
        metadata = {k: v for k, v in metadata.items() if k != "empty_metadata"}
    if return_empty_metadata:
        metadata_condition = {}
        metadata = {}
    else:
        if metadata_condition and not isinstance(metadata_condition, dict):
            return RetCode.DATA_ERROR, "metadata_condition must be an object.", [], return_empty_metadata
        if metadata and not isinstance(metadata, dict):
            return RetCode.DATA_ERROR, "metadata must be an object.", [], return_empty_metadata

    doc_ids_filter = None  # None = unconstrained; a set = IDs allowed so far
    metas = None
    if metadata_condition or metadata:
        metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])

    if metadata_condition:
        matched = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
        if matched:
            doc_ids_filter = matched
        elif metadata_condition.get("conditions"):
            # Real conditions were given and nothing satisfied them.
            return RetCode.SUCCESS, "", [], return_empty_metadata
        # Otherwise the object carried no conditions: leave the filter unconstrained.

    if metadata:
        metadata_doc_ids = None
        for key, values in metadata.items():
            if not values:
                continue
            if not isinstance(values, list):
                values = [values]
            values = [str(v) for v in values if v is not None and str(v).strip()]
            if not values:
                continue
            # OR across the values of one key ...
            key_doc_ids = set()
            for value in values:
                key_doc_ids.update(metas.get(key, {}).get(value, []))
            # ... AND across keys.
            if metadata_doc_ids is None:
                metadata_doc_ids = key_doc_ids
            else:
                metadata_doc_ids &= key_doc_ids
            if not metadata_doc_ids:
                return RetCode.SUCCESS, "", [], return_empty_metadata
        if metadata_doc_ids is not None:
            if doc_ids_filter is None:
                doc_ids_filter = metadata_doc_ids
            else:
                doc_ids_filter &= metadata_doc_ids
            if not doc_ids_filter:
                return RetCode.SUCCESS, "", [], return_empty_metadata

    if doc_ids_filter is None:
        return RetCode.SUCCESS, "", None, return_empty_metadata
    return RetCode.SUCCESS, "", list(doc_ids_filter), return_empty_metadata
- - in: query - name: run - type: array - items: - type: string - required: false - description: Filter by document run status. Supports both numeric ("0", "1", "2", "3", "4") and text formats ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"). - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - responses: - 200: - description: List of documents. - schema: - type: object - properties: - total: - type: integer - description: Total number of documents. - docs: - type: array - items: - type: object - properties: - id: - type: string - description: Document ID. - name: - type: string - description: Document name. - chunk_count: - type: integer - description: Number of chunks. - token_count: - type: integer - description: Number of tokens. - dataset_id: - type: string - description: ID of the dataset. - chunk_method: - type: string - description: Chunking method used. - run: - type: string - description: Processing status. - """ - if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): - return get_error_data_result(message=f"You don't own the dataset {dataset_id}. 
") - - q = request.args - document_id = q.get("id") - name = q.get("name") - - if document_id and not DocumentService.query(id=document_id, kb_id=dataset_id): - return get_error_data_result(message=f"You don't own the document {document_id}.") - if name and not DocumentService.query(name=name, kb_id=dataset_id): - return get_error_data_result(message=f"You don't own the document {name}.") - - page = int(q.get("page", 1)) - page_size = int(q.get("page_size", 30)) - orderby = q.get("orderby", "create_time") - desc = str(q.get("desc", "true")).strip().lower() != "false" - keywords = q.get("keywords", "") - - # filters - align with OpenAPI parameter names - suffix = q.getlist("suffix") - run_status = q.getlist("run") - create_time_from = int(q.get("create_time_from", 0)) - create_time_to = int(q.get("create_time_to", 0)) - metadata_condition_raw = q.get("metadata_condition") - metadata_condition = {} - if metadata_condition_raw: - try: - metadata_condition = json.loads(metadata_condition_raw) - except Exception: - return get_error_data_result(message="metadata_condition must be valid JSON.") - if metadata_condition and not isinstance(metadata_condition, dict): - return get_error_data_result(message="metadata_condition must be an object.") - - # map run status (text or numeric) - align with API parameter - run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"} - run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status] - - doc_ids_filter = None - if metadata_condition: - metas = DocMetadataService.get_flatted_meta_by_kbs([dataset_id]) - doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) - if metadata_condition.get("conditions") and not doc_ids_filter: - return get_result(data={"total": 0, "docs": []}) - - docs, total = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, document_id, name, suffix, 
run_status_converted, doc_ids_filter) - - # time range filter (0 means no bound) - if create_time_from or create_time_to: - docs = [d for d in docs if (create_time_from == 0 or d.get("create_time", 0) >= create_time_from) and (create_time_to == 0 or d.get("create_time", 0) <= create_time_to)] - - # rename keys + map run status back to text for output - key_mapping = { - "chunk_num": "chunk_count", - "kb_id": "dataset_id", - "token_num": "token_count", - "parser_id": "chunk_method", - } - run_status_numeric_to_text = {"0": "UNSTART", "1": "RUNNING", "2": "CANCEL", "3": "DONE", "4": "FAIL"} - - output_docs = [] - for d in docs: - renamed_doc = {key_mapping.get(k, k): v for k, v in d.items()} - if "run" in d: - renamed_doc["run"] = run_status_numeric_to_text.get(str(d["run"]), d["run"]) - output_docs.append(renamed_doc) - - return get_result(data={"total": total, "docs": output_docs}) - - @manager.route("/datasets//metadata/update", methods=["POST"]) # noqa: F821 @token_required async def metadata_batch_update(dataset_id, tenant_id): diff --git a/api/apps/services/document_api_service.py b/api/apps/services/document_api_service.py index 5a23f403a10..82dfa37e353 100644 --- a/api/apps/services/document_api_service.py +++ b/api/apps/services/document_api_service.py @@ -165,6 +165,7 @@ def validate_document_update_fields(update_doc_req:UpdateDocumentReq, doc, req): return None, None + def map_doc_keys(doc): """ Rename document keys to match API response format. @@ -241,9 +242,9 @@ def _process_run_mapping(doc, run_status): Args: doc: The document model from the database OR a dictionary. - run_status: Optional explicit run status value. If not provided: - - If doc has 'run' field, it will be mapped using run_mapping - - Otherwise, 'run' will be set to 'UNSTART' (for new uploads) + run_status: Optional explicit run status value. + If provided, 'run' field of doc will be set to run_status. 
+ If not provided, 'run' will be set to 'UNSTART' (for new uploads) Returns: A dictionary with renamed keys for API response. @@ -262,5 +263,3 @@ def _process_run_mapping(doc, run_status): doc["run"] = run_mapping[run_status] return doc - - diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 2a4f21baf18..0c6e8b89195 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -127,7 +127,7 @@ def check_doc_health(cls, tenant_id: str, filename): @classmethod @DB.connection_context() - def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids=None, return_empty_metadata=False): + def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_id=None, name=None, doc_ids_filter=None, return_empty_metadata=False): fields = cls.get_cls_model_fields() if keywords: docs = ( @@ -147,17 +147,19 @@ def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keyword .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER) .where(cls.model.kb_id == kb_id) ) - - if doc_ids: - docs = docs.where(cls.model.id.in_(doc_ids)) + if doc_id: + docs = docs.where(cls.model.id == doc_id) + if doc_ids_filter: + docs = docs.where(cls.model.id.in_(doc_ids_filter)) if run_status: docs = docs.where(cls.model.run.in_(run_status)) if types: docs = docs.where(cls.model.type.in_(types)) if suffix: docs = docs.where(cls.model.suffix.in_(suffix)) + if name: + docs = docs.where(cls.model.name == name) - metadata_map = {} if return_empty_metadata: metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id) doc_ids_with_metadata = set(metadata_map.keys()) diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py index 
0e94b5c0748..9440c26b5c6 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py @@ -388,65 +388,6 @@ def test_download_and_download_doc_errors(self, monkeypatch): res = _run(module.download_doc("doc-1")) assert res["filename"] == "doc.txt" - def test_list_docs_metadata_filters(self, monkeypatch): - module = _load_doc_module(monkeypatch) - monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False) - monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs())) - res = module.list_docs.__wrapped__("ds-1", "tenant-1") - assert "don't own the dataset" in res["message"] - - monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True) - monkeypatch.setattr( - module, - "request", - SimpleNamespace( - args=_DummyArgs( - { - "metadata_condition": "{bad json", - } - ) - ), - ) - res = module.list_docs.__wrapped__("ds-1", "tenant-1") - assert res["message"] == "metadata_condition must be valid JSON." - - monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs({"metadata_condition": "[1]"}))) - res = module.list_docs.__wrapped__("ds-1", "tenant-1") - assert res["message"] == "metadata_condition must be an object." 
- - monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda _kbs: [{"doc_id": "x"}]) - monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: []) - monkeypatch.setattr(module, "convert_conditions", lambda cond: cond) - monkeypatch.setattr( - module, - "request", - SimpleNamespace(args=_DummyArgs({"metadata_condition": '{"conditions":[{"field":"x","op":"eq","value":"y"}]}'})), - ) - res = module.list_docs.__wrapped__("ds-1", "tenant-1") - assert res["code"] == module.RetCode.SUCCESS - assert res["data"]["total"] == 0 - - monkeypatch.setattr( - module.DocumentService, - "get_list", - lambda *_args, **_kwargs: ([{"id": "doc-1", "create_time": 100, "run": "0"}], 1), - ) - monkeypatch.setattr( - module, - "request", - SimpleNamespace( - args=_DummyArgs( - { - "create_time_from": "101", - "create_time_to": "200", - } - ) - ), - ) - res = module.list_docs.__wrapped__("ds-1", "tenant-1") - assert res["code"] == 0 - assert res["data"]["docs"] == [] - def test_metadata_batch_update(self, monkeypatch): module = _load_doc_module(monkeypatch) monkeypatch.setattr(module, "convert_conditions", lambda cond: cond) diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py index fb4c26711f0..f2a2f5c905e 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py @@ -27,11 +27,11 @@ class TestAuthorization: @pytest.mark.parametrize( "invalid_auth, expected_code, expected_message", [ - (None, 0, "`Authorization` can't be empty"), + (None, 401, ""), ( RAGFlowHttpApiAuth(INVALID_API_TOKEN), - 109, - "Authentication error: API key is invalid!", + 401, + "", ), ], ) @@ -72,7 +72,7 @@ def test_invalid_dataset_id(self, HttpApiAuth, dataset_id, expected_code, expect "params, expected_code, 
expected_page_size, expected_message", [ ({"page": None, "page_size": 2}, 0, 2, ""), - ({"page": 0, "page_size": 2}, 0, 2, ""), + ({"page": 1, "page_size": 2}, 0, 2, ""), ({"page": 2, "page_size": 2}, 0, 2, ""), ({"page": 3, "page_size": 2}, 0, 1, ""), ({"page": "3", "page_size": 2}, 0, 1, ""), @@ -115,7 +115,6 @@ def test_page( "params, expected_code, expected_page_size, expected_message", [ ({"page_size": None}, 0, 5, ""), - ({"page_size": 0}, 0, 0, ""), ({"page_size": 1}, 0, 1, ""), ({"page_size": 6}, 0, 5, ""), ({"page_size": "1"}, 0, 1, ""), @@ -232,6 +231,7 @@ def test_keywords(self, HttpApiAuth, add_documents, params, expected_num): assert len(res["data"]["docs"]) == expected_num assert res["data"]["total"] == expected_num + @pytest.mark.p1 @pytest.mark.parametrize( "params, expected_code, expected_num, expected_message", @@ -240,21 +240,21 @@ def test_keywords(self, HttpApiAuth, add_documents, params, expected_num): ({"name": ""}, 0, 5, ""), ({"name": "ragflow_test_upload_0.txt"}, 0, 1, ""), ( - {"name": "unknown.txt"}, - 102, - 0, - "You don't own the document unknown.txt.", + {"name": "unknown.txt"}, + 102, + 0, + "You don't own the document unknown.txt.", ), ], ) def test_name( - self, - HttpApiAuth, - add_documents, - params, - expected_code, - expected_num, - expected_message, + self, + HttpApiAuth, + add_documents, + params, + expected_code, + expected_num, + expected_message, ): dataset_id, _ = add_documents res = list_documents(HttpApiAuth, dataset_id, params=params) @@ -267,6 +267,7 @@ def test_name( else: assert res["message"] == expected_message + @pytest.mark.p1 @pytest.mark.parametrize( "document_id, expected_code, expected_num, expected_message", @@ -278,13 +279,13 @@ def test_name( ], ) def test_id( - self, - HttpApiAuth, - add_documents, - document_id, - expected_code, - expected_num, - expected_message, + self, + HttpApiAuth, + add_documents, + document_id, + expected_code, + expected_num, + expected_message, ): dataset_id, document_ids = 
add_documents if callable(document_id): @@ -298,11 +299,13 @@ def test_id( if params["id"] in [None, ""]: assert len(res["data"]["docs"]) == expected_num else: - assert res["data"]["docs"][0]["id"] == params["id"] + doc = res["data"]["docs"][0] + assert doc["id"] == params["id"] else: assert res["message"] == expected_message - @pytest.mark.p3 + + @pytest.mark.p2 @pytest.mark.parametrize( "document_id, name, expected_code, expected_num, expected_message", [ @@ -310,23 +313,23 @@ def test_id( (lambda r: r[0], "ragflow_test_upload_1.txt", 0, 0, ""), (lambda r: r[0], "unknown", 102, 0, "You don't own the document unknown."), ( - "id", - "ragflow_test_upload_0.txt", - 102, - 0, - "You don't own the document id.", + "id", + "ragflow_test_upload_0.txt", + 102, + 0, + "You don't own the document id.", ), ], ) def test_name_and_id( - self, - HttpApiAuth, - add_documents, - document_id, - name, - expected_code, - expected_num, - expected_message, + self, + HttpApiAuth, + add_documents, + document_id, + name, + expected_code, + expected_num, + expected_message, ): dataset_id, document_ids = add_documents if callable(document_id): @@ -340,6 +343,7 @@ def test_name_and_id( else: assert res["message"] == expected_message + @pytest.mark.p3 def test_concurrent_list(self, HttpApiAuth, add_documents): dataset_id, _ = add_documents @@ -358,3 +362,83 @@ def test_invalid_params(self, HttpApiAuth, add_documents): res = list_documents(HttpApiAuth, dataset_id, params=params) assert res["code"] == 0 assert len(res["data"]["docs"]) == 5 + + @pytest.mark.p2 + @pytest.mark.parametrize( + "params, expected_code, expected_message", + [ + ( + {"metadata_condition": "{bad json"}, + 102, + "metadata_condition must be valid JSON", + ), + ( + {"metadata_condition": "[1]"}, + 102, + "metadata_condition must be an object", + ), + ], + ) + def test_metadata_condition_validation( + self, HttpApiAuth, add_documents, params, expected_code, expected_message + ): + dataset_id, _ = add_documents + res = 
list_documents(HttpApiAuth, dataset_id, params=params) + assert res["code"] == expected_code + assert expected_message in res["message"] + + @pytest.mark.p2 + @pytest.mark.parametrize( + "params, expected_code, expected_total", + [ + # Filter with create_time_from in the future - should return 0 results + ({"create_time_from": "9999999999000"}, 0, 0), + # Filter with create_time_to in the past - should return 0 results + ({"create_time_to": "1"}, 0, 0), + # Filter with create_time_from and create_time_to covering all time + ({"create_time_from": "0", "create_time_to": "9999999999000"}, 0, 5), + ], + ) + def test_create_time_filter( + self, HttpApiAuth, add_documents, params, expected_code, expected_total + ): + dataset_id, _ = add_documents + res = list_documents(HttpApiAuth, dataset_id, params=params) + + assert res["code"] == expected_code + assert len(res["data"]["docs"]) == expected_total + assert res["data"]["total"] == 5 + + @pytest.mark.p2 + @pytest.mark.parametrize( + "params, expected_code, expected_message", + [ + # Invalid run status - should return error + ({"run": ["INVALID_STATUS"]}, 102, "Invalid filter run status conditions: INVALID_STATUS"), + ], + ) + def test_run_status_filter_invalid( + self, HttpApiAuth, add_documents, params, expected_code, expected_message + ): + dataset_id, _ = add_documents + res = list_documents(HttpApiAuth, dataset_id, params=params) + + assert res["code"] == expected_code + assert expected_message in res["message"] + + @pytest.mark.p2 + @pytest.mark.parametrize( + "params, expected_size", + [ + # Invalid run status - should return error + ({"run": ["UNSTART"]}, 5), + ], + ) + def test_run_status_filter_unstart( + self, HttpApiAuth, add_documents, params, expected_size + ): + dataset_id, _ = add_documents + res = list_documents(HttpApiAuth, dataset_id, params=params) + + assert res["code"] == 0 + assert res["data"]["total"] == expected_size diff --git 
a/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py index fd31e5ceeed..755d87cce77 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py @@ -42,6 +42,7 @@ def condition(_auth, _dataset_id, _document_ids=None): def validate_document_details(auth, dataset_id, document_ids): + # currently list_documents not support search by document id for document_id in document_ids: res = list_documents(auth, dataset_id, params={"id": document_id}) doc = res["data"]["docs"][0] diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py index c022a0e97bc..b24d9deeacf 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py @@ -42,7 +42,8 @@ def test_invalid_auth(self, invalid_auth, expected_code, expected_message): class TestDocumentsUpdated: - @pytest.mark.p1 + # GET /api/v1/datasets//documents no longer support find by id/name + @pytest.mark.p3 @pytest.mark.parametrize( "name, expected_code, expected_message", [ @@ -94,7 +95,8 @@ def test_name(self, HttpApiAuth, add_documents, name, expected_code, expected_me else: assert res["message"] == expected_message - @pytest.mark.p2 + # GET /api/v1/datasets//documents no longer support find by id/name + @pytest.mark.p3 @pytest.mark.parametrize( "document_id, expected_code, expected_message", [ @@ -206,10 +208,12 @@ def test_chunk_method(self, HttpApiAuth, add_documents, chunk_method, expected_c assert res["code"] == expected_code if expected_code == 0: res = list_documents(HttpApiAuth, dataset_id, {"id": 
document_ids[0]}) + doc_of_id = res["data"]["docs"][0] if chunk_method == "": - assert res["data"]["docs"][0]["chunk_method"] == "naive" + assert doc_of_id["chunk_method"] == "naive" else: - assert res["data"]["docs"][0]["chunk_method"] == chunk_method + print(f"doc:{doc_of_id}") + assert doc_of_id["chunk_method"] == chunk_method else: assert res["message"] == expected_message @@ -597,10 +601,12 @@ def test_parser_config( assert res["code"] == expected_code if expected_code == 0: res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]}) + + doc_of_id = res["data"]["docs"][0] if parser_config == {}: - assert res["data"]["docs"][0]["parser_config"] == DEFAULT_PARSER_CONFIG + assert doc_of_id["parser_config"] == DEFAULT_PARSER_CONFIG else: for k, v in parser_config.items(): - assert res["data"]["docs"][0]["parser_config"][k] == v + assert doc_of_id["parser_config"][k] == v if expected_code != 0 or expected_message: assert res["message"] == expected_message diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py index b60f5f2886c..58d8a7c6253 100644 --- a/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py +++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py @@ -37,7 +37,7 @@ def cleanup(): def add_documents(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]: dataset = add_dataset documents = bulk_upload_documents(dataset, 5, ragflow_tmp_dir) - + def cleanup(): delete_all_documents(dataset) diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py index 9e8cea30d61..a438512dc09 100644 --- a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py +++ 
b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py @@ -30,7 +30,7 @@ def test_default(self, add_documents): "params, expected_page_size, expected_message", [ ({"page": None, "page_size": 2}, 2, "not instance of"), - ({"page": 0, "page_size": 2}, 2, ""), + ({"page": 1, "page_size": 2}, 2, ""), ({"page": 2, "page_size": 2}, 2, ""), ({"page": 3, "page_size": 2}, 1, ""), ({"page": "3", "page_size": 2}, 1, "not instance of"), @@ -63,7 +63,7 @@ def test_page(self, add_documents, params, expected_page_size, expected_message) "params, expected_page_size, expected_message", [ ({"page_size": None}, 5, "not instance of"), - ({"page_size": 0}, 0, ""), + ({"page_size": 2}, 2, ""), ({"page_size": 1}, 1, ""), ({"page_size": 6}, 5, ""), ({"page_size": "1"}, 1, "not instance of"), @@ -151,6 +151,7 @@ def test_keywords(self, add_documents, params, expected_num): documents = dataset.list_documents(**params) assert len(documents) == expected_num, str(documents) + @pytest.mark.p1 @pytest.mark.parametrize( "params, expected_num, expected_message", @@ -222,6 +223,7 @@ def test_name_and_id(self, add_documents, document_id, name, expected_num, expec documents = dataset.list_documents(**params) assert len(documents) == expected_num, str(documents) + @pytest.mark.p3 def test_concurrent_list(self, add_documents): dataset, _ = add_documents diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py index dc9ce49b75a..f174f0e5462 100644 --- a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py +++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py @@ -64,7 +64,8 @@ def test_name(self, add_documents, name, expected_message): assert expected_message in str(exception_info.value), str(exception_info.value) else: document.update({"name": name}) - updated_doc 
= dataset.list_documents(id=document.id)[0] + docs = dataset.list_documents(id=document.id) + updated_doc = [doc for doc in docs if doc.id == document.id][0] assert updated_doc.name == name, str(updated_doc) @pytest.mark.p2 @@ -138,7 +139,8 @@ def test_chunk_method(self, add_documents, chunk_method, expected_message): assert expected_message in str(exception_info.value), str(exception_info.value) else: document.update({"chunk_method": chunk_method}) - updated_doc = dataset.list_documents(id=document.id)[0] + docs = dataset.list_documents() + updated_doc = [doc for doc in docs if doc.id == document.id][0] assert updated_doc.chunk_method == chunk_method, str(updated_doc) @pytest.mark.p3 @@ -479,7 +481,8 @@ def test_parser_config(self, client, add_documents, chunk_method, parser_config, assert expected_message in str(exception_info.value), str(exception_info.value) else: document.update(update_data) - updated_doc = dataset.list_documents(id=document.id)[0] + docs = dataset.list_documents(id=document.id) + updated_doc = [doc for doc in docs if doc.id == document.id][0] if parser_config: for k, v in parser_config.items(): if isinstance(v, dict): diff --git a/test/testcases/test_web_api/conftest.py b/test/testcases/test_web_api/conftest.py index 4d4962e1c41..df57be3aa15 100644 --- a/test/testcases/test_web_api/conftest.py +++ b/test/testcases/test_web_api/conftest.py @@ -49,9 +49,9 @@ @wait_for(30, 1, "Document parsing timeout") def condition(_auth, _kb_id): - res = list_documents(_auth, {"id": _kb_id}) + res = list_documents(_auth, {"kb_id": _kb_id}) for doc in res["data"]["docs"]: - if doc["run"] != "3": + if doc["run"] != "DONE": return False return True diff --git a/test/testcases/test_web_api/test_chunk_app/conftest.py b/test/testcases/test_web_api/test_chunk_app/conftest.py index e4610178cef..0b413c75ff3 100644 --- a/test/testcases/test_web_api/test_chunk_app/conftest.py +++ b/test/testcases/test_web_api/test_chunk_app/conftest.py @@ -24,9 +24,9 @@ @wait_for(30, 1, 
"Document parsing timeout") def condition(_auth, _kb_id): - res = list_documents(_auth, {"id": _kb_id}) + res = list_documents(_auth, {"kb_id": _kb_id}) for doc in res["data"]["docs"]: - if doc["run"] != "3": + if doc["run"] != "DONE": return False return True diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index 80e83afc42d..5d2b739a995 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -374,9 +374,11 @@ def create_document(auth, payload=None, *, headers=HEADERS, data=None): def list_documents(auth, params=None, payload=None, *, headers=HEADERS, data=None): + kb_id = params.get("kb_id") if params else None + url = f"{HOST_ADDRESS}/api/{VERSION}/datasets/{kb_id}/documents" if payload is None: payload = {} - res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/list", headers=headers, auth=auth, params=params, json=payload, data=data) + res = requests.get(url=url, headers=headers, auth=auth, params=params, json=payload, data=data) return res.json() diff --git a/test/testcases/test_web_api/test_document_app/conftest.py b/test/testcases/test_web_api/test_document_app/conftest.py index 3322cdf0119..ece9d25375d 100644 --- a/test/testcases/test_web_api/test_document_app/conftest.py +++ b/test/testcases/test_web_api/test_document_app/conftest.py @@ -34,7 +34,7 @@ def decorator(func): @pytest.fixture(scope="function") def add_document_func(request, WebApiAuth, add_dataset, ragflow_tmp_dir): def cleanup(): - res = list_documents(WebApiAuth, {"id": dataset_id}) + res = list_documents(WebApiAuth, {"kb_id": dataset_id}) for doc in res["data"]["docs"]: delete_document(WebApiAuth, {"doc_id": doc["id"]}) @@ -47,7 +47,7 @@ def cleanup(): @pytest.fixture(scope="class") def add_documents(request, WebApiAuth, add_dataset, ragflow_tmp_dir): def cleanup(): - res = list_documents(WebApiAuth, {"id": dataset_id}) + res = list_documents(WebApiAuth, {"kb_id": dataset_id}) for 
doc in res["data"]["docs"]: delete_document(WebApiAuth, {"doc_id": doc["id"]}) @@ -60,7 +60,7 @@ def cleanup(): @pytest.fixture(scope="function") def add_documents_func(request, WebApiAuth, add_dataset_func, ragflow_tmp_dir): def cleanup(): - res = list_documents(WebApiAuth, {"id": dataset_id}) + res = list_documents(WebApiAuth, {"kb_id": dataset_id}) for doc in res["data"]["docs"]: delete_document(WebApiAuth, {"doc_id": doc["id"]}) diff --git a/test/testcases/test_web_api/test_document_app/test_create_document.py b/test/testcases/test_web_api/test_document_app/test_create_document.py index 1b0b730787a..092c5e292f8 100644 --- a/test/testcases/test_web_api/test_document_app/test_create_document.py +++ b/test/testcases/test_web_api/test_document_app/test_create_document.py @@ -75,7 +75,7 @@ def test_filename_special_characters(self, WebApiAuth, add_dataset_func): res = create_document(WebApiAuth, {"name": filename, "kb_id": kb_id}) assert res["code"] == 0, res - assert res["data"]["kb_id"] == kb_id, res + assert res["data"]["dataset_id"] == kb_id, res assert res["data"]["name"] == filename, f"Expected: {filename}, Got: {res['data']['name']}" @pytest.mark.p3 diff --git a/test/testcases/test_web_api/test_document_app/test_list_documents.py b/test/testcases/test_web_api/test_document_app/test_list_documents.py index 91744ba232a..4005c077356 100644 --- a/test/testcases/test_web_api/test_document_app/test_list_documents.py +++ b/test/testcases/test_web_api/test_document_app/test_list_documents.py @@ -13,9 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import asyncio from concurrent.futures import ThreadPoolExecutor, as_completed -from types import SimpleNamespace import pytest from test_common import list_documents @@ -43,30 +41,18 @@ class TestDocumentsList: @pytest.mark.p1 def test_default(self, WebApiAuth, add_documents): kb_id, _ = add_documents - res = list_documents(WebApiAuth, {"id": kb_id}) - assert res["code"] == 0 + res = list_documents(WebApiAuth, {"kb_id": kb_id}) + assert res["code"] == 0, f", kb_id:{kb_id} +, res:{str(res)}" assert len(res["data"]["docs"]) == 5 assert res["data"]["total"] == 5 - @pytest.mark.p3 - @pytest.mark.parametrize( - "kb_id, expected_code, expected_message", - [ - ("", 101, 'Lack of "KB ID"'), - ("invalid_dataset_id", 103, "Only owner of dataset authorized for this operation."), - ], - ) - def test_invalid_dataset_id(self, WebApiAuth, kb_id, expected_code, expected_message): - res = list_documents(WebApiAuth, {"id": kb_id}) - assert res["code"] == expected_code - assert res["message"] == expected_message @pytest.mark.p1 @pytest.mark.parametrize( "params, expected_code, expected_page_size, expected_message", [ - ({"page": None, "page_size": 2}, 0, 5, ""), - ({"page": 0, "page_size": 2}, 0, 5, ""), + ({"page": None, "page_size": 5}, 0, 5, ""), + ({"page": 0, "page_size": 5}, 0, 5, ""), ({"page": 2, "page_size": 2}, 0, 2, ""), ({"page": 3, "page_size": 2}, 0, 1, ""), ({"page": "3", "page_size": 2}, 0, 1, ""), @@ -76,7 +62,7 @@ def test_invalid_dataset_id(self, WebApiAuth, kb_id, expected_code, expected_mes ) def test_page(self, WebApiAuth, add_documents, params, expected_code, expected_page_size, expected_message): kb_id, _ = add_documents - res = list_documents(WebApiAuth, {"id": kb_id, **params}) + res = list_documents(WebApiAuth, {"kb_id": kb_id, **params}) assert res["code"] == expected_code, res if expected_code == 0: assert len(res["data"]["docs"]) == expected_page_size, res @@ -89,17 +75,17 @@ def test_page(self, WebApiAuth, add_documents, params, expected_code, 
expected_p "params, expected_code, expected_page_size, expected_message", [ ({"page_size": None}, 0, 5, ""), - ({"page_size": 0}, 0, 5, ""), - ({"page_size": 1}, 0, 5, ""), + ({"page_size": 5}, 0, 5, ""), + ({"page_size": 1}, 0, 1, ""), ({"page_size": 6}, 0, 5, ""), - ({"page_size": "1"}, 0, 5, ""), + ({"page_size": "1"}, 0, 1, ""), pytest.param({"page_size": -1}, 100, 0, "1064", marks=pytest.mark.skip(reason="issues/5851")), pytest.param({"page_size": "a"}, 100, 0, """ValueError("invalid literal for int() with base 10: 'a'")""", marks=pytest.mark.skip(reason="issues/5851")), ], ) def test_page_size(self, WebApiAuth, add_documents, params, expected_code, expected_page_size, expected_message): kb_id, _ = add_documents - res = list_documents(WebApiAuth, {"id": kb_id, **params}) + res = list_documents(WebApiAuth, {"kb_id": kb_id, **params}) assert res["code"] == expected_code, res if expected_code == 0: assert len(res["data"]["docs"]) == expected_page_size, res @@ -119,7 +105,7 @@ def test_page_size(self, WebApiAuth, add_documents, params, expected_code, expec ) def test_orderby(self, WebApiAuth, add_documents, params, expected_code, assertions, expected_message): kb_id, _ = add_documents - res = list_documents(WebApiAuth, {"id": kb_id, **params}) + res = list_documents(WebApiAuth, {"kb_id": kb_id, **params}) assert res["code"] == expected_code, res if expected_code == 0: if callable(assertions): @@ -144,7 +130,7 @@ def test_orderby(self, WebApiAuth, add_documents, params, expected_code, asserti ) def test_desc(self, WebApiAuth, add_documents, params, expected_code, assertions, expected_message): kb_id, _ = add_documents - res = list_documents(WebApiAuth, {"id": kb_id, **params}) + res = list_documents(WebApiAuth, {"kb_id": kb_id, **params}) assert res["code"] == expected_code, res if expected_code == 0: if callable(assertions): @@ -165,7 +151,7 @@ def test_desc(self, WebApiAuth, add_documents, params, expected_code, assertions ) def test_keywords(self, WebApiAuth, 
add_documents, params, expected_num): kb_id, _ = add_documents - res = list_documents(WebApiAuth, {"id": kb_id, **params}) + res = list_documents(WebApiAuth, {"kb_id": kb_id, **params}) assert res["code"] == 0, res assert len(res["data"]["docs"]) == expected_num, res assert res["data"]["total"] == expected_num, res @@ -181,213 +167,53 @@ def test_concurrent_list(self, WebApiAuth, add_documents): assert len(responses) == count, responses assert all(future.result()["code"] == 0 for future in futures), responses + # Tests moved from TestDocumentsListUnit + @pytest.mark.p2 + def test_missing_kb_id(self, WebApiAuth): + """Test missing KB ID returns error.""" + res = list_documents(WebApiAuth, {"kb_id": ""}) + assert res["code"] == 100 + assert res["message"] == "" -def _run(coro): - return asyncio.run(coro) - - -class _DummyArgs(dict): - def get(self, key, default=None): - return super().get(key, default) - - -@pytest.mark.p2 -class TestDocumentsListUnit: - def _set_args(self, module, monkeypatch, **kwargs): - monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs(kwargs))) - - def _allow_kb(self, module, monkeypatch, kb_id="kb1", tenant_id="tenant1"): - monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: [SimpleNamespace(tenant_id=tenant_id)]) - monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: True if _kwargs.get("id") == kb_id else False) - - def test_missing_kb_id(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch) - - async def fake_request_json(): - return {} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 101 - assert res["message"] == 'Dataset ID is required for listing files.' 
- - def test_unauthorized_dataset(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: [SimpleNamespace(tenant_id="tenant1")]) - monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: False) - - async def fake_request_json(): - return {} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 103 - assert "Only owner of dataset" in res["message"] - - def test_return_empty_metadata_flags(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) - monkeypatch.setattr(module.DocumentService, "get_by_kb_id", lambda *_args, **_kwargs: ([], 0)) - - async def fake_request_json(): - return {"return_empty_metadata": "true", "metadata": {"author": "alice"}} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 0 - - async def fake_request_json_empty(): - return {"metadata": {"empty_metadata": True, "author": "alice"}} - - monkeypatch.setattr(module, "get_request_json", fake_request_json_empty) - res = _run(module.list_docs()) - assert res["code"] == 0 - - def test_invalid_filters(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) - - async def fake_request_json(): - return {"run_status": ["INVALID"]} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 102 - assert "Invalid filter run status" in res["message"] - - async def fake_request_json_types(): - return {"types": ["INVALID"]} - - monkeypatch.setattr(module, "get_request_json", fake_request_json_types) - res = _run(module.list_docs()) + 
@pytest.mark.p2 + def test_unauthorized_dataset(self, WebApiAuth): + """Test unauthorized dataset returns error.""" + res = list_documents(WebApiAuth, {"kb_id": "non_existent_kb_id"}) assert res["code"] == 102 - assert "Invalid filter conditions" in res["message"] - - def test_invalid_metadata_types(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) - - async def fake_request_json(): - return {"metadata_condition": "bad"} + assert "You don't own the dataset" in res["message"] - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) + @pytest.mark.p3 + def test_invalid_run_status_filter(self, WebApiAuth, add_documents): + """Test invalid run status filter returns error.""" + kb_id, _ = add_documents + res = list_documents(WebApiAuth, {"kb_id": kb_id, "run": "INVALID"}) assert res["code"] == 102 - assert "metadata_condition" in res["message"] - - async def fake_request_json_meta(): - return {"metadata": ["not", "object"]} + assert "Invalid filter run status" in res["message"] - monkeypatch.setattr(module, "get_request_json", fake_request_json_meta) - res = _run(module.list_docs()) + @pytest.mark.p3 + def test_invalid_document_id_filter(self, WebApiAuth, add_documents): + """Test invalid document ID filter returns error.""" + kb_id, _ = add_documents + # Use a non-existent document ID + res = list_documents(WebApiAuth, {"kb_id": kb_id, "id": "non_existent_doc_id"}) assert res["code"] == 102 - assert "metadata must be an object" in res["message"] - - def test_metadata_condition_empty_result(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) - monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda *_args, **_kwargs: {}) - monkeypatch.setattr(module, "meta_filter", lambda *_args, **_kwargs: 
set()) - - async def fake_request_json(): - return {"metadata_condition": {"conditions": [{"name": "author", "comparison_operator": "is", "value": "alice"}]}} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 0 - assert res["data"]["total"] == 0 - - def test_metadata_values_intersection(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) - metas = { - "author": {"alice": ["doc1", "doc2"]}, - "topic": {"rag": ["doc2"]}, - } - monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda *_args, **_kwargs: metas) - - captured = {} - - def fake_get_by_kb_id(*_args, **_kwargs): - if len(_args) >= 10: - captured["doc_ids_filter"] = _args[9] - else: - captured["doc_ids_filter"] = None - return ([{"id": "doc2", "thumbnail": "", "parser_config": {}}], 1) - - monkeypatch.setattr(module.DocumentService, "get_by_kb_id", fake_get_by_kb_id) - - async def fake_request_json(): - return {"metadata": {"author": ["alice", " ", None], "topic": "rag"}} + assert "You don't own the document" in res["message"] - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 0 - assert captured["doc_ids_filter"] == ["doc2"] - - def test_metadata_intersection_empty(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) - metas = { - "author": {"alice": ["doc1"]}, - "topic": {"rag": ["doc2"]}, - } - monkeypatch.setattr(module.DocMetadataService, "get_flatted_meta_by_kbs", lambda *_args, **_kwargs: metas) - - async def fake_request_json(): - return {"metadata": {"author": "alice", "topic": "rag"}} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 0 - 
assert res["data"]["total"] == 0 - - def test_desc_time_and_schema(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1", desc="false", create_time_from="150", create_time_to="250") - self._allow_kb(module, monkeypatch) - - docs = [ - {"id": "doc1", "thumbnail": "", "parser_config": {"metadata": {"a": 1}}, "create_time": 100}, - {"id": "doc2", "thumbnail": "", "parser_config": {"metadata": {"b": 2}}, "create_time": 200}, - ] - - def fake_get_by_kb_id(*_args, **_kwargs): - return (docs, 2) - - monkeypatch.setattr(module.DocumentService, "get_by_kb_id", fake_get_by_kb_id) - monkeypatch.setattr(module, "turn2jsonschema", lambda _meta: {"schema": True}) - - async def fake_request_json(): - return {} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) + @pytest.mark.p3 + def test_create_time_filter(self, WebApiAuth, add_documents): + """Test create time range filter.""" + kb_id, _ = add_documents + # Get current time range + res = list_documents(WebApiAuth, {"kb_id": kb_id}) assert res["code"] == 0 - assert len(res["data"]["docs"]) == 1 - assert res["data"]["docs"][0]["parser_config"]["metadata"] == {"schema": True} - - def test_exception_path(self, document_app_module, monkeypatch): - module = document_app_module - self._set_args(module, monkeypatch, id="kb1") - self._allow_kb(module, monkeypatch) + if res["data"]["docs"]: + create_time = res["data"]["docs"][0].get("create_time", 0) + # Test with time range that should include the document + res = list_documents(WebApiAuth, {"kb_id": kb_id, "create_time_from": 0, "create_time_to": create_time + 1000}) + assert res["code"] == 0 + assert len(res["data"]["docs"]) > 0 + # Test with time range that should not include the document + res = list_documents(WebApiAuth, {"kb_id": kb_id, "create_time_from": create_time + 1000, "create_time_to": create_time + 2000}) + assert res["code"] == 0 + assert 
len(res["data"]["docs"]) == 0 - def raise_error(*_args, **_kwargs): - raise RuntimeError("boom") - - monkeypatch.setattr(module.DocumentService, "get_by_kb_id", raise_error) - - async def fake_request_json(): - return {} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - res = _run(module.list_docs()) - assert res["code"] == 100 diff --git a/test/testcases/test_web_api/test_document_app/test_paser_documents.py b/test/testcases/test_web_api/test_document_app/test_paser_documents.py index a5cdfc32f02..79d6e26976f 100644 --- a/test/testcases/test_web_api/test_document_app/test_paser_documents.py +++ b/test/testcases/test_web_api/test_document_app/test_paser_documents.py @@ -30,29 +30,28 @@ def _run(coro): @wait_for(30, 1, "Document parsing timeout") def condition(_auth, _kb_id, _document_ids=None): - res = list_documents(_auth, {"id": _kb_id}) + res = list_documents(_auth, {"kb_id": _kb_id}) target_docs = res["data"]["docs"] - if _document_ids is None: for doc in target_docs: - if doc["run"] != "3": + if doc["run"] != "DONE": return False return True target_ids = set(_document_ids) for doc in target_docs: if doc["id"] in target_ids: - if doc.get("run") != "3": + if doc.get("run") != "DONE": return False return True def validate_document_parse_done(auth, _kb_id, _document_ids): - res = list_documents(auth, {"id": _kb_id}) + res = list_documents(auth, {"kb_id": _kb_id}) for doc in res["data"]["docs"]: if doc["id"] not in _document_ids: continue - assert doc["run"] == "3" + assert doc["run"] == "DONE" assert len(doc["process_begin_at"]) > 0 assert doc["process_duration"] > 0 assert doc["progress"] > 0 @@ -60,11 +59,11 @@ def validate_document_parse_done(auth, _kb_id, _document_ids): def validate_document_parse_cancel(auth, _kb_id, _document_ids): - res = list_documents(auth, {"id": _kb_id}) + res = list_documents(auth, {"kb_id": _kb_id}) for doc in res["data"]["docs"]: if doc["id"] not in _document_ids: continue - assert doc["run"] == "2" + assert 
doc["run"] == "CANCEL" assert len(doc["process_begin_at"]) > 0 assert doc["progress"] == 0.0 @@ -151,9 +150,9 @@ def test_duplicate_parse(self, WebApiAuth, add_documents_func): def test_parse_100_files(WebApiAuth, add_dataset_func, tmp_path): @wait_for(100, 1, "Document parsing timeout") def condition(_auth, _kb_id, _document_num): - res = list_documents(_auth, {"id": _kb_id, "page_size": _document_num}) + res = list_documents(_auth, {"kb_id": _kb_id, "page_size": _document_num}) for doc in res["data"]["docs"]: - if doc["run"] != "3": + if doc["run"] != "DONE": return False return True @@ -172,7 +171,7 @@ def condition(_auth, _kb_id, _document_num): def test_concurrent_parse(WebApiAuth, add_dataset_func, tmp_path): @wait_for(120, 1, "Document parsing timeout") def condition(_auth, _kb_id, _document_num): - res = list_documents(_auth, {"id": _kb_id, "page_size": _document_num}) + res = list_documents(_auth, {"kb_id": _kb_id, "page_size": _document_num}) for doc in res["data"]["docs"]: if doc["run"] != "3": return False @@ -305,15 +304,16 @@ class TestDocumentsParseStop: def test_basic_scenarios(self, WebApiAuth, add_documents_func, payload, expected_code, expected_message): @wait_for(30, 1, "Document parsing timeout") def condition(_auth, _kb_id, _doc_ids): - res = list_documents(_auth, {"id": _kb_id}) + res = list_documents(_auth, {"kb_id": _kb_id}) for doc in res["data"]["docs"]: if doc["id"] in _doc_ids: - if doc["run"] != "3": + if doc["run"] != "DONE": return False return True kb_id, document_ids = add_documents_func - parse_documents(WebApiAuth, {"doc_ids": document_ids, "run": "1"}) + parse_documents(WebApiAuth, {"doc_ids": document_ids, "run": + "1"}) if callable(payload): payload = payload(document_ids) diff --git a/test/testcases/test_web_api/test_document_app/test_rm_documents.py b/test/testcases/test_web_api/test_document_app/test_rm_documents.py index 7a19259e582..81a8e76aef5 100644 --- 
a/test/testcases/test_web_api/test_document_app/test_rm_documents.py +++ b/test/testcases/test_web_api/test_document_app/test_rm_documents.py @@ -63,7 +63,7 @@ def test_basic_scenarios(self, WebApiAuth, add_documents_func, payload, expected if res["code"] != 0: assert res["message"] == expected_message, res - res = list_documents(WebApiAuth, {"id": kb_id}) + res = list_documents(WebApiAuth, {"kb_id": kb_id}) assert len(res["data"]["docs"]) == remaining, res assert res["data"]["total"] == remaining, res @@ -124,12 +124,12 @@ def test_delete_100(WebApiAuth, add_dataset, tmp_path): documents_num = 100 kb_id = add_dataset document_ids = bulk_upload_documents(WebApiAuth, kb_id, documents_num, tmp_path) - res = list_documents(WebApiAuth, {"id": kb_id}) + res = list_documents(WebApiAuth, {"kb_id": kb_id}) assert res["data"]["total"] == documents_num, res for doc_id in document_ids: res = delete_document(WebApiAuth, {"doc_id": doc_id}) assert res["code"] == 0, res - res = list_documents(WebApiAuth, {"id": kb_id}) + res = list_documents(WebApiAuth, {"kb_id": kb_id}) assert res["data"]["total"] == 0, res diff --git a/test/testcases/test_web_api/test_kb_app/test_kb_pipeline_tasks.py b/test/testcases/test_web_api/test_kb_app/test_kb_pipeline_tasks.py index dea41652933..a4dfe50c773 100644 --- a/test/testcases/test_web_api/test_kb_app/test_kb_pipeline_tasks.py +++ b/test/testcases/test_web_api/test_kb_app/test_kb_pipeline_tasks.py @@ -75,7 +75,7 @@ def _condition(): def _wait_for_docs_parsed(auth, kb_id, timeout=60): @wait_for(timeout, 2, "Document parsing timeout") def _condition(): - res = list_documents(auth, {"id": kb_id}) + res = list_documents(auth, {"kb_id": kb_id}) if res["code"] != 0: return False for doc in res["data"]["docs"]: diff --git a/web/src/constants/knowledge.ts b/web/src/constants/knowledge.ts index 58f8ffac18c..9b26f68ed05 100644 --- a/web/src/constants/knowledge.ts +++ b/web/src/constants/knowledge.ts @@ -8,12 +8,12 @@ export enum KnowledgeRouteKey { export 
const DatasetBaseKey = 'dataset'; export enum RunningStatus { - UNSTART = '0', // need to run - RUNNING = '1', // need to cancel - CANCEL = '2', // need to refresh - DONE = '3', // need to refresh - FAIL = '4', // need to refresh - SCHEDULE = '5', + UNSTART = 'UNSTART', // need to run + RUNNING = 'RUNNING', // need to cancel + CANCEL = 'CANCEL', // need to refresh + DONE = 'DONE', // need to refresh + FAIL = 'FAIL', // need to refresh + SCHEDULE = 'SCHEDULE', } export const RunningStatusMap = { diff --git a/web/src/interfaces/database/document.ts b/web/src/interfaces/database/document.ts index eb67490a2ad..9b50e4048dc 100644 --- a/web/src/interfaces/database/document.ts +++ b/web/src/interfaces/database/document.ts @@ -7,7 +7,7 @@ export interface IDocumentInfo { created_by: string; nickname: string; id: string; - kb_id: string; + dataset_id: string; location: string; name: string; parser_config: IParserConfig; diff --git a/web/src/pages/dataset/dataset/use-dataset-table-columns.tsx b/web/src/pages/dataset/dataset/use-dataset-table-columns.tsx index f053fb6672a..076adc63432 100644 --- a/web/src/pages/dataset/dataset/use-dataset-table-columns.tsx +++ b/web/src/pages/dataset/dataset/use-dataset-table-columns.tsx @@ -94,7 +94,7 @@ export function useDatasetTableColumns({ className="flex items-center gap-2 cursor-pointer" onClick={navigateToChunkParsedResult( row.original.id, - row.original.kb_id, + row.original.dataset_id, )} > diff --git a/web/src/pages/dataset/dataset/use-rename-document.ts b/web/src/pages/dataset/dataset/use-rename-document.ts index 95eb7b4e84a..dbebd213ec1 100644 --- a/web/src/pages/dataset/dataset/use-rename-document.ts +++ b/web/src/pages/dataset/dataset/use-rename-document.ts @@ -15,18 +15,18 @@ export const useRenameDocument = () => { const onRenameOk = useCallback( async (name: string) => { - if (record?.id && record?.kb_id) { + if (record?.id && record?.dataset_id) { const ret = await saveName({ documentId: record.id, name, - kbId: 
record.kb_id, + kbId: record.dataset_id, }); if (ret === 0) { hideRenameModal(); } } }, - [record?.id, record?.kb_id, saveName, hideRenameModal], + [record?.id, record?.dataset_id, saveName, hideRenameModal], ); const handleShow = useCallback( diff --git a/web/src/services/knowledge-service.ts b/web/src/services/knowledge-service.ts index 14c5de613dc..93e0a21dc9c 100644 --- a/web/src/services/knowledge-service.ts +++ b/web/src/services/knowledge-service.ts @@ -241,10 +241,25 @@ export const runRaptor = (datasetId: string) => export const traceRaptor = (datasetId: string) => request.get(api.traceRaptor(datasetId)); +// Using RESTful API: GET /api/v1/datasets/{dataset_id}/documents export const listDocument = ( params?: IFetchKnowledgeListRequestParams, body?: IFetchDocumentListRequestBody, -) => request.post(api.getDocumentList, { data: body || {}, params }); +) => { + if (!params || !params.id) { + throw new Error('params and params.id are required'); + } + // Extract page, page_size, and ext.keywords from params + const { page, page_size, ext } = params; + // Merge: page, page_size, keywords (from ext), body, and remaining params + const mergedParams = { + page, + page_size, + keywords: ext?.keywords, + ...body, + }; + return request.get(api.getDocumentList(params.id), { params: mergedParams }); +}; export const documentFilter = (kb_id: string) => request.post(api.getDatasetFilter, { kb_id }); diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index 618b4b2be8b..0dcf5d8aa3d 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -106,7 +106,8 @@ export default { knowledgeGraph: `${webAPI}/chunk/knowledge_graph`, // document - getDocumentList: `${webAPI}/document/list`, + getDocumentList: (datasetId: string) => + `${restAPIv1}/datasets/${datasetId}/documents`, documentChangeStatus: `${webAPI}/document/change_status`, documentRm: `${webAPI}/document/rm`, documentDelete: `${webAPI}/api/document`,