infiniflow
diff --git a/‎admin/client/parser.py‎
Lines changed: 36 additions & 0 deletions b/‎admin/client/parser.py‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎admin/client/ragflow_client.py‎
Lines changed: 81 additions & 0 deletions b/‎admin/client/ragflow_client.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎api/apps/chunk_app.py‎
Lines changed: 10 additions & 0 deletions b/‎api/apps/chunk_app.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎api/apps/file2document_app.py‎
Lines changed: 75 additions & 54 deletions b/‎api/apps/file2document_app.py‎
Lines changed: 75 additions & 54 deletions
@@ -84,6 +84,10 @@
            | list_user_chats
            | create_user_chat
            | drop_user_chat
+           | create_index
+           | drop_index
+           | create_doc_meta_index
+           | drop_doc_meta_index
            | list_user_model_providers
            | list_user_default_models
            | parse_dataset_docs
@@ -176,6 +180,7 @@
 INTO: "INTO"i
 IN: "IN"i
 WITH: "WITH"i
+VECTOR_SIZE: "VECTOR_SIZE"i
 PARSER: "PARSER"i
 PIPELINE: "PIPELINE"i
 SEARCH: "SEARCH"i
@@ -197,6 +202,8 @@
 LICENSE: "LICENSE"i
 CHECK: "CHECK"i
 CONFIG: "CONFIG"i
+INDEX: "INDEX"i
+DOC_META: "DOC_META"i
 CHUNK: "CHUNK"i
 CHUNKS: "CHUNKS"i
 GET: "GET"i
@@ -323,6 +330,10 @@
 list_user_chats: LIST CHATS ";"
 create_user_chat: CREATE CHAT quoted_string ";"
 drop_user_chat: DROP CHAT quoted_string ";"
+create_index: CREATE INDEX FOR DATASET quoted_string VECTOR_SIZE NUMBER ";"
+drop_index: DROP INDEX FOR DATASET quoted_string ";"
+create_doc_meta_index: CREATE INDEX DOC_META ";"
+drop_doc_meta_index: DROP INDEX DOC_META ";"
 create_chat_session: CREATE CHAT quoted_string SESSION ";"
 drop_chat_session: DROP CHAT quoted_string SESSION quoted_string ";"
 list_chat_sessions: LIST CHAT quoted_string SESSIONS ";"
@@ -650,6 +661,31 @@ def drop_user_chat(self, items):
         chat_name = items[2].children[0].strip("'\"")
         return {"type": "drop_user_chat", "chat_name": chat_name}
 
+    def create_index(self, items):
+        """Build the command dict for ``CREATE INDEX FOR DATASET <name> VECTOR_SIZE <n>``.
+
+        Scans ``items`` for a ``quoted_string`` subtree (the dataset name, with
+        surrounding quotes stripped) and for a ``NUMBER`` token that directly
+        follows a ``VECTOR_SIZE`` token (converted to ``int``).
+
+        Returns:
+            A dict with ``type``, ``dataset_name`` and ``vector_size``; either
+            value stays ``None`` when no matching item is found.
+        """
+        dataset_name = None
+        vector_size = None
+        for i, item in enumerate(items):
+            if getattr(item, 'data', None) == 'quoted_string':
+                dataset_name = item.children[0].strip("'\"")
+            if getattr(item, 'type', None) == 'NUMBER' and i > 0:
+                # The preceding item may be a subtree without a ``type``
+                # attribute, so read it defensively instead of raising.
+                if getattr(items[i - 1], 'type', None) == 'VECTOR_SIZE':
+                    vector_size = int(item)
+        return {"type": "create_index", "dataset_name": dataset_name, "vector_size": vector_size}
+
+    def drop_index(self, items):
+        """Build the command dict for ``DROP INDEX FOR DATASET <name>``.
+
+        The dataset name comes from the last ``quoted_string`` subtree found in
+        ``items``, with surrounding quotes stripped; it is ``None`` if no such
+        subtree is present.
+        """
+        names = [
+            node.children[0].strip("'\"")
+            for node in items
+            if getattr(node, 'data', None) == 'quoted_string'
+        ]
+        return {"type": "drop_index", "dataset_name": names[-1] if names else None}
+
+    def create_doc_meta_index(self, items):
+        """Return the ``create_doc_meta_index`` command dict; ``items`` is not read."""
+        command = {"type": "create_doc_meta_index"}
+        return command
+
+    def drop_doc_meta_index(self, items):
+        """Return the ``drop_doc_meta_index`` command dict; ``items`` is not read."""
+        command = {"type": "drop_doc_meta_index"}
+        return command
+
     def list_user_model_providers(self, items):
         """Return the ``list_user_model_providers`` command dict; ``items`` is not read."""
         return {"type": "list_user_model_providers"}
 
 
@@ -1080,6 +1080,75 @@ def create_user_chat(self, command):
         else:
             print(f"Fail to create chat {chat_name}, code: {res_json['code']}, message: {res_json['message']}")
 
+    def create_index(self, command):
+        """Ask the server to create an index for the dataset named in ``command``.
+
+        Reads ``command["dataset_name"]`` and ``command.get("vector_size")``,
+        resolves the dataset id through ``self._get_dataset_id`` and POSTs
+        ``kb_id`` plus ``vector_size`` to ``/kb/index``.  The outcome is printed
+        and nothing is returned.  Runs only when ``self.server_type`` is
+        ``"user"``.
+        """
+        if self.server_type != "user":
+            print("This command is only allowed in USER mode")
+            return
+
+        dataset_name = command["dataset_name"]
+        size = command.get("vector_size")
+        if not size:
+            print("vector_size is required")
+            return
+
+        kb_id = self._get_dataset_id(dataset_name)
+        if kb_id is None:
+            # NOTE(review): presumably _get_dataset_id reports the failure itself -- confirm.
+            return
+
+        resp = self.http_client.request(
+            "POST",
+            "/kb/index",
+            json_body={"kb_id": kb_id, "vector_size": size},
+            use_api_base=False,
+            auth_kind="web",
+        )
+        res_json = resp.json()
+        if resp.status_code != 200 or res_json.get("code") != 0:
+            print(f"Fail to create index for dataset {dataset_name}, code: {res_json.get('code')}, message: {res_json.get('message')}")
+            return
+        print(f"Success to create index for dataset: {dataset_name}")
+
+    def drop_index(self, command):
+        """Ask the server to drop the index of the dataset named in ``command``.
+
+        Reads ``command["dataset_name"]``, resolves the dataset id through
+        ``self._get_dataset_id`` and sends a DELETE with ``kb_id`` to
+        ``/kb/index``.  The outcome is printed and nothing is returned.  Runs
+        only when ``self.server_type`` is ``"user"``.
+        """
+        if self.server_type != "user":
+            print("This command is only allowed in USER mode")
+            return
+
+        dataset_name = command["dataset_name"]
+        kb_id = self._get_dataset_id(dataset_name)
+        if kb_id is None:
+            # NOTE(review): presumably _get_dataset_id reports the failure itself -- confirm.
+            return
+
+        resp = self.http_client.request(
+            "DELETE",
+            "/kb/index",
+            json_body={"kb_id": kb_id},
+            use_api_base=False,
+            auth_kind="web",
+        )
+        res_json = resp.json()
+        if resp.status_code != 200 or res_json.get("code") != 0:
+            print(f"Fail to drop index for dataset {dataset_name}, code: {res_json.get('code')}, message: {res_json.get('message')}")
+            return
+        print(f"Success to drop index for dataset: {dataset_name}")
+
+    def create_doc_meta_index(self, command):
+        """POST to ``/tenant/doc_meta_index`` and print whether it succeeded.
+
+        ``command`` is not read.  Runs only when ``self.server_type`` is
+        ``"user"``; nothing is returned.
+        """
+        if self.server_type != "user":
+            print("This command is only allowed in USER mode")
+            return
+
+        resp = self.http_client.request(
+            "POST",
+            "/tenant/doc_meta_index",
+            use_api_base=False,
+            auth_kind="web",
+        )
+        res_json = resp.json()
+        if resp.status_code != 200 or res_json.get("code") != 0:
+            print(f"Fail to create doc meta index, code: {res_json.get('code')}, message: {res_json.get('message')}")
+            return
+        print("Success to create doc meta index")
+
+    def drop_doc_meta_index(self, command):
+        """Send DELETE to ``/tenant/doc_meta_index`` and print whether it succeeded.
+
+        ``command`` is not read.  Runs only when ``self.server_type`` is
+        ``"user"``; nothing is returned.
+        """
+        if self.server_type != "user":
+            print("This command is only allowed in USER mode")
+            return
+
+        resp = self.http_client.request(
+            "DELETE",
+            "/tenant/doc_meta_index",
+            use_api_base=False,
+            auth_kind="web",
+        )
+        res_json = resp.json()
+        if resp.status_code != 200 or res_json.get("code") != 0:
+            print(f"Fail to drop doc meta index, code: {res_json.get('code')}, message: {res_json.get('message')}")
+            return
+        print("Success to drop doc meta index")
+
     def drop_user_chat(self, command):
         if self.server_type != "user":
             print("This command is only allowed in USER mode")
@@ -1804,6 +1873,14 @@ def run_command(client: RAGFlowClient, command_dict: dict):
             client.create_user_chat(command_dict)
         case "drop_user_chat":
             client.drop_user_chat(command_dict)
+        case "create_index":
+            client.create_index(command_dict)
+        case "drop_index":
+            client.drop_index(command_dict)
+        case "create_doc_meta_index":
+            client.create_doc_meta_index(command_dict)
+        case "drop_doc_meta_index":
+            client.drop_doc_meta_index(command_dict)
         case "create_chat_session":
             client.create_chat_session(command_dict)
         case "drop_chat_session":
@@ -1887,6 +1964,10 @@ def show_help():
 LIST METADATA SUMMARY OF DATASET <dataset> DOCUMENTS <doc_id>[, <doc_id>]*
 GET CHUNK <chunk_id>
 LIST CHUNKS OF DOCUMENT <doc_id> [PAGE <page>] [SIZE <size>] [KEYWORDS <keywords>] [AVAILABLE <0|1>]
+CREATE INDEX FOR DATASET <dataset> VECTOR_SIZE <vector_size>
+DROP INDEX FOR DATASET <dataset>
+CREATE INDEX DOC_META
+DROP INDEX DOC_META
 
 Meta Commands:
 \\?, \\h, \\help     Show this help
 
@@ -155,6 +155,10 @@ async def set():
         d["question_kwd"] = req["question_kwd"]
         d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
     if "tag_kwd" in req:
+        if not isinstance(req["tag_kwd"], list):
+            return get_data_error_result(message="`tag_kwd` should be a list")
+        if not all(isinstance(t, str) for t in req["tag_kwd"]):
+            return get_data_error_result(message="`tag_kwd` must be a list of strings")
         d["tag_kwd"] = req["tag_kwd"]
     if "tag_feas" in req:
         d["tag_feas"] = req["tag_feas"]
@@ -317,6 +321,12 @@ async def create():
     d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+    if "tag_kwd" in req:
+        if not isinstance(req["tag_kwd"], list):
+            return get_data_error_result(message="`tag_kwd` is required to be a list")
+        if not all(isinstance(t, str) for t in req["tag_kwd"]):
+            return get_data_error_result(message="`tag_kwd` must be a list of strings")
+        d["tag_kwd"] = req["tag_kwd"]
     if "tag_feas" in req:
         d["tag_feas"] = req["tag_feas"]
     image_base64 = req.get("image_base64", None)
 
@@ -14,6 +14,8 @@
 #  limitations under the License
 #
 
+import asyncio
+import logging
 from pathlib import Path
 
 from api.db.services.file2document_service import File2DocumentService
@@ -28,73 +30,92 @@
 from api.db.services.document_service import DocumentService
 
 
+def _convert_files(file_ids, kb_ids, user_id):
+    """Synchronous worker: replace the documents linked to each file.
+
+    For every id in ``file_ids`` the documents currently linked through
+    ``File2DocumentService`` are removed and the links deleted; then one new
+    document and one new link are inserted per dataset in ``kb_ids``.
+    Documents, files and datasets that cannot be found are skipped with a
+    warning rather than aborting the run.
+
+    Args:
+        file_ids: ids of the files to convert.
+        kb_ids: ids of the target datasets.
+        user_id: stored as ``created_by`` on every inserted document.
+    """
+    for file_id in file_ids:
+        for inform in File2DocumentService.get_by_file_id(file_id):
+            doc_id = inform.document_id
+            e, doc = DocumentService.get_by_id(doc_id)
+            if not e:
+                logging.warning("document not found for doc_id=%s, skipping remove_document", doc_id)
+                continue
+            tenant_id = DocumentService.get_tenant_id(doc_id)
+            if not tenant_id:
+                logging.warning("tenant_id not found for doc_id=%s, skipping remove_document", doc_id)
+                continue
+            # Nobody is waiting on this worker, so a falsy removal result is
+            # logged instead of being silently discarded.
+            if not DocumentService.remove_document(doc, tenant_id):
+                logging.warning("remove_document failed for doc_id=%s", doc_id)
+        File2DocumentService.delete_by_file_id(file_id)
+
+        e, file = FileService.get_by_id(file_id)
+        if not e:
+            logging.warning("file not found for file_id=%s, skipping document insert", file_id)
+            continue
+
+        for kb_id in kb_ids:
+            e, kb = KnowledgebaseService.get_by_id(kb_id)
+            if not e:
+                logging.warning("dataset not found for kb_id=%s, skipping document insert", kb_id)
+                continue
+            doc = DocumentService.insert({
+                "id": get_uuid(),
+                "kb_id": kb.id,
+                "parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
+                "pipeline_id": kb.pipeline_id,
+                "parser_config": kb.parser_config,
+                "created_by": user_id,
+                "type": file.type,
+                "name": file.name,
+                "suffix": Path(file.name).suffix.lstrip("."),
+                "location": file.location,
+                "size": file.size
+            })
+            File2DocumentService.insert({
+                "id": get_uuid(),
+                "file_id": file_id,
+                "document_id": doc.id,
+            })
+
+
 @manager.route('/convert', methods=['POST'])  # noqa: F821
 @login_required
 @validate_request("file_ids", "kb_ids")
 async def convert():
+    """Schedule conversion of the requested files into dataset documents.
+
+    Reads ``file_ids`` and ``kb_ids`` from the request JSON, checks that every
+    file and dataset exists, expands folders to their innermost file ids and
+    hands the work to ``_convert_files`` on the default executor.  Responds
+    with ``data=True`` as soon as the work is scheduled; it does not wait for
+    the conversion to finish.
+    """
     req = await get_request_json()
     kb_ids = req["kb_ids"]
     file_ids = req["file_ids"]
-    file2documents = []
 
     try:
         files = FileService.get_by_ids(file_ids)
-        files_set = dict({file.id: file for file in files})
+        files_set = {file.id: file for file in files}
+
+        # Validate all files exist before starting any work
         for file_id in file_ids:
-            file = files_set[file_id]
-            if not file:
+            if not files_set.get(file_id):
                 return get_data_error_result(message="File not found!")
-            file_ids_list = [file_id]
+
+        # Validate all kb_ids exist before scheduling background work
+        for kb_id in kb_ids:
+            e, _ = KnowledgebaseService.get_by_id(kb_id)
+            if not e:
+                return get_data_error_result(message="Can't find this dataset!")
+
+        # Expand folders to their innermost file IDs
+        all_file_ids = []
+        for file_id in file_ids:
+            file = files_set[file_id]
             if file.type == FileType.FOLDER.value:
-                file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
-            for id in file_ids_list:
-                informs = File2DocumentService.get_by_file_id(id)
-                # delete
-                for inform in informs:
-                    doc_id = inform.document_id
-                    e, doc = DocumentService.get_by_id(doc_id)
-                    if not e:
-                        return get_data_error_result(message="Document not found!")
-                    tenant_id = DocumentService.get_tenant_id(doc_id)
-                    if not tenant_id:
-                        return get_data_error_result(message="Tenant not found!")
-                    if not DocumentService.remove_document(doc, tenant_id):
-                        return get_data_error_result(
-                            message="Database error (Document removal)!")
-                File2DocumentService.delete_by_file_id(id)
-
-                # insert
-                for kb_id in kb_ids:
-                    e, kb = KnowledgebaseService.get_by_id(kb_id)
-                    if not e:
-                        return get_data_error_result(
-                            message="Can't find this dataset!")
-                    e, file = FileService.get_by_id(id)
-                    if not e:
-                        return get_data_error_result(
-                            message="Can't find this file!")
-
-                    doc = DocumentService.insert({
-                        "id": get_uuid(),
-                        "kb_id": kb.id,
-                        "parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
-                        "pipeline_id": kb.pipeline_id,
-                        "parser_config": kb.parser_config,
-                        "created_by": current_user.id,
-                        "type": file.type,
-                        "name": file.name,
-                        "suffix": Path(file.name).suffix.lstrip("."),
-                        "location": file.location,
-                        "size": file.size
-                    })
-                    file2document = File2DocumentService.insert({
-                        "id": get_uuid(),
-                        "file_id": id,
-                        "document_id": doc.id,
-                    })
-
-                    file2documents.append(file2document.to_json())
-        return get_json_result(data=file2documents)
+                all_file_ids.extend(FileService.get_all_innermost_file_ids(file_id, []))
+            else:
+                all_file_ids.append(file_id)
+
+        def _log_failure(done):
+            """Done-callback: report how the background conversion ended."""
+            # Future.exception() raises CancelledError on a cancelled future,
+            # so cancellation has to be checked before asking for the exception.
+            if done.cancelled():
+                logging.warning("_convert_files was cancelled before completion")
+                return
+            exc = done.exception()
+            if exc is not None:
+                # exc_info attaches the worker's traceback to the log record.
+                logging.error("_convert_files failed: %s", exc, exc_info=exc)
+
+        user_id = current_user.id
+        # Run the blocking DB work in a thread so the event loop is not blocked.
+        # For large folders this prevents 504 Gateway Timeout by returning as
+        # soon as the background task is scheduled.
+        loop = asyncio.get_running_loop()
+        future = loop.run_in_executor(None, _convert_files, all_file_ids, kb_ids, user_id)
+        future.add_done_callback(_log_failure)
+        return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)