Skip to content

Commit aafb99c

Browse files
authored
Merge branch 'infiniflow:main' into main
2 parents ddd5898 + 5347295 commit aafb99c

File tree

182 files changed

+11717
-1932
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

182 files changed

+11717
-1932
lines changed

admin/client/parser.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@
8484
| list_user_chats
8585
| create_user_chat
8686
| drop_user_chat
87+
| create_index
88+
| drop_index
89+
| create_doc_meta_index
90+
| drop_doc_meta_index
8791
| list_user_model_providers
8892
| list_user_default_models
8993
| parse_dataset_docs
@@ -176,6 +180,7 @@
176180
INTO: "INTO"i
177181
IN: "IN"i
178182
WITH: "WITH"i
183+
VECTOR_SIZE: "VECTOR_SIZE"i
179184
PARSER: "PARSER"i
180185
PIPELINE: "PIPELINE"i
181186
SEARCH: "SEARCH"i
@@ -197,6 +202,8 @@
197202
LICENSE: "LICENSE"i
198203
CHECK: "CHECK"i
199204
CONFIG: "CONFIG"i
205+
INDEX: "INDEX"i
206+
DOC_META: "DOC_META"i
200207
CHUNK: "CHUNK"i
201208
CHUNKS: "CHUNKS"i
202209
GET: "GET"i
@@ -323,6 +330,10 @@
323330
list_user_chats: LIST CHATS ";"
324331
create_user_chat: CREATE CHAT quoted_string ";"
325332
drop_user_chat: DROP CHAT quoted_string ";"
333+
create_index: CREATE INDEX FOR DATASET quoted_string VECTOR_SIZE NUMBER ";"
334+
drop_index: DROP INDEX FOR DATASET quoted_string ";"
335+
create_doc_meta_index: CREATE INDEX DOC_META ";"
336+
drop_doc_meta_index: DROP INDEX DOC_META ";"
326337
create_chat_session: CREATE CHAT quoted_string SESSION ";"
327338
drop_chat_session: DROP CHAT quoted_string SESSION quoted_string ";"
328339
list_chat_sessions: LIST CHAT quoted_string SESSIONS ";"
@@ -650,6 +661,31 @@ def drop_user_chat(self, items):
650661
chat_name = items[2].children[0].strip("'\"")
651662
return {"type": "drop_user_chat", "chat_name": chat_name}
652663

664+
def create_index(self, items):
665+
# items: CREATE, INDEX, FOR, DATASET, quoted_string, VECTOR_SIZE, NUMBER, ";"
666+
dataset_name = None
667+
vector_size = None
668+
for i, item in enumerate(items):
669+
if hasattr(item, 'data') and item.data == 'quoted_string':
670+
dataset_name = item.children[0].strip("'\"")
671+
if hasattr(item, 'type') and item.type == 'NUMBER':
672+
if i > 0 and items[i-1].type == 'VECTOR_SIZE':
673+
vector_size = int(item)
674+
return {"type": "create_index", "dataset_name": dataset_name, "vector_size": vector_size}
675+
676+
def drop_index(self, items):
677+
dataset_name = None
678+
for item in items:
679+
if hasattr(item, 'data') and item.data == 'quoted_string':
680+
dataset_name = item.children[0].strip("'\"")
681+
return {"type": "drop_index", "dataset_name": dataset_name}
682+
683+
def create_doc_meta_index(self, items):
684+
return {"type": "create_doc_meta_index"}
685+
686+
def drop_doc_meta_index(self, items):
687+
return {"type": "drop_doc_meta_index"}
688+
653689
def list_user_model_providers(self, items):
654690
return {"type": "list_user_model_providers"}
655691

admin/client/ragflow_client.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,75 @@ def create_user_chat(self, command):
10801080
else:
10811081
print(f"Fail to create chat {chat_name}, code: {res_json['code']}, message: {res_json['message']}")
10821082

1083+
def create_index(self, command):
1084+
if self.server_type != "user":
1085+
print("This command is only allowed in USER mode")
1086+
return
1087+
dataset_name = command["dataset_name"]
1088+
vector_size = command.get("vector_size")
1089+
if not vector_size:
1090+
print("vector_size is required")
1091+
return
1092+
# Get dataset ID by name
1093+
dataset_id = self._get_dataset_id(dataset_name)
1094+
if dataset_id is None:
1095+
return
1096+
# Build payload
1097+
payload = {"kb_id": dataset_id, "vector_size": vector_size}
1098+
# Call API
1099+
response = self.http_client.request("POST", "/kb/index", json_body=payload,
1100+
use_api_base=False, auth_kind="web")
1101+
res_json = response.json()
1102+
if response.status_code == 200 and res_json.get("code") == 0:
1103+
print(f"Success to create index for dataset: {dataset_name}")
1104+
else:
1105+
print(f"Fail to create index for dataset {dataset_name}, code: {res_json.get('code')}, message: {res_json.get('message')}")
1106+
1107+
def drop_index(self, command):
1108+
if self.server_type != "user":
1109+
print("This command is only allowed in USER mode")
1110+
return
1111+
dataset_name = command["dataset_name"]
1112+
# Get dataset ID by name
1113+
dataset_id = self._get_dataset_id(dataset_name)
1114+
if dataset_id is None:
1115+
return
1116+
# Call API to delete index
1117+
payload = {"kb_id": dataset_id}
1118+
response = self.http_client.request("DELETE", "/kb/index", json_body=payload,
1119+
use_api_base=False, auth_kind="web")
1120+
res_json = response.json()
1121+
if response.status_code == 200 and res_json.get("code") == 0:
1122+
print(f"Success to drop index for dataset: {dataset_name}")
1123+
else:
1124+
print(f"Fail to drop index for dataset {dataset_name}, code: {res_json.get('code')}, message: {res_json.get('message')}")
1125+
1126+
def create_doc_meta_index(self, command):
1127+
if self.server_type != "user":
1128+
print("This command is only allowed in USER mode")
1129+
return
1130+
# Call API to create doc meta index
1131+
response = self.http_client.request("POST", "/tenant/doc_meta_index",
1132+
use_api_base=False, auth_kind="web")
1133+
res_json = response.json()
1134+
if response.status_code == 200 and res_json.get("code") == 0:
1135+
print("Success to create doc meta index")
1136+
else:
1137+
print(f"Fail to create doc meta index, code: {res_json.get('code')}, message: {res_json.get('message')}")
1138+
1139+
def drop_doc_meta_index(self, command):
1140+
if self.server_type != "user":
1141+
print("This command is only allowed in USER mode")
1142+
return
1143+
# Call API to delete doc meta index
1144+
response = self.http_client.request("DELETE", "/tenant/doc_meta_index",
1145+
use_api_base=False, auth_kind="web")
1146+
res_json = response.json()
1147+
if response.status_code == 200 and res_json.get("code") == 0:
1148+
print("Success to drop doc meta index")
1149+
else:
1150+
print(f"Fail to drop doc meta index, code: {res_json.get('code')}, message: {res_json.get('message')}")
1151+
10831152
def drop_user_chat(self, command):
10841153
if self.server_type != "user":
10851154
print("This command is only allowed in USER mode")
@@ -1804,6 +1873,14 @@ def run_command(client: RAGFlowClient, command_dict: dict):
18041873
client.create_user_chat(command_dict)
18051874
case "drop_user_chat":
18061875
client.drop_user_chat(command_dict)
1876+
case "create_index":
1877+
client.create_index(command_dict)
1878+
case "drop_index":
1879+
client.drop_index(command_dict)
1880+
case "create_doc_meta_index":
1881+
client.create_doc_meta_index(command_dict)
1882+
case "drop_doc_meta_index":
1883+
client.drop_doc_meta_index(command_dict)
18071884
case "create_chat_session":
18081885
client.create_chat_session(command_dict)
18091886
case "drop_chat_session":
@@ -1887,6 +1964,10 @@ def show_help():
18871964
LIST METADATA SUMMARY OF DATASET <dataset> DOCUMENTS <doc_id>[, <doc_id>]*
18881965
GET CHUNK <chunk_id>
18891966
LIST CHUNKS OF DOCUMENT <doc_id> [PAGE <page>] [SIZE <size>] [KEYWORDS <keywords>] [AVAILABLE <0|1>]
1967+
CREATE INDEX FOR DATASET <dataset> VECTOR_SIZE <vector_size>
1968+
DROP INDEX FOR DATASET <dataset>
1969+
CREATE INDEX DOC_META
1970+
DROP INDEX DOC_META
18901971
18911972
Meta Commands:
18921973
\\?, \\h, \\help Show this help

api/apps/chunk_app.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ async def set():
155155
d["question_kwd"] = req["question_kwd"]
156156
d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
157157
if "tag_kwd" in req:
158+
if not isinstance(req["tag_kwd"], list):
159+
return get_data_error_result(message="`tag_kwd` should be a list")
160+
if not all(isinstance(t, str) for t in req["tag_kwd"]):
161+
return get_data_error_result(message="`tag_kwd` must be a list of strings")
158162
d["tag_kwd"] = req["tag_kwd"]
159163
if "tag_feas" in req:
160164
d["tag_feas"] = req["tag_feas"]
@@ -317,6 +321,12 @@ async def create():
317321
d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
318322
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
319323
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
324+
if "tag_kwd" in req:
325+
if not isinstance(req["tag_kwd"], list):
326+
return get_data_error_result(message="`tag_kwd` is required to be a list")
327+
if not all(isinstance(t, str) for t in req["tag_kwd"]):
328+
return get_data_error_result(message="`tag_kwd` must be a list of strings")
329+
d["tag_kwd"] = req["tag_kwd"]
320330
if "tag_feas" in req:
321331
d["tag_feas"] = req["tag_feas"]
322332
image_base64 = req.get("image_base64", None)

api/apps/file2document_app.py

Lines changed: 75 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
# limitations under the License
1515
#
1616

17+
import asyncio
18+
import logging
1719
from pathlib import Path
1820

1921
from api.db.services.file2document_service import File2DocumentService
@@ -28,73 +30,92 @@
2830
from api.db.services.document_service import DocumentService
2931

3032

33+
def _convert_files(file_ids, kb_ids, user_id):
34+
"""Synchronous worker: delete old docs and insert new ones for the given file/kb pairs."""
35+
for id in file_ids:
36+
informs = File2DocumentService.get_by_file_id(id)
37+
for inform in informs:
38+
doc_id = inform.document_id
39+
e, doc = DocumentService.get_by_id(doc_id)
40+
if not e:
41+
continue
42+
tenant_id = DocumentService.get_tenant_id(doc_id)
43+
if not tenant_id:
44+
logging.warning("tenant_id not found for doc_id=%s, skipping remove_document", doc_id)
45+
continue
46+
DocumentService.remove_document(doc, tenant_id)
47+
File2DocumentService.delete_by_file_id(id)
48+
49+
e, file = FileService.get_by_id(id)
50+
if not e:
51+
continue
52+
53+
for kb_id in kb_ids:
54+
e, kb = KnowledgebaseService.get_by_id(kb_id)
55+
if not e:
56+
continue
57+
doc = DocumentService.insert({
58+
"id": get_uuid(),
59+
"kb_id": kb.id,
60+
"parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
61+
"pipeline_id": kb.pipeline_id,
62+
"parser_config": kb.parser_config,
63+
"created_by": user_id,
64+
"type": file.type,
65+
"name": file.name,
66+
"suffix": Path(file.name).suffix.lstrip("."),
67+
"location": file.location,
68+
"size": file.size
69+
})
70+
File2DocumentService.insert({
71+
"id": get_uuid(),
72+
"file_id": id,
73+
"document_id": doc.id,
74+
})
75+
76+
3177
@manager.route('/convert', methods=['POST']) # noqa: F821
3278
@login_required
3379
@validate_request("file_ids", "kb_ids")
3480
async def convert():
3581
req = await get_request_json()
3682
kb_ids = req["kb_ids"]
3783
file_ids = req["file_ids"]
38-
file2documents = []
3984

4085
try:
4186
files = FileService.get_by_ids(file_ids)
42-
files_set = dict({file.id: file for file in files})
87+
files_set = {file.id: file for file in files}
88+
89+
# Validate all files exist before starting any work
4390
for file_id in file_ids:
44-
file = files_set[file_id]
45-
if not file:
91+
if not files_set.get(file_id):
4692
return get_data_error_result(message="File not found!")
47-
file_ids_list = [file_id]
93+
94+
# Validate all kb_ids exist before scheduling background work
95+
for kb_id in kb_ids:
96+
e, _ = KnowledgebaseService.get_by_id(kb_id)
97+
if not e:
98+
return get_data_error_result(message="Can't find this dataset!")
99+
100+
# Expand folders to their innermost file IDs
101+
all_file_ids = []
102+
for file_id in file_ids:
103+
file = files_set[file_id]
48104
if file.type == FileType.FOLDER.value:
49-
file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
50-
for id in file_ids_list:
51-
informs = File2DocumentService.get_by_file_id(id)
52-
# delete
53-
for inform in informs:
54-
doc_id = inform.document_id
55-
e, doc = DocumentService.get_by_id(doc_id)
56-
if not e:
57-
return get_data_error_result(message="Document not found!")
58-
tenant_id = DocumentService.get_tenant_id(doc_id)
59-
if not tenant_id:
60-
return get_data_error_result(message="Tenant not found!")
61-
if not DocumentService.remove_document(doc, tenant_id):
62-
return get_data_error_result(
63-
message="Database error (Document removal)!")
64-
File2DocumentService.delete_by_file_id(id)
65-
66-
# insert
67-
for kb_id in kb_ids:
68-
e, kb = KnowledgebaseService.get_by_id(kb_id)
69-
if not e:
70-
return get_data_error_result(
71-
message="Can't find this dataset!")
72-
e, file = FileService.get_by_id(id)
73-
if not e:
74-
return get_data_error_result(
75-
message="Can't find this file!")
76-
77-
doc = DocumentService.insert({
78-
"id": get_uuid(),
79-
"kb_id": kb.id,
80-
"parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
81-
"pipeline_id": kb.pipeline_id,
82-
"parser_config": kb.parser_config,
83-
"created_by": current_user.id,
84-
"type": file.type,
85-
"name": file.name,
86-
"suffix": Path(file.name).suffix.lstrip("."),
87-
"location": file.location,
88-
"size": file.size
89-
})
90-
file2document = File2DocumentService.insert({
91-
"id": get_uuid(),
92-
"file_id": id,
93-
"document_id": doc.id,
94-
})
95-
96-
file2documents.append(file2document.to_json())
97-
return get_json_result(data=file2documents)
105+
all_file_ids.extend(FileService.get_all_innermost_file_ids(file_id, []))
106+
else:
107+
all_file_ids.append(file_id)
108+
109+
user_id = current_user.id
110+
# Run the blocking DB work in a thread so the event loop is not blocked.
111+
# For large folders this prevents 504 Gateway Timeout by returning as
112+
# soon as the background task is scheduled.
113+
loop = asyncio.get_running_loop()
114+
future = loop.run_in_executor(None, _convert_files, all_file_ids, kb_ids, user_id)
115+
future.add_done_callback(
116+
lambda f: logging.error("_convert_files failed: %s", f.exception()) if f.exception() else None
117+
)
118+
return get_json_result(data=True)
98119
except Exception as e:
99120
return server_error_response(e)
100121

0 commit comments

Comments
 (0)