Skip to content

Commit 2d05475

Browse files
authored
Refactor: Consolidate Web API & HTTP API for document infos (infiniflow#14239)
### What problem does this PR solve? Before consolidation: Web API -- POST /v1/document/infos; HTTP API -- GET /api/v1/datasets/<dataset_id>/documents. After consolidation: RESTful API -- GET /api/v1/datasets/<dataset_id>/documents?ids=id1&ids=id2 ### Type of change - [ ] Refactoring
1 parent 779dead commit 2d05475

9 files changed

Lines changed: 60 additions & 59 deletions

File tree

api/apps/document_app.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -183,22 +183,6 @@ async def create():
183183
return server_error_response(e)
184184

185185

186-
@manager.route("/infos", methods=["POST"]) # noqa: F821
187-
@login_required
188-
async def doc_infos():
189-
req = await get_request_json()
190-
doc_ids = req["doc_ids"]
191-
for doc_id in doc_ids:
192-
if not DocumentService.accessible(doc_id, current_user.id):
193-
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
194-
docs = DocumentService.get_by_ids(doc_ids)
195-
docs_list = list(docs.dicts())
196-
# Add meta_fields for each document
197-
for doc in docs_list:
198-
doc["meta_fields"] = DocMetadataService.get_document_metadata(doc["id"])
199-
return get_json_result(data=docs_list)
200-
201-
202186
@manager.route("/metadata/update", methods=["POST"]) # noqa: F821
203187
@login_required
204188
@validate_request("doc_ids")

api/apps/restful_apis/document_api.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,12 @@ def _get_docs_with_request(req, dataset_id:str):
527527
if doc_name and not DocumentService.query(name=doc_name, kb_id=dataset_id):
528528
return RetCode.DATA_ERROR, f"You don't own the document {doc_name}.", [], 0
529529

530+
doc_ids = q.getlist("ids")
531+
if doc_id and len(doc_ids) > 0:
532+
return RetCode.DATA_ERROR, f"Should not provide both 'id':{doc_id} and 'ids'{doc_ids}"
533+
if len(doc_ids) > 0:
534+
doc_ids_filter = doc_ids
535+
530536
docs, total = DocumentService.get_by_kb_id(dataset_id, page, page_size, orderby, desc, keywords, run_status_converted, types, suffix,
531537
name=doc_name, doc_ids=doc_ids_filter, return_empty_metadata=return_empty_metadata)
532538

sdk/python/ragflow_sdk/modules/dataset.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def upload_documents(self, document_list: list[dict]):
6666
def list_documents(
6767
self,
6868
id: str | None = None,
69+
ids: list[str] | None = None,
6970
name: str | None = None,
7071
keywords: str | None = None,
7172
page: int = 1,
@@ -75,6 +76,10 @@ def list_documents(
7576
create_time_from: int = 0,
7677
create_time_to: int = 0,
7778
):
79+
# Validate that id and ids are not used together
80+
if id and ids:
81+
raise ValueError("Cannot use both 'id' and 'ids' parameters at the same time.")
82+
7883
params = {
7984
"id": id,
8085
"name": name,
@@ -86,6 +91,10 @@ def list_documents(
8691
"create_time_from": create_time_from,
8792
"create_time_to": create_time_to,
8893
}
94+
# Handle ids parameter - convert to multiple query params
95+
if ids:
96+
for doc_id in ids:
97+
params.append(("ids", doc_id))
8998
res = self.get(f"/datasets/{self.id}/documents", params=params)
9099
res = res.json()
91100
documents = []

sdk/python/test/test_frontend_api/common.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,36 @@ def list_document(auth, dataset_id):
7575
return res.json()
7676

7777

78-
def get_docs_info(auth, doc_ids):
78+
def get_docs_info(auth, dataset_id, doc_ids=None, doc_id=None):
79+
"""
80+
Get document information by IDs.
81+
82+
Args:
83+
auth: Authorization header
84+
dataset_id: Dataset ID
85+
doc_ids: List of document IDs (use for multiple) - exclusive with doc_id
86+
doc_id: Single document ID (use for one) - exclusive with doc_ids
87+
88+
Raises:
89+
ValueError: If both doc_id and doc_ids are provided
90+
"""
91+
# Validate that id and ids are not used together
92+
if doc_id and doc_ids:
93+
raise ValueError("Cannot use both 'id' and 'ids' parameters at the same time.")
94+
7995
authorization = {"Authorization": auth}
80-
json_req = {"doc_ids": doc_ids}
81-
url = f"{HOST_ADDRESS}/v1/document/infos"
82-
res = requests.post(url=url, headers=authorization, json=json_req)
96+
params = {}
97+
if doc_ids:
98+
# Multiple IDs
99+
for id in doc_ids:
100+
params.append(("ids", id))
101+
elif doc_id:
102+
# Single ID
103+
params["id"] = doc_id
104+
105+
# Use /api/v1 prefix for dataset API
106+
url = f"{HOST_ADDRESS}/api/v1/datasets/{dataset_id}/documents"
107+
res = requests.get(url=url, headers=authorization, params=params)
83108
return res.json()
84109

85110

sdk/python/test/test_frontend_api/test_chunk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,14 @@ def test_parse_txt_document(get_auth):
4848
for doc in res['data']['docs']:
4949
doc_id_list.append(doc['id'])
5050

51-
res = get_docs_info(get_auth, doc_id_list)
51+
res = get_docs_info(get_auth, dataset_id, doc_ids=doc_id_list)
5252
print(doc_id_list)
5353
doc_count = len(doc_id_list)
5454
res = parse_docs(get_auth, doc_id_list)
5555

5656
start_ts = timer()
5757
while True:
58-
res = get_docs_info(get_auth, doc_id_list)
58+
res = get_docs_info(get_auth, dataset_id, doc_ids=doc_id_list)
5959
finished_count = 0
6060
for doc_info in res['data']:
6161
if doc_info['progress'] == 1:

test/testcases/test_web_api/test_common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,8 @@ def document_filter(auth, dataset_id, payload=None, *, headers=HEADERS, data=Non
397397
return res.json()
398398

399399

400-
def document_infos(auth, payload=None, *, headers=HEADERS, data=None):
401-
res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/infos", headers=headers, auth=auth, json=payload, data=data)
400+
def document_infos(auth, dataset_id, params=None, payload=None, *, headers=HEADERS, data=None):
401+
res = requests.get(url=f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/documents", params=params, json=payload, headers=headers, auth=auth, data=data)
402402
return res.json()
403403

404404

test/testcases/test_web_api/test_document_app/test_document_metadata.py

Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def test_filter_auth_invalid(self, invalid_auth, expected_code, expected_fragmen
4444
@pytest.mark.p2
4545
@pytest.mark.parametrize("invalid_auth, expected_code, expected_fragment", INVALID_AUTH_CASES)
4646
def test_infos_auth_invalid(self, invalid_auth, expected_code, expected_fragment):
47-
res = document_infos(invalid_auth, {"doc_ids": ["doc_id"]})
47+
res = document_infos(invalid_auth, "kb_id", {"doc_ids": ["doc_id"]})
4848
assert res["code"] == expected_code, res
4949
assert expected_fragment in res["message"], res
5050

@@ -91,11 +91,12 @@ def test_filter(self, WebApiAuth, add_dataset_func):
9191

9292
@pytest.mark.p2
9393
def test_infos(self, WebApiAuth, add_document_func):
94-
_, doc_id = add_document_func
95-
res = document_infos(WebApiAuth, {"doc_ids": [doc_id]})
94+
dataset_id, doc_id = add_document_func
95+
res = document_infos(WebApiAuth, dataset_id, {"ids": [doc_id]})
9696
assert res["code"] == 0, res
97-
assert len(res["data"]) == 1, res
98-
assert res["data"][0]["id"] == doc_id, res
97+
docs = res["data"]["docs"]
98+
assert len(docs) == 1, docs
99+
assert docs[0]["id"] == doc_id, res
99100

100101
## The inputs have been changed to add 'doc_ids'
101102
## TODO:
@@ -138,20 +139,22 @@ def test_infos(self, WebApiAuth, add_document_func):
138139

139140
@pytest.mark.p2
140141
def test_change_status(self, WebApiAuth, add_document_func):
141-
_, doc_id = add_document_func
142+
dataset_id, doc_id = add_document_func
142143
res = document_change_status(WebApiAuth, {"doc_ids": [doc_id], "status": "1"})
144+
143145
assert res["code"] == 0, res
144146
assert res["data"][doc_id]["status"] == "1", res
145-
info_res = document_infos(WebApiAuth, {"doc_ids": [doc_id]})
147+
info_res = document_infos(WebApiAuth, dataset_id, {"ids": [doc_id]})
148+
146149
assert info_res["code"] == 0, info_res
147-
assert info_res["data"][0]["status"] == "1", info_res
150+
assert info_res["data"]["docs"][0]["status"] == "1", info_res
148151

149152

150153
class TestDocumentMetadataNegative:
151154
@pytest.mark.p2
152155
def test_filter_missing_kb_id(self, WebApiAuth, add_document_func):
153156
kb_id, doc_id = add_document_func
154-
res = document_filter(WebApiAuth, "", {"doc_ids": [doc_id]})
157+
res = document_filter(WebApiAuth, "", {"ids": [doc_id]})
155158
assert res["code"] == 100, res
156159
assert "<MethodNotAllowed '405: Method Not Allowed'>" == res["message"], res
157160

@@ -228,26 +231,6 @@ def _allow_kb(self, module, monkeypatch, kb_id="kb1", tenant_id="tenant1"):
228231
monkeypatch.setattr(module.UserTenantService, "query", lambda **_kwargs: [SimpleNamespace(tenant_id=tenant_id)])
229232
monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: True if _kwargs.get("id") == kb_id else False)
230233

231-
232-
def test_infos_meta_fields(self, document_app_module, monkeypatch):
233-
module = document_app_module
234-
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True)
235-
236-
class _Docs:
237-
def dicts(self):
238-
return [{"id": "doc1"}]
239-
240-
monkeypatch.setattr(module.DocumentService, "get_by_ids", lambda _ids: _Docs())
241-
monkeypatch.setattr(module.DocMetadataService, "get_document_metadata", lambda _doc_id: {"author": "alice"})
242-
243-
async def fake_request_json():
244-
return {"doc_ids": ["doc1"]}
245-
246-
monkeypatch.setattr(module, "get_request_json", fake_request_json)
247-
res = _run(module.doc_infos())
248-
assert res["code"] == 0
249-
assert res["data"][0]["meta_fields"]["author"] == "alice"
250-
251234
def test_metadata_update_missing_kb_id(self, document_app_module, monkeypatch):
252235
module = document_app_module
253236

web/src/services/knowledge-service.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ const {
3434
documentUpload,
3535
webCrawl,
3636
knowledgeGraph,
37-
documentInfos,
3837
listTagByKnowledgeIds,
3938
setMeta,
4039
getMeta,
@@ -101,10 +100,6 @@ const methods = {
101100
url: webCrawl,
102101
method: 'post',
103102
},
104-
documentInfos: {
105-
url: documentInfos,
106-
method: 'post',
107-
},
108103
setMeta: {
109104
url: setMeta,
110105
method: 'post',

web/src/utils/api.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,6 @@ export default {
123123
documentUpload: (datasetId: string) =>
124124
`${restAPIv1}/datasets/${datasetId}/documents`,
125125
webCrawl: `${webAPI}/document/web_crawl`,
126-
documentInfos: `${webAPI}/document/infos`,
127126
uploadAndParse: `${webAPI}/document/upload_info`,
128127
setMeta: `${webAPI}/document/set_meta`,
129128
getDatasetFilter: (datasetId: string) =>

0 commit comments

Comments
 (0)