Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions admin/client/ragflow_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1612,7 +1612,7 @@ def set_metadata(self, command_dict):
print(f"no document found for {doc_id}")
return

dataset_id = docs[0].get("kb_id")
dataset_id = docs[0].get("dataset_id")
if not dataset_id:
print(f"Dataset ID not found for document: {doc_id}")
return
Expand Down Expand Up @@ -1753,7 +1753,7 @@ def _wait_parse_done(self, dataset_name: str, dataset_id: str):
return False
all_done = True
for doc in docs:
if doc.get("run") != "3":
if doc.get("run") != "DONE":
print(f"Document {doc["name"]} is not done, status: {doc.get("run")}")
all_done = False
break
Expand All @@ -1764,8 +1764,13 @@ def _wait_parse_done(self, dataset_name: str, dataset_id: str):
time.sleep(0.5)

def _list_documents(self, dataset_name: str, dataset_id: str):
response = self.http_client.request("POST", f"/document/list?id={dataset_id}", use_api_base=False,
auth_kind="web")
# Use the new RESTful API: GET /api/v1/datasets/<dataset_id>/documents
response = self.http_client.request(
"GET",
f"/datasets/{dataset_id}/documents",
use_api_base=True,
auth_kind="web"
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
res_json = response.json()
if response.status_code != 200:
print(
Expand Down
134 changes: 0 additions & 134 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
from common import settings
from common.constants import SANDBOX_ARTIFACT_BUCKET, VALID_TASK_STATUS, ParserType, RetCode, TaskStatus
from common.file_utils import get_project_base_directory
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import get_uuid, thread_pool_exec
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
Expand Down Expand Up @@ -185,139 +184,6 @@ async def create():
return server_error_response(e)


@manager.route("/list", methods=["POST"]) # noqa: F821
@login_required
async def list_docs():
    """List the documents of one knowledge base (dataset), with optional filters.

    Query string: ``id`` (dataset id, required), ``keywords``, ``page``,
    ``page_size``, ``orderby``, ``desc``, ``create_time_from``, ``create_time_to``.
    JSON body: ``return_empty_metadata``, ``run_status``, ``types``, ``suffix``,
    ``metadata_condition``, ``metadata``.

    Returns a JSON result of the shape ``{"total": <int>, "docs": [...]}``,
    or an argument/operating error result on bad input or missing authorization.
    """
    kb_id = request.args.get("id")
    if not kb_id:
        return get_json_result(data=False, message='Dataset ID is required for listing files.', code=RetCode.ARGUMENT_ERROR)
    # Authorization: the current user must belong to at least one tenant that
    # owns this knowledge base; otherwise reject the request.
    tenants = UserTenantService.query(user_id=current_user.id)
    for tenant in tenants:
        if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
            break
    else:
        return get_json_result(data=False, message="Only owner of dataset authorized for this operation.", code=RetCode.OPERATING_ERROR)
    keywords = request.args.get("keywords", "")

    # Pagination/ordering. page=0 / page_size=0 presumably means "no paging"
    # inside DocumentService.get_by_kb_id — TODO confirm against that service.
    page_number = int(request.args.get("page", 0))
    items_per_page = int(request.args.get("page_size", 0))
    orderby = request.args.get("orderby", "create_time")
    # Descending order unless the caller explicitly passes desc=false.
    if request.args.get("desc", "true").lower() == "false":
        desc = False
    else:
        desc = True
    create_time_from = int(request.args.get("create_time_from", 0))
    create_time_to = int(request.args.get("create_time_to", 0))

    req = await get_request_json()

    # Flag may arrive as a bool or as the strings "true"/"false".
    return_empty_metadata = req.get("return_empty_metadata", False)
    if isinstance(return_empty_metadata, str):
        return_empty_metadata = return_empty_metadata.lower() == "true"

    # Validate the run-status filter against the known task statuses.
    run_status = req.get("run_status", [])
    if run_status:
        invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
        if invalid_status:
            return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(invalid_status)}")

    # Validate the file-type filter against the known file types.
    types = req.get("types", [])
    if types:
        invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
        if invalid_types:
            return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")

    suffix = req.get("suffix", [])
    metadata_condition = req.get("metadata_condition", {}) or {}
    metadata = req.get("metadata", {}) or {}
    # The reserved "empty_metadata" key inside the metadata filter switches the
    # request into "documents with no metadata" mode; the key itself is removed
    # from the filter dict.
    if isinstance(metadata, dict) and metadata.get("empty_metadata"):
        return_empty_metadata = True
        metadata = {k: v for k, v in metadata.items() if k != "empty_metadata"}
    if return_empty_metadata:
        # Empty-metadata mode ignores any other metadata filters entirely.
        metadata_condition = {}
        metadata = {}
    else:
        if metadata_condition and not isinstance(metadata_condition, dict):
            return get_data_error_result(message="metadata_condition must be an object.")
        if metadata and not isinstance(metadata, dict):
            return get_data_error_result(message="metadata must be an object.")

    # Resolve the metadata filters into an explicit set of matching document
    # ids (doc_ids_filter); None means "no metadata restriction".
    doc_ids_filter = None
    metas = None
    if metadata_condition or metadata:
        metas = DocMetadataService.get_flatted_meta_by_kbs([kb_id])

        if metadata_condition:
            # Structured condition filter: operators plus and/or logic.
            doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
            if metadata_condition.get("conditions") and not doc_ids_filter:
                # Conditions were supplied but matched nothing — short-circuit
                # with an empty result page.
                return get_json_result(data={"total": 0, "docs": []})

        if metadata:
            # Exact key/value filter: OR across the values of one key,
            # AND across different keys.
            metadata_doc_ids = None
            for key, values in metadata.items():
                if not values:
                    continue
                if not isinstance(values, list):
                    values = [values]
                # Normalize to non-blank strings; metas is keyed by string values.
                values = [str(v) for v in values if v is not None and str(v).strip()]
                if not values:
                    continue
                key_doc_ids = set()
                for value in values:
                    key_doc_ids.update(metas.get(key, {}).get(value, []))
                if metadata_doc_ids is None:
                    metadata_doc_ids = key_doc_ids
                else:
                    metadata_doc_ids &= key_doc_ids
                if not metadata_doc_ids:
                    # Intersection became empty — no document can match.
                    return get_json_result(data={"total": 0, "docs": []})
            if metadata_doc_ids is not None:
                # Combine with the condition-based filter by intersection.
                if doc_ids_filter is None:
                    doc_ids_filter = metadata_doc_ids
                else:
                    doc_ids_filter &= metadata_doc_ids
                if not doc_ids_filter:
                    return get_json_result(data={"total": 0, "docs": []})

    if doc_ids_filter is not None:
        doc_ids_filter = list(doc_ids_filter)

    try:
        docs, tol = DocumentService.get_by_kb_id(
            kb_id,
            page_number,
            items_per_page,
            orderby,
            desc,
            keywords,
            run_status,
            types,
            suffix,
            doc_ids_filter,
            return_empty_metadata=return_empty_metadata,
        )

        # NOTE(review): the create-time range filter is applied AFTER the
        # service call, so "total" still reflects the pre-filter count and a
        # page may come back shorter than page_size — confirm this is intended.
        if create_time_from or create_time_to:
            filtered_docs = []
            for doc in docs:
                doc_create_time = doc.get("create_time", 0)
                if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
                    filtered_docs.append(doc)
            docs = filtered_docs

        # Post-process each row for the client.
        for doc_item in docs:
            # Turn a bare thumbnail name into a servable image URL; already
            # base64-inlined thumbnails are left untouched.
            if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
                doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
            # Keep only the leading segment of a slash-separated source type.
            if doc_item.get("source_type"):
                doc_item["source_type"] = doc_item["source_type"].split("/")[0]
            # Expose parser metadata to the client as a JSON schema.
            if doc_item["parser_config"].get("metadata"):
                doc_item["parser_config"]["metadata"] = turn2jsonschema(doc_item["parser_config"]["metadata"])

        return get_json_result(data={"total": tol, "docs": docs})
    except Exception as e:
        return server_error_response(e)


@manager.route("/filter", methods=["POST"]) # noqa: F821
@login_required
async def get_filter():
Expand Down
Loading
Loading