diff --git a/astrbot/core/db/vec_db/faiss_impl/document_storage.py b/astrbot/core/db/vec_db/faiss_impl/document_storage.py
index 58ec8dc1c5..84069ba52f 100644
--- a/astrbot/core/db/vec_db/faiss_impl/document_storage.py
+++ b/astrbot/core/db/vec_db/faiss_impl/document_storage.py
@@ -1,5 +1,6 @@
import json
import os
+from asyncio import Lock
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path
@@ -17,6 +18,7 @@
build_fts5_or_query,
load_stopwords,
to_fts5_search_text,
+ tokenize_text,
)
FTS_TABLE_NAME = "documents_fts"
@@ -58,44 +60,49 @@ def __init__(self, db_path: str) -> None:
self._fts_contentless_delete = False
self._fts_index_ready = False
self._stopwords: set[str] | None = None
+ self._fts_rebuild_lock = Lock()
async def initialize(self) -> None:
"""Initialize the SQLite database and create the documents table if it doesn't exist."""
await self.connect()
async with self.engine.begin() as conn: # type: ignore
await self._ensure_documents_table(conn)
-
- try:
- await conn.execute(
- text(
- "ALTER TABLE documents ADD COLUMN kb_doc_id TEXT "
- "GENERATED ALWAYS AS (json_extract(metadata, '$.kb_doc_id')) STORED",
- ),
- )
- await conn.execute(
- text(
- "ALTER TABLE documents ADD COLUMN user_id TEXT "
- "GENERATED ALWAYS AS (json_extract(metadata, '$.user_id')) STORED",
- ),
- )
-
- # Create indexes
- await conn.execute(
- text(
- "CREATE INDEX IF NOT EXISTS idx_documents_kb_doc_id ON documents(kb_doc_id)",
- ),
- )
- await conn.execute(
- text(
- "CREATE INDEX IF NOT EXISTS idx_documents_user_id ON documents(user_id)",
- ),
- )
- except BaseException:
- pass
+ await self._ensure_generated_columns(conn)
await self._initialize_fts5(conn)
await conn.commit()
+ async def _table_columns(self, executor, table_name: str) -> set[str]:
+ result = await executor.execute(text(f"PRAGMA table_xinfo({table_name})"))
+ return {row[1] for row in result.fetchall()}
+
+ async def _ensure_generated_columns(self, executor) -> None:
+ generated_columns = {
+ "kb_doc_id": "json_extract(metadata, '$.kb_doc_id')",
+ "user_id": "json_extract(metadata, '$.user_id')",
+ "kb_id": "json_extract(metadata, '$.kb_id')",
+ }
+ columns = await self._table_columns(executor, "documents")
+ for column_name, expression in generated_columns.items():
+ if column_name in columns:
+ continue
+ await executor.execute(
+ text(
+ f"ALTER TABLE documents ADD COLUMN {column_name} TEXT "
+ f"GENERATED ALWAYS AS ({expression}) VIRTUAL",
+ ),
+ )
+ columns.add(column_name)
+
+ index_statements = [
+ "CREATE INDEX IF NOT EXISTS idx_documents_kb_doc_id "
+ "ON documents(kb_doc_id)",
+ "CREATE INDEX IF NOT EXISTS idx_documents_user_id ON documents(user_id)",
+ "CREATE INDEX IF NOT EXISTS idx_documents_kb_id ON documents(kb_id)",
+ ]
+ for statement in index_statements:
+ await executor.execute(text(statement))
+
async def _ensure_documents_table(self, executor) -> None:
"""Create the document table from the SQLModel definition."""
result = await executor.execute(
@@ -302,11 +309,11 @@ async def get_documents(
async with self.get_session() as session:
query = select(Document)
-
- for key, val in metadata_filters.items():
- query = query.where(
- text(f"json_extract(metadata, '$.{key}') = :filter_{key}"),
- ).params(**{f"filter_{key}": val})
+ query = await self._apply_metadata_filters(
+ session,
+ query,
+ metadata_filters,
+ )
if ids is not None and len(ids) > 0:
valid_ids = [int(i) for i in ids if i != -1]
@@ -468,11 +475,11 @@ async def delete_documents(self, metadata_filters: dict) -> None:
async with self.get_session() as session, session.begin():
query = select(Document)
-
- for key, val in metadata_filters.items():
- query = query.where(
- text(f"json_extract(metadata, '$.{key}') = :filter_{key}"),
- ).params(**{f"filter_{key}": val})
+ query = await self._apply_metadata_filters(
+ session,
+ query,
+ metadata_filters,
+ )
result = await session.execute(query)
documents = result.scalars().all()
@@ -499,15 +506,144 @@ async def count_documents(self, metadata_filters: dict | None = None) -> int:
query = select(func.count(col(Document.id)))
if metadata_filters:
- for key, val in metadata_filters.items():
- query = query.where(
- text(f"json_extract(metadata, '$.{key}') = :filter_{key}"),
- ).params(**{f"filter_{key}": val})
+ query = await self._apply_metadata_filters(
+ session,
+ query,
+ metadata_filters,
+ )
result = await session.execute(query)
count = result.scalar_one_or_none()
return count if count is not None else 0
+ async def search_documents(
+ self,
+ query_text: str,
+ metadata_filters: dict | None = None,
+ offset: int = 0,
+ limit: int = 100,
+ ) -> tuple[list[dict], int] | None:
+ """Search documents with FTS5 and optional metadata filters.
+
+ Returns None when FTS5 is unavailable so callers can choose whether to
+ fall back to an alternate search strategy.
+ """
+ if limit <= 0:
+ return [], 0
+ if not await self.ensure_fts_index():
+ return None
+
+ match_query = build_fts5_or_query(tokenize_text(query_text, self.stopwords))
+ if not match_query:
+ return [], 0
+
+ metadata_filters = metadata_filters or {}
+ async with self.get_session() as session:
+ filters_sql, filter_params = await self._metadata_filter_sql(
+ session,
+ metadata_filters,
+ table_alias="d",
+ )
+ where_clause = f"{FTS_TABLE_NAME} MATCH :query"
+ if filters_sql:
+ where_clause = f"{where_clause} AND {' AND '.join(filters_sql)}"
+ params = {
+ "query": match_query,
+ "limit": int(limit),
+ "offset": int(offset),
+ **filter_params,
+ }
+ try:
+ count_result = await session.execute(
+ text(
+ f"""
+ SELECT count(*)
+ FROM {FTS_TABLE_NAME}
+ JOIN documents d ON d.id = {FTS_TABLE_NAME}.rowid
+ WHERE {where_clause}
+ """,
+ ),
+ params,
+ )
+ total = int(count_result.scalar_one_or_none() or 0)
+ result = await session.execute(
+ text(
+ f"""
+ SELECT
+ d.id AS id,
+ d.doc_id AS doc_id,
+ d.text AS text,
+ d.metadata AS metadata,
+ d.created_at AS created_at,
+ d.updated_at AS updated_at,
+ bm25({FTS_TABLE_NAME}) AS score
+ FROM {FTS_TABLE_NAME}
+ JOIN documents d ON d.id = {FTS_TABLE_NAME}.rowid
+ WHERE {where_clause}
+ ORDER BY score ASC, d.id ASC
+ LIMIT :limit
+ OFFSET :offset
+ """,
+ ),
+ params,
+ )
+ except Exception as e:
+ logger.warning(
+ f"FTS5 document search failed for {self.db_path}: {e}",
+ )
+ self.fts5_available = False
+ return None
+
+ rows = result.mappings().all()
+ return [
+ {
+ "id": row["id"],
+ "doc_id": row["doc_id"],
+ "text": row["text"],
+ "metadata": row["metadata"],
+ "created_at": row["created_at"],
+ "updated_at": row["updated_at"],
+ "score": float(row["score"]),
+ }
+ for row in rows
+ ], total
+
+ async def _apply_metadata_filters(
+ self,
+ session: AsyncSession,
+ query,
+ metadata_filters: dict,
+ ):
+ filters_sql, params = await self._metadata_filter_sql(
+ session,
+ metadata_filters,
+ )
+ for filter_sql in filters_sql:
+ query = query.where(text(filter_sql))
+ if params:
+ query = query.params(**params)
+ return query
+
+ async def _metadata_filter_sql(
+ self,
+ session: AsyncSession,
+ metadata_filters: dict,
+ table_alias: str | None = None,
+ ) -> tuple[list[str], dict]:
+ columns = await self._table_columns(session, "documents")
+ prefix = f"{table_alias}." if table_alias else ""
+ filters_sql = []
+ params = {}
+ for key, val in metadata_filters.items():
+ if key in {"kb_id", "kb_doc_id", "user_id"} and key in columns:
+ filters_sql.append(f"{prefix}{key} = :filter_{key}")
+ else:
+ filters_sql.append(
+ f"json_extract({prefix}metadata, '$.{key}') = :filter_{key}"
+ )
+ params[f"filter_{key}"] = val
+ return filters_sql, params
+
async def ensure_fts_index(self) -> bool:
"""Ensure the FTS5 sparse index exists and matches the documents table."""
if not self.fts5_available:
@@ -517,22 +653,30 @@ async def ensure_fts_index(self) -> bool:
assert self.engine is not None, "Database connection is not initialized."
- async with self.get_session() as session:
- doc_count = await self._count_documents_in_session(session)
- fts_count = await self._count_fts_rows(session)
- if doc_count == fts_count:
- self._fts_index_ready = True
+ async with self._fts_rebuild_lock:
+ if self._fts_index_ready:
return True
- logger.info(
- f"Rebuilding FTS5 sparse index for {self.db_path}: "
- f"documents={doc_count}, fts_rows={fts_count}",
- )
- await self.rebuild_fts_index()
- return self.fts5_available
+ async with self.get_session() as session:
+ doc_count = await self._count_documents_in_session(session)
+ fts_count = await self._count_fts_rows(session)
+ if doc_count == fts_count:
+ self._fts_index_ready = True
+ return True
+
+ logger.info(
+ f"Rebuilding FTS5 sparse index for {self.db_path}: "
+ f"documents={doc_count}, fts_rows={fts_count}",
+ )
+ await self._rebuild_fts_index_unlocked()
+ return self.fts5_available
async def rebuild_fts_index(self) -> None:
"""Rebuild the contentless FTS5 sparse index from documents."""
+ async with self._fts_rebuild_lock:
+ await self._rebuild_fts_index_unlocked()
+
+ async def _rebuild_fts_index_unlocked(self) -> None:
if not self.fts5_available:
return
@@ -577,7 +721,7 @@ async def search_sparse(
sparse retrieval implementation.
"""
if limit <= 0:
- return []
+ return None
if not await self.ensure_fts_index():
return None
diff --git a/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py b/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py
index dc6977cf8a..d7d9479046 100644
--- a/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py
+++ b/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py
@@ -4,21 +4,180 @@
raise ImportError(
"faiss 未安装。请使用 'pip install faiss-cpu' 或 'pip install faiss-gpu' 安装。",
)
+import asyncio
import os
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
import numpy as np
+def _safe_normalize_l2(vectors: np.ndarray) -> None:
+ """L2 归一化,对零向量抛出明确错误
+
+ 正常的 embedding 模型不应产生零向量。零向量无法归一化(会产生 NaN),
+ 说明 embedding provider 返回了异常数据,应当尽早暴露问题。
+ """
+ # 检测全零行
+ if vectors.ndim == 2:
+ norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+ zero_count = int((norms < 1e-12).sum())
+ if zero_count > 0:
+ raise ValueError(
+ f"向量归一化失败:检测到 {zero_count} 个零向量。"
+ "Embedding Provider 返回了全零向量,这可能说明 API 密钥无效、"
+ "模型不支持当前输入、或服务端异常。请检查 Embedding Provider 配置。"
+ )
+ elif vectors.ndim == 1:
+ if np.linalg.norm(vectors) < 1e-12:
+ raise ValueError(
+ "向量归一化失败:检测到零向量。"
+ "Embedding Provider 返回了全零向量,这可能说明 API 密钥无效、"
+ "模型不支持当前输入、或服务端异常。请检查 Embedding Provider 配置。"
+ )
+
+ faiss.normalize_L2(vectors)
+
+
class EmbeddingStorage:
- def __init__(self, dimension: int, path: str | None = None) -> None:
+ def __init__(
+ self,
+ dimension: int,
+ path: str | None = None,
+ index_type: str = "flat",
+ ) -> None:
self.dimension = dimension
self.path = path
self.index = None
+ self.index_type = index_type # "flat" | "hnsw"
+ self._write_lock = asyncio.Lock()
if path and os.path.exists(path):
self.index = faiss.read_index(path)
+ # 验证加载的索引维度是否匹配
+ loaded_dim = self.index.d
+ if loaded_dim != self.dimension:
+ raise ValueError(
+ f"索引维度不匹配: 磁盘索引维度={loaded_dim}, "
+ f"当前 Embedding Provider 维度={self.dimension}。"
+ f"请确认 Embedding Provider 与已有索引一致,"
+ f"或删除旧索引后重新创建知识库。"
+ )
+ self._migrate_l2_to_ip_if_needed()
else:
- base_index = faiss.IndexFlatL2(dimension)
+ self.index = self._create_index()
+
+ def _create_index(self):
+ """根据 index_type 创建 FAISS 索引"""
+ if self.index_type == "hnsw":
+ # HNSW32 with Inner Product metric for cosine similarity
+ base_index = faiss.index_factory(
+ self.dimension,
+ "HNSW32",
+ faiss.METRIC_INNER_PRODUCT,
+ )
+ return faiss.IndexIDMap(base_index)
+ # 默认: flat (精确搜索)
+ return faiss.IndexIDMap(faiss.IndexFlatIP(self.dimension))
+
+ def _migrate_l2_to_ip_if_needed(self) -> None:
+ """检测并迁移旧版 L2 索引到 IP (余弦相似度)
+
+ 旧版使用 IndexFlatL2,新版使用 IndexFlatIP + 归一化向量。
+ 迁移过程:保留原 external ids → reconstruct 所有向量 → L2 归一化 → 重建为 IP 索引。
+ """
+ assert self.index is not None
+ # IndexIDMap 包装了 base index,需要解包检查
+ base_index = self.index.index if hasattr(self.index, "index") else self.index
+ if getattr(base_index, "metric_type", None) != faiss.METRIC_L2:
+ return # 已经是 IP 或其他类型,无需迁移
+
+ import warnings
+
+ ntotal = self.index.ntotal
+ if ntotal == 0:
+ warnings.warn(
+ "检测到空的旧版 L2 索引,将重建为 IP 索引。",
+ stacklevel=2,
+ )
+ base_index = faiss.IndexFlatIP(self.dimension)
self.index = faiss.IndexIDMap(base_index)
+ return
+
+ warnings.warn(
+ f"检测到旧版 L2 索引 (含 {ntotal} 个向量),正在自动迁移到 IP 索引..."
+ "这可能需要几秒钟。迁移后旧索引将被覆盖。",
+ stacklevel=2,
+ )
+
+ # 重建所有向量并归一化
+ # 注意: IndexIDMap.reconstruct 在某些 FAISS 构建版本中不可用
+ try:
+ ids = self._get_index_ids()
+ vectors = np.zeros((ntotal, self.dimension), dtype=np.float32)
+ reconstruct_index = (
+ self.index.index if hasattr(self.index, "index") else self.index
+ )
+ for pos in range(ntotal):
+ vectors[pos] = reconstruct_index.reconstruct(pos)
+ except Exception as exc:
+ raise RuntimeError(
+ "无法从旧索引重建向量(reconstruct 不可用),"
+ "已保留旧索引文件未覆盖。请重新上传文档或手动重建知识库索引。"
+ ) from exc
+
+ _safe_normalize_l2(vectors)
+
+ # 重建为 IP 索引
+ new_index = faiss.IndexIDMap(faiss.IndexFlatIP(self.dimension))
+ new_index.add_with_ids(vectors, ids)
+
+ self._backup_existing_index_before_migration()
+ self.index = new_index
+ # 立即保存迁移后的索引
+ faiss.write_index(self.index, self.path)
+
+ def _backup_existing_index_before_migration(self) -> Path:
+ if self.path is None:
+ raise RuntimeError("无法备份旧索引:索引文件路径为空,已保留旧索引未覆盖。")
+
+ index_path = Path(self.path)
+ if not index_path.exists():
+ raise RuntimeError(
+ f"无法备份旧索引:索引文件不存在 {index_path},已保留旧索引未覆盖。"
+ )
+
+ timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+ backup_path = index_path.with_name(f"{index_path.name}.bak.{timestamp}")
+ counter = 1
+ while backup_path.exists():
+ backup_path = index_path.with_name(
+ f"{index_path.name}.bak.{timestamp}.{counter}"
+ )
+ counter += 1
+
+ try:
+ shutil.copy2(index_path, backup_path)
+ except OSError as exc:
+ raise RuntimeError(
+ f"无法备份旧索引到 {backup_path},已保留旧索引未覆盖。"
+ ) from exc
+
+ return backup_path
+
+ def _get_index_ids(self) -> np.ndarray:
+ assert self.index is not None
+ ntotal = self.index.ntotal
+ id_map = getattr(self.index, "id_map", None)
+ if id_map is None:
+ return np.arange(ntotal, dtype=np.int64)
+
+ ids = faiss.vector_to_array(id_map).astype(np.int64)
+ if len(ids) != ntotal:
+ raise RuntimeError(
+ f"FAISS IDMap 数量异常: ntotal={ntotal}, id_map={len(ids)}",
+ )
+ return ids
async def insert(self, vector: np.ndarray, id: int) -> None:
"""插入向量
@@ -30,13 +189,16 @@ async def insert(self, vector: np.ndarray, id: int) -> None:
ValueError: 如果向量的维度与存储的维度不匹配
"""
- assert self.index is not None, "FAISS index is not initialized."
- if vector.shape[0] != self.dimension:
- raise ValueError(
- f"向量维度不匹配, 期望: {self.dimension}, 实际: {vector.shape[0]}",
- )
- self.index.add_with_ids(vector.reshape(1, -1), np.array([id]))
- await self.save_index()
+ async with self._write_lock:
+ assert self.index is not None, "FAISS index is not initialized."
+ if vector.shape[0] != self.dimension:
+ raise ValueError(
+ f"向量维度不匹配, 期望: {self.dimension}, 实际: {vector.shape[0]}",
+ )
+ v_2d = vector.reshape(1, -1)
+ _safe_normalize_l2(v_2d)
+ self.index.add_with_ids(v_2d, np.array([id]))
+ await self._save_index_locked()
async def insert_batch(self, vectors: np.ndarray, ids: list[int]) -> None:
"""批量插入向量
@@ -48,13 +210,15 @@ async def insert_batch(self, vectors: np.ndarray, ids: list[int]) -> None:
ValueError: 如果向量的维度与存储的维度不匹配
"""
- assert self.index is not None, "FAISS index is not initialized."
- if vectors.shape[1] != self.dimension:
- raise ValueError(
- f"向量维度不匹配, 期望: {self.dimension}, 实际: {vectors.shape[1]}",
- )
- self.index.add_with_ids(vectors, np.array(ids))
- await self.save_index()
+ async with self._write_lock:
+ assert self.index is not None, "FAISS index is not initialized."
+ if vectors.shape[1] != self.dimension:
+ raise ValueError(
+ f"向量维度不匹配, 期望: {self.dimension}, 实际: {vectors.shape[1]}",
+ )
+ _safe_normalize_l2(vectors)
+ self.index.add_with_ids(vectors, np.array(ids))
+ await self._save_index_locked()
async def search(self, vector: np.ndarray, k: int) -> tuple:
"""搜索最相似的向量
@@ -67,7 +231,7 @@ async def search(self, vector: np.ndarray, k: int) -> tuple:
"""
assert self.index is not None, "FAISS index is not initialized."
- faiss.normalize_L2(vector)
+ _safe_normalize_l2(vector)
distances, indices = self.index.search(vector, k)
return distances, indices
@@ -78,18 +242,25 @@ async def delete(self, ids: list[int]) -> None:
ids (list[int]): 要删除的向量ID列表
"""
- assert self.index is not None, "FAISS index is not initialized."
- id_array = np.array(ids, dtype=np.int64)
- self.index.remove_ids(id_array)
- await self.save_index()
-
- async def save_index(self) -> None:
- """保存索引
+ async with self._write_lock:
+ assert self.index is not None, "FAISS index is not initialized."
+ id_array = np.array(ids, dtype=np.int64)
+ self.index.remove_ids(id_array)
+ await self._save_index_locked()
- Args:
- path (str): 保存索引的路径
+ async def _save_index_locked(self) -> None:
+ """内部方法:在已持有 _write_lock 的情况下保存索引到磁盘。
+ 调用者必须已经获取 _write_lock。
"""
if self.index is None:
return
- faiss.write_index(self.index, self.path)
+ await asyncio.to_thread(faiss.write_index, self.index, self.path)
+
+ async def save_index(self) -> None:
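+ """Save the index to disk; the write runs in a separate thread so the event loop is not blocked.
+
+ Public entry point: acquires the asyncio write lock so concurrent coroutine writers are serialized (asyncio.Lock is coroutine-safe, not thread-safe).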
+ """
+ async with self._write_lock:
+ await self._save_index_locked()
diff --git a/astrbot/core/db/vec_db/faiss_impl/vec_db.py b/astrbot/core/db/vec_db/faiss_impl/vec_db.py
index 0474683754..1cafd1c45d 100644
--- a/astrbot/core/db/vec_db/faiss_impl/vec_db.py
+++ b/astrbot/core/db/vec_db/faiss_impl/vec_db.py
@@ -1,5 +1,7 @@
import time
import uuid
+from collections import OrderedDict
+from hashlib import sha256
import numpy as np
@@ -12,6 +14,50 @@
from .embedding_storage import EmbeddingStorage
+class EmbeddingCache:
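+ """LRU cache mapping text to embedding vectors (coroutine-safe via asyncio.Lock, not thread-safe).
+
+ The SHA256 hash of the text is used as the cache key, so identical content does not trigger repeated embedding API calls.
+ """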
+
+ def __init__(self, max_size: int = 10000) -> None:
+ import asyncio
+
+ self._cache: OrderedDict[str, np.ndarray] = OrderedDict()
+ self._max_size = max_size
+ self._lock = asyncio.Lock()
+
+ @staticmethod
+ def _hash(text: str) -> str:
+ return sha256(text.encode()).hexdigest()
+
+ async def get(self, text: str) -> np.ndarray | None:
+ async with self._lock:
+ key = self._hash(text)
+ if key in self._cache:
+ self._cache.move_to_end(key)
+ return self._cache[key].copy()
+ return None
+
+ async def put(self, text: str, embedding: np.ndarray) -> None:
+ async with self._lock:
+ key = self._hash(text)
+ if key not in self._cache:
+ if len(self._cache) >= self._max_size:
+ self._cache.popitem(last=False)
+ else:
+ self._cache.move_to_end(key)
+ self._cache[key] = embedding.copy()
+
+ async def clear(self) -> None:
+ async with self._lock:
+ self._cache.clear()
+
+ async def __len__(self) -> int:
+ async with self._lock:
+ return len(self._cache)
+
+
class FaissVecDB(BaseVecDB):
"""A class to represent a vector database."""
@@ -21,6 +67,7 @@ def __init__(
index_store_path: str,
embedding_provider: EmbeddingProvider,
rerank_provider: RerankProvider | None = None,
+ index_type: str = "flat",
) -> None:
self.doc_store_path = doc_store_path
self.index_store_path = index_store_path
@@ -29,9 +76,11 @@ def __init__(
self.embedding_storage = EmbeddingStorage(
embedding_provider.get_dim(),
index_store_path,
+ index_type=index_type,
)
self.embedding_provider = embedding_provider
self.rerank_provider = rerank_provider
+ self.embedding_cache = EmbeddingCache()
async def initialize(self) -> None:
await self.document_storage.initialize()
@@ -81,6 +130,9 @@ async def insert_batch(
)
return []
+ # The empty-contents case has already returned early above, so contents must be non-empty here.
+ assert len(contents) > 0, "contents must not be empty"
+
content_count = len(contents)
if len(metadatas) != content_count:
raise KnowledgeBaseUploadError(
@@ -107,54 +159,65 @@ async def insert_batch(
},
)
+ # 检查嵌入缓存,分离已缓存的文本和需要计算的文本
start = time.time()
- logger.debug(f"Generating embeddings for {len(contents)} contents...")
- vectors = await self.embedding_provider.get_embeddings_batch(
- contents,
- batch_size=batch_size,
- tasks_limit=tasks_limit,
- max_retries=max_retries,
- progress_callback=progress_callback,
- )
- end = time.time()
+ cached_vectors: dict[int, np.ndarray] = {}
+ uncached_indices: list[int] = []
+ uncached_texts: list[str] = []
+
+ for idx, text in enumerate(contents):
+ cached = await self.embedding_cache.get(text)
+ if cached is not None:
+ cached_vectors[idx] = cached
+ else:
+ uncached_indices.append(idx)
+ uncached_texts.append(text)
+
+ cache_hits = len(cached_vectors)
+ cache_misses = len(uncached_texts)
logger.debug(
- f"Generated embeddings for {len(contents)} contents in {end - start:.2f} seconds.",
+ f"Embedding cache: {cache_hits} hits, {cache_misses} misses "
+ f"out of {len(contents)} contents.",
)
- if len(vectors) != content_count:
- raise KnowledgeBaseUploadError(
- stage="embedding",
- user_message=(
- "向量化失败:嵌入模型返回的向量数量与文本分块数量不一致"
- f"(期望 {content_count},实际 {len(vectors)})。"
- "这通常说明当前 Embedding 接口未完整返回批量结果,"
- "或该服务不兼容当前批量请求格式。"
- ),
- details={
- "expected_contents": content_count,
- "actual_vectors": len(vectors),
- },
+
+ # 只对未缓存的文本生成嵌入
+ vectors = [np.empty(0, dtype=np.float32) for _ in contents]
+ if uncached_texts:
+ new_embeddings = await self.embedding_provider.get_embeddings_batch(
+ uncached_texts,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ progress_callback=progress_callback,
)
+ # 验证返回数量
+ if len(new_embeddings) != len(uncached_texts):
+ raise KnowledgeBaseUploadError(
+ stage="embedding",
+ user_message=(
+ "向量化失败:嵌入模型返回的向量数量与文本分块数量不一致"
+ f"(期望 {len(uncached_texts)},实际 {len(new_embeddings)})。"
+ "这通常说明当前 Embedding 接口未完整返回批量结果,"
+ "或该服务不兼容当前批量请求格式。"
+ ),
+ details={
+ "expected_contents": len(uncached_texts),
+ "actual_vectors": len(new_embeddings),
+ },
+ )
+ for i, idx in enumerate(uncached_indices):
+ vectors[idx] = np.asarray(new_embeddings[i], dtype=np.float32)
+ await self.embedding_cache.put(uncached_texts[i], vectors[idx])
+
+ for idx, cached_vec in cached_vectors.items():
+ vectors[idx] = cached_vec
- # 使用 DocumentStorage 的批量插入方法
- int_ids = await self.document_storage.insert_documents_batch(
- ids,
- contents,
- metadatas,
+ end = time.time()
+ logger.debug(
+ f"Embeddings ready for {len(contents)} contents "
+ f"in {end - start:.2f}s (cached: {cache_hits}, fresh: {cache_misses}).",
)
- if len(int_ids) != content_count:
- raise KnowledgeBaseUploadError(
- stage="storage",
- user_message=(
- f"存储失败:写入文档索引后返回的内部 ID 数量与文本分块数量不一致"
- f"(期望 {content_count},实际 {len(int_ids)})。"
- ),
- details={
- "expected_contents": content_count,
- "actual_int_ids": len(int_ids),
- },
- )
- # 批量插入向量到 FAISS
try:
vectors_array = np.asarray(vectors, dtype=np.float32)
except (TypeError, ValueError) as exc:
@@ -187,9 +250,63 @@ async def insert_batch(
"actual_dimension": int(vectors_array.shape[1]),
},
)
- await self.embedding_storage.insert_batch(vectors_array, int_ids)
+
+ int_ids = await self.document_storage.insert_documents_batch(
+ ids,
+ contents,
+ metadatas,
+ )
+ if len(int_ids) != content_count:
+ await self._cleanup_batch_insert(int_ids=[], doc_ids=ids)
+ raise KnowledgeBaseUploadError(
+ stage="storage",
+ user_message=(
+ f"存储失败:写入文档索引后返回的内部 ID 数量与文本分块数量不一致"
+ f"(期望 {content_count},实际 {len(int_ids)})。"
+ ),
+ details={
+ "expected_contents": content_count,
+ "actual_int_ids": len(int_ids),
+ },
+ )
+
+ try:
+ await self.embedding_storage.insert_batch(vectors_array, int_ids)
+ except Exception:
+ logger.warning(
+ "Failed to insert FAISS vectors; cleaning up inserted document rows.",
+ exc_info=True,
+ )
+ await self._cleanup_batch_insert(int_ids=int_ids, doc_ids=ids)
+ raise
return int_ids
+ async def _cleanup_batch_insert(
+ self,
+ *,
+ int_ids: list[int],
+ doc_ids: list[str],
+ ) -> None:
+ """Best-effort cleanup for a failed batch insert."""
+ if int_ids:
+ try:
+ await self.embedding_storage.delete(int_ids)
+ except Exception:
+ logger.warning(
+ "Failed to clean up FAISS vectors after batch insert failure.",
+ exc_info=True,
+ )
+
+ for doc_id in doc_ids:
+ try:
+ await self.document_storage.delete_document_by_doc_id(doc_id)
+ except Exception:
+ logger.warning(
+ f"Failed to clean up document row {doc_id} "
+ "after batch insert failure.",
+ exc_info=True,
+ )
+
async def retrieve(
self,
query: str,
@@ -211,15 +328,24 @@ async def retrieve(
List[Result]: 查询结果
"""
- embedding = await self.embedding_provider.get_embedding(query)
+ # 先查缓存,再调 embedding provider
+ cached = await self.embedding_cache.get(query)
+ if cached is not None:
+ embedding = cached
+ else:
+ embedding = await self.embedding_provider.get_embedding(query)
+ await self.embedding_cache.put(
+ query,
+ np.asarray(embedding, dtype=np.float32),
+ )
scores, indices = await self.embedding_storage.search(
vector=np.array([embedding]).astype("float32"),
k=fetch_k if metadata_filters else k,
)
if len(indices[0]) == 0 or indices[0][0] == -1:
return []
- # normalize scores
- scores[0] = 1.0 - (scores[0] / 2.0)
+ # 将内积分数 (余弦相似度, 范围 [-1, 1]) 映射到 [0, 1]
+ scores[0] = (scores[0] + 1.0) / 2.0
# NOTE: maybe the size is less than k.
fetched_docs = await self.document_storage.get_documents(
metadata_filters=metadata_filters or {},
@@ -255,17 +381,18 @@ async def retrieve(
return top_k_results
- async def delete(self, doc_id: str) -> None:
+ async def delete(self, doc_id: str) -> bool:
"""删除一条文档块(chunk)"""
# 获得对应的 int id
result = await self.document_storage.get_document_by_doc_id(doc_id)
int_id = result["id"] if result else None
if int_id is None:
- return
+ return False
# 使用 DocumentStorage 的删除方法
await self.document_storage.delete_document_by_doc_id(doc_id)
await self.embedding_storage.delete([int_id])
+ return True
async def close(self) -> None:
await self.document_storage.close()
diff --git a/astrbot/core/knowledge_base/capabilities.py b/astrbot/core/knowledge_base/capabilities.py
new file mode 100644
index 0000000000..9367604514
--- /dev/null
+++ b/astrbot/core/knowledge_base/capabilities.py
@@ -0,0 +1,110 @@
+"""Knowledge base capabilities and default limits."""
+
+from typing import Any
+
+ALLOWED_UPLOAD_EXTENSIONS = frozenset(
+ {
+ "adoc",
+ "docx",
+ "epub",
+ "md",
+ "markdown",
+ "pdf",
+ "rst",
+ "txt",
+ "xls",
+ "xlsx",
+ },
+)
+
+MAX_UPLOAD_FILE_SIZE = 128 * 1024 * 1024
+MAX_UPLOAD_FILES = 10
+MAX_BATCH_DELETE_DOCUMENTS = 100
+MAX_BATCH_REBUILD_DOCUMENTS = 100
+MAX_RETRIEVE_TOP_K = 100
+DEFAULT_KB_PAGE_SIZE = 20
+DEFAULT_DOCUMENT_PAGE_SIZE = 10
+DEFAULT_CHUNK_PAGE_SIZE = 10
+DEFAULT_BULK_PAGE_SIZE = 100
+DOCUMENT_PAGE_SIZE_OPTIONS = (10, 20, 50, 100)
+CHUNK_PAGE_SIZE_OPTIONS = (10, 25, 50, 100)
+
+DOCUMENT_FILTER_STATUSES = (
+ "pending",
+ "parsing",
+ "chunking",
+ "embedding",
+ "ready",
+ "failed",
+)
+DOCUMENT_FILTER_SOURCE_TYPES = ("file", "url", "import")
+
+FEATURE_SPARSE_RETRIEVAL = True
+FEATURE_RERANK = True
+FEATURE_URL_IMPORT = True
+FEATURE_DOCUMENT_REBUILD = True
+FEATURE_KB_REBUILD = True
+FEATURE_CONSISTENCY_CHECK = True
+FEATURE_CONSISTENCY_REPAIR = True
+FEATURE_BATCH_DELETE = True
+FEATURE_BATCH_REBUILD = True
+
+DEFAULT_CHUNK_SIZE = 512
+DEFAULT_CHUNK_OVERLAP = 50
+DEFAULT_TOP_K_DENSE = 50
+DEFAULT_TOP_K_SPARSE = 50
+DEFAULT_TOP_M_FINAL = 5
+DEFAULT_INDEX_TYPE = "flat"
+DEFAULT_UPLOAD_BATCH_SIZE = 32
+DEFAULT_UPLOAD_TASKS_LIMIT = 3
+DEFAULT_UPLOAD_MAX_RETRIES = 3
+
+
+def get_knowledge_base_capabilities() -> dict[str, Any]:
+ """Return API-safe knowledge base capabilities."""
+ return {
+ "upload": {
+ "allowed_extensions": sorted(ALLOWED_UPLOAD_EXTENSIONS),
+ "max_file_size_bytes": MAX_UPLOAD_FILE_SIZE,
+ "max_files_per_upload": MAX_UPLOAD_FILES,
+ },
+ "defaults": {
+ "chunk_size": DEFAULT_CHUNK_SIZE,
+ "chunk_overlap": DEFAULT_CHUNK_OVERLAP,
+ "batch_size": DEFAULT_UPLOAD_BATCH_SIZE,
+ "tasks_limit": DEFAULT_UPLOAD_TASKS_LIMIT,
+ "max_retries": DEFAULT_UPLOAD_MAX_RETRIES,
+ "top_k_dense": DEFAULT_TOP_K_DENSE,
+ "top_k_sparse": DEFAULT_TOP_K_SPARSE,
+ "top_m_final": DEFAULT_TOP_M_FINAL,
+ "index_type": DEFAULT_INDEX_TYPE,
+ },
+ "limits": {
+ "max_retrieve_top_k": MAX_RETRIEVE_TOP_K,
+ "max_batch_delete_documents": MAX_BATCH_DELETE_DOCUMENTS,
+ "max_batch_rebuild_documents": MAX_BATCH_REBUILD_DOCUMENTS,
+ },
+ "pagination": {
+ "document_page_size_options": list(DOCUMENT_PAGE_SIZE_OPTIONS),
+ "chunk_page_size_options": list(CHUNK_PAGE_SIZE_OPTIONS),
+ "default_kb_page_size": DEFAULT_KB_PAGE_SIZE,
+ "default_document_page_size": DEFAULT_DOCUMENT_PAGE_SIZE,
+ "default_chunk_page_size": DEFAULT_CHUNK_PAGE_SIZE,
+ "bulk_page_size": DEFAULT_BULK_PAGE_SIZE,
+ },
+ "document_filters": {
+ "statuses": list(DOCUMENT_FILTER_STATUSES),
+ "source_types": list(DOCUMENT_FILTER_SOURCE_TYPES),
+ },
+ "features": {
+ "sparse_retrieval": FEATURE_SPARSE_RETRIEVAL,
+ "rerank": FEATURE_RERANK,
+ "url_import": FEATURE_URL_IMPORT,
+ "document_rebuild": FEATURE_DOCUMENT_REBUILD,
+ "kb_rebuild": FEATURE_KB_REBUILD,
+ "consistency_check": FEATURE_CONSISTENCY_CHECK,
+ "consistency_repair": FEATURE_CONSISTENCY_REPAIR,
+ "batch_delete": FEATURE_BATCH_DELETE,
+ "batch_rebuild": FEATURE_BATCH_REBUILD,
+ },
+ }
diff --git a/astrbot/core/knowledge_base/chunking/markdown.py b/astrbot/core/knowledge_base/chunking/markdown.py
index 9ace43110d..e8813bf470 100644
--- a/astrbot/core/knowledge_base/chunking/markdown.py
+++ b/astrbot/core/knowledge_base/chunking/markdown.py
@@ -16,10 +16,35 @@ class _Section:
"""解析后的 Markdown 章节"""
heading_path: list[str]
+ title_path: list[str]
+ section_index: int | None
text: str
has_body: bool
+@dataclass
+class MarkdownChunk:
+ """A Markdown chunk with source structure metadata."""
+
+ text: str
+ title_path: list[str] | None = None
+ section_index: int | None = None
+
+
+@dataclass
+class _ChunkDraft:
+ text: str
+ has_body: bool
+ title_path: list[str] | None
+ section_index: int | None
+
+
+@dataclass
+class _MarkdownBlock:
+ kind: str
+ text: str
+
+
class MarkdownChunker(BaseChunker):
"""Markdown 感知分块器
@@ -72,31 +97,29 @@ async def chunk(self, text: str, **kwargs) -> list[str]:
list[str]: 分块后的文本列表
"""
+ chunks = await self.chunk_with_metadata(text, **kwargs)
+ return [chunk.text for chunk in chunks]
+
+ async def chunk_with_metadata(self, text: str, **kwargs) -> list[MarkdownChunk]:
+ """Split Markdown text and keep per-chunk structure metadata."""
+ text = self._strip_front_matter(text)
if not text or not text.strip():
return []
chunk_size = kwargs.get("chunk_size", self.chunk_size)
chunk_overlap = kwargs.get("chunk_overlap", self.chunk_overlap)
- # 解析 Markdown 结构
sections = self._parse_sections(text)
if not sections:
- # 没有识别到标题结构,回退到递归分割
- return await self._fallback_chunker.chunk(
+ chunks = await self._split_section_preserving_blocks(
text, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
+ return [MarkdownChunk(text=chunk) for chunk in chunks]
- # 将 sections 转换为 raw chunks
raw_chunks = await self._sections_to_chunks(sections, chunk_size, chunk_overlap)
-
- # 合并纯标题节到下一个有内容的 chunk
merged = self._merge_heading_only_chunks(raw_chunks, chunk_size)
-
- # 合并过短的相邻 chunk
- merged = self._merge_short_chunks(merged, chunk_size)
-
- return merged
+ return self._merge_short_chunks(merged, chunk_size)
def _estimate_prefix_length(self, heading_path: list[str]) -> int:
"""估算标题上下文前缀的最大长度(用于扣除子块可用空间)"""
@@ -109,13 +132,15 @@ def _estimate_prefix_length(self, heading_path: list[str]) -> int:
async def _sections_to_chunks(
self, sections: list[_Section], chunk_size: int, chunk_overlap: int
- ) -> list[tuple[str, bool]]:
+ ) -> list[_ChunkDraft]:
"""将解析后的 sections 转换为 (chunk_text, has_body) 列表"""
- raw_chunks: list[tuple[str, bool]] = []
+ raw_chunks: list[_ChunkDraft] = []
for section in sections:
section_text = section.text
heading_path = section.heading_path
+ title_path = self._normalize_title_path(section.title_path)
+ section_index = section.section_index
has_body = section.has_body
# 构建带上下文的文本
@@ -123,23 +148,30 @@ async def _sections_to_chunks(
full_text = context_prefix + section_text
if len(full_text) <= chunk_size:
- raw_chunks.append((full_text.strip(), has_body))
+ raw_chunks.append(
+ _ChunkDraft(
+ text=full_text.strip(),
+ has_body=has_body,
+ title_path=title_path,
+ section_index=section_index,
+ )
+ )
else:
- # 章节过长,内部递归分割
- # 扣除前缀长度,确保添加前缀后不超过 chunk_size
- prefix_len = self._estimate_prefix_length(heading_path)
- effective_chunk_size = max(chunk_size // 4, chunk_size - prefix_len)
-
- sub_chunks = await self._fallback_chunker.chunk(
+ sub_chunks = await self._split_section_preserving_blocks(
section_text,
- chunk_size=effective_chunk_size,
+ heading_path=heading_path,
+ chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
for i, sub_chunk in enumerate(sub_chunks):
- chunk_text = self._apply_heading_context(
- heading_path, sub_chunk, is_continuation=(i > 0)
+ raw_chunks.append(
+ _ChunkDraft(
+ text=sub_chunk,
+ has_body=True,
+ title_path=title_path,
+ section_index=section_index,
+ )
)
- raw_chunks.append((chunk_text, True))
return raw_chunks
@@ -161,75 +193,818 @@ def _apply_heading_context(
return f"{self.continuation_prefix} {title}\n\n{content}".strip()
return f"{title}\n\n{content}".strip()
- def _merge_heading_only_chunks(
- self, raw_chunks: list[tuple[str, bool]], chunk_size: int
+    async def _split_section_preserving_blocks(
+        self,
+        text: str,
+        *,
+        chunk_size: int,
+        chunk_overlap: int,
+        heading_path: list[str] | None = None,
+    ) -> list[str]:
+        """Split one section's text into chunks without cutting Markdown blocks.
+
+        The per-chunk budget is ``chunk_size`` minus the estimated heading
+        prefix, floored at ``chunk_size // 4``. Neighbouring pieces are joined
+        with a blank line for as long as they fit that budget. Every emitted
+        chunk goes through ``_apply_heading_context``; all chunks after the
+        first are flagged as continuations.
+
+        Args:
+            text: Section text to split.
+            chunk_size: Target maximum chunk length, heading prefix included.
+            chunk_overlap: Forwarded to ``_split_block`` and the fallback chunker.
+            heading_path: Ancestor headings used for the context prefix.
+
+        Returns:
+            Chunk strings with heading context already applied.
+        """
+        path = heading_path or []
+        budget = max(
+            chunk_size // 4,
+            chunk_size - self._estimate_prefix_length(path),
+        )
+        parsed = self._parse_markdown_blocks(text)
+        if not parsed:
+            # No block was recognised: defer to the generic chunker.
+            fallback_parts = await self._fallback_chunker.chunk(
+                text,
+                chunk_size=budget,
+                chunk_overlap=chunk_overlap,
+            )
+            return [
+                self._apply_heading_context(path, part, position > 0)
+                for position, part in enumerate(fallback_parts)
+                if part.strip()
+            ]
+
+        bodies: list[str] = []
+        pending = ""
+        for block in parsed:
+            for raw_piece in await self._split_block(block, budget, chunk_overlap):
+                fragment = raw_piece.strip()
+                if not fragment:
+                    continue
+                joined = f"{pending}\n\n{fragment}" if pending else fragment
+                if pending and len(joined) > budget:
+                    # Adding this fragment would overflow: close the chunk.
+                    bodies.append(pending)
+                    pending = fragment
+                else:
+                    pending = joined
+        if pending:
+            bodies.append(pending)
+
+        return [
+            self._apply_heading_context(path, body, position > 0)
+            for position, body in enumerate(bodies)
+        ]
+
+    async def _split_block(
+        self, block: _MarkdownBlock, chunk_size: int, chunk_overlap: int
+    ) -> list[str]:
+        """Split a single Markdown block using a strategy chosen by its kind.
+
+        Blank blocks yield ``[]`` and blocks that already fit are returned
+        whole. Kinds without a dedicated splitter go to the fallback chunker.
+        """
+        content = block.text.strip()
+        if not content:
+            return []
+        if len(content) <= chunk_size:
+            return [content]
+
+        splitters = {
+            "table": self._split_table_block,
+            "code": self._split_fenced_code_block,
+            "math": self._split_wrapped_line_block,
+            "blockquote": self._split_line_block,
+            "list": self._split_line_block,
+            "html": self._split_line_block,
+            "paragraph": self._split_text_preserving_inline_spans,
+            "text": self._split_text_preserving_inline_spans,
+        }
+        splitter = splitters.get(block.kind)
+        if splitter is not None:
+            return splitter(content, chunk_size)
+
+        return await self._fallback_chunker.chunk(
+            content,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+
+ def _parse_markdown_blocks(self, text: str) -> list[_MarkdownBlock]:
+ """Partition text into typed Markdown blocks.
+
+ Lines are scanned top to bottom. At each non-blank line the detectors are
+ tried in a fixed order (fenced code, math, table, HTML, blockquote, list,
+ link reference) and the first match collects the block; anything else is
+ collected as a paragraph. Blocks whose text is blank are dropped.
+ """
+ lines = text.splitlines(keepends=True)
+ blocks: list[_MarkdownBlock] = []
+ i = 0
+ while i < len(lines):
+ line = lines[i]
+ # Blank lines only separate blocks; they are never part of one here.
+ if not line.strip():
+ i += 1
+ continue
+
+ # Each collector returns (block_lines, next_index), so `i` advances
+ # past the consumed block before the next iteration.
+ if self._is_fence_start(line):
+ block_lines, i = self._collect_fenced_code_block(lines, i)
+ blocks.append(_MarkdownBlock("code", "".join(block_lines).strip()))
+ continue
+
+ if self._is_math_block_start(line):
+ block_lines, i = self._collect_math_block(lines, i)
+ blocks.append(_MarkdownBlock("math", "".join(block_lines).strip()))
+ continue
+
+ if self._is_markdown_table_start(lines, i):
+ block_lines, i = self._collect_markdown_table(lines, i)
+ blocks.append(_MarkdownBlock("table", "".join(block_lines).strip()))
+ continue
+
+ if self._is_html_block_start(line):
+ block_lines, i = self._collect_html_block(lines, i)
+ blocks.append(_MarkdownBlock("html", "".join(block_lines).strip()))
+ continue
+
+ if line.lstrip().startswith(">"):
+ block_lines, i = self._collect_prefixed_block(
+ lines,
+ i,
+ lambda candidate: candidate.lstrip().startswith(">"),
+ )
+ blocks.append(
+ _MarkdownBlock("blockquote", "".join(block_lines).strip())
+ )
+ continue
+
+ if self._is_list_item(line):
+ block_lines, i = self._collect_list_block(lines, i)
+ blocks.append(_MarkdownBlock("list", "".join(block_lines).strip()))
+ continue
+
+ if self._is_link_reference(line):
+ block_lines, i = self._collect_prefixed_block(
+ lines,
+ i,
+ self._is_link_reference,
+ )
+ blocks.append(
+ _MarkdownBlock("link_reference", "".join(block_lines).strip())
+ )
+ continue
+
+ # Nothing more specific matched: treat the run as a paragraph.
+ block_lines, i = self._collect_paragraph(lines, i)
+ blocks.append(_MarkdownBlock("paragraph", "".join(block_lines).strip()))
+
+ return [block for block in blocks if block.text.strip()]
+
+    @staticmethod
+    def _strip_front_matter(text: str) -> str:
+        """Remove a leading ``---`` or ``+++`` delimited front-matter block.
+
+        The opening delimiter must be the very first line of ``text``. Both LF
+        and CRLF line endings are accepted, so files saved on Windows are
+        handled too. The closing delimiter is searched within the first 200
+        lines only; if none is found, ``text`` is returned unchanged.
+
+        Args:
+            text: Raw document text.
+
+        Returns:
+            The text after the front matter with leading newlines removed, or
+            the original text when no complete front-matter block is present.
+        """
+        if not text.startswith(("---\n", "+++\n", "---\r\n", "+++\r\n")):
+            return text
+
+        marker = text[:3]
+        lines = text.splitlines(keepends=True)
+        for idx in range(1, min(len(lines), 200)):
+            if lines[idx].strip() == marker:
+                # Drop the blank separator lines that usually follow the block.
+                return "".join(lines[idx + 1 :]).lstrip("\r\n")
+        return text
+
+    @staticmethod
+    def _is_fence_start(line: str) -> bool:
+        """Return True when the line opens a ``` or ~~~ fence with at most 3 leading whitespace characters."""
+        body = line.lstrip()
+        if len(line) - len(body) > 3:
+            return False
+        return body.startswith(("```", "~~~"))
+
+    @staticmethod
+    def _fence_marker(line: str) -> tuple[str, int] | None:
+        """Return ``(fence_char, run_length)`` for a fence line, or None if it is not one."""
+        body = line.lstrip()
+        for fence_char in ("`", "~"):
+            if body.startswith(fence_char * 3):
+                return fence_char, len(body) - len(body.lstrip(fence_char))
+        return None
+
+    def _collect_fenced_code_block(
+        self, lines: list[str], start: int
+    ) -> tuple[list[str], int]:
+        """Collect a fenced code block that begins at ``lines[start]``.
+
+        The block ends at the first later line that has at most 3 leading
+        whitespace characters, starts with at least as many fence characters
+        as the opener, and contains nothing but fence characters. An
+        unterminated fence runs to the end of ``lines``.
+
+        Returns:
+            ``(block_lines, next_index)``.
+        """
+        opening = lines[start]
+        marker = self._fence_marker(opening)
+        if marker is None:
+            return [opening], start + 1
+
+        fence_char, fence_len = marker
+        closing_prefix = fence_char * fence_len
+        for end in range(start + 1, len(lines)):
+            body = lines[end].lstrip()
+            indent = len(lines[end]) - len(body)
+            closes = (
+                indent <= 3
+                and body.startswith(closing_prefix)
+                and set(body.strip()) <= {fence_char}
+            )
+            if closes:
+                return lines[start : end + 1], end + 1
+        return lines[start:], len(lines)
+
+    @staticmethod
+    def _is_table_separator(line: str) -> bool:
+        """Return True when the line is a pipe-table delimiter row.
+
+        Every cell must consist of one or more hyphens with an optional leading
+        and/or trailing colon (for example ``---``, ``:-:``, ``-:``). GFM needs
+        only a single hyphen per cell, so short delimiters such as ``|:-:|-|``
+        are accepted as well.
+        """
+        stripped = line.strip()
+        if "|" not in stripped:
+            return False
+        cells = [cell.strip() for cell in stripped.strip("|").split("|")]
+        return all(re.fullmatch(r":?-+:?", cell) for cell in cells)
+
+    @staticmethod
+    def _is_table_row(line: str) -> bool:
+        """Return True when the line contains a pipe character."""
+        content = line.strip()
+        # A string containing "|" is necessarily non-empty.
+        return "|" in content
+
+    def _is_markdown_table_start(self, lines: list[str], index: int) -> bool:
+        """Return True when ``lines[index]`` is a table row directly followed by a delimiter row."""
+        if index + 1 >= len(lines):
+            return False
+        header, delimiter = lines[index], lines[index + 1]
+        return self._is_table_row(header) and self._is_table_separator(delimiter)
+
+    def _collect_markdown_table(
+        self, lines: list[str], start: int
+    ) -> tuple[list[str], int]:
+        """Collect the header row, delimiter row and all following table rows.
+
+        The caller must already have confirmed that a table starts at
+        ``start``; the first two lines are taken without re-checking.
+
+        Returns:
+            ``(block_lines, next_index)``.
+        """
+        collected = [lines[start], lines[start + 1]]
+        end = start + 2
+        for candidate in lines[end:]:
+            if not self._is_table_row(candidate):
+                break
+            collected.append(candidate)
+            end += 1
+        return collected, end
+
+ @staticmethod
+ def _is_html_block_start(line: str) -> bool:
+ stripped = line.lstrip().lower()
+ return stripped.startswith(
+ (
+ "
str | None:
+ stripped = line.lstrip().lower()
+ for tag in ("table", "pre", "code", "blockquote", "details", "div"):
+ if stripped.startswith(f"<{tag}"):
+ return f"{tag}>"
+ return None
+
+    def _collect_html_block(
+        self, lines: list[str], start: int
+    ) -> tuple[list[str], int]:
+        """Collect an HTML block from its opening line through its closing tag.
+
+        If no closing tag is known for the opener, or the opener line already
+        contains it, the block is that single line. Otherwise lines are taken
+        up to and including the first one containing the closing tag
+        (case-insensitive), or to the end of ``lines`` if it never appears.
+
+        Returns:
+            ``(block_lines, next_index)``.
+        """
+        opening = lines[start]
+        closing_tag = self._html_closing_tag(opening)
+        if closing_tag is None or closing_tag in opening.lower():
+            return [opening], start + 1
+
+        for end in range(start + 1, len(lines)):
+            if closing_tag in lines[end].lower():
+                return lines[start : end + 1], end + 1
+        return lines[start:], len(lines)
+
+    @staticmethod
+    def _is_list_item(line: str) -> bool:
+        """Return True for a bullet (``-``, ``*``, ``+``) or ordered (``1.``, ``1)``) list item line."""
+        matched = re.match(r"^\s{0,3}(?:[-*+]|\d+[.)])\s+", line)
+        return matched is not None
+
+    @staticmethod
+    def _is_link_reference(line: str) -> bool:
+        """Return True for a link reference definition such as ``[label]: target``."""
+        matched = re.match(r"^\s{0,3}\[[^\]]+\]:\s+\S+", line)
+        return matched is not None
+
+    def _collect_prefixed_block(
+        self,
+        lines: list[str],
+        start: int,
+        predicate,
+    ) -> tuple[list[str], int]:
+        """Collect consecutive lines that satisfy ``predicate``.
+
+        Blank lines are kept inside the block, except a blank line that is
+        directly followed by a line which does not satisfy ``predicate``;
+        collection stops there.
+
+        Returns:
+            ``(block_lines, next_index)``.
+        """
+        total = len(lines)
+        end = start
+        while end < total:
+            line = lines[end]
+            is_blank = not line.strip()
+            if not (predicate(line) or is_blank):
+                break
+            if is_blank and end + 1 < total and not predicate(lines[end + 1]):
+                break
+            end += 1
+        return lines[start:end], end
+
+ def _collect_list_block(
+ self, lines: list[str], start: int
+ ) -> tuple[list[str], int]:
+ """Collect a list starting at ``lines[start]``.
+
+ Returns ``(block_lines, next_index)``. The starting line is taken
+ unconditionally; later lines are added while they are list items or begin
+ with a space or tab.
+ """
+ block_lines = [lines[start]]
+ i = start + 1
+ while i < len(lines):
+ line = lines[i]
+ # A fence or table start ends the list even if the line is indented.
+ if self._is_fence_start(line) or self._is_markdown_table_start(lines, i):
+ break
+ if self._is_list_item(line) or line.startswith((" ", "\t")):
+ block_lines.append(line)
+ i += 1
+ continue
+ # A blank line is kept only when the next line is a list item or is
+ # indented; otherwise the list ends here.
+ if not line.strip() and i + 1 < len(lines):
+ next_line = lines[i + 1]
+ if self._is_list_item(next_line) or next_line.startswith((" ", "\t")):
+ block_lines.append(line)
+ i += 1
+ continue
+ break
+ return block_lines, i
+
+    def _collect_paragraph(self, lines: list[str], start: int) -> tuple[list[str], int]:
+        """Collect a paragraph: lines up to a blank line or the start of another block.
+
+        The first line is always taken when it is non-blank; block-start checks
+        apply only to the lines after it.
+
+        Returns:
+            ``(block_lines, next_index)``.
+        """
+        end = start
+        while end < len(lines):
+            line = lines[end]
+            if not line.strip():
+                break
+            if end > start:
+                starts_other_block = (
+                    self._is_fence_start(line)
+                    or self._is_math_block_start(line)
+                    or self._is_markdown_table_start(lines, end)
+                    or self._is_html_block_start(line)
+                    or self._is_list_item(line)
+                    or line.lstrip().startswith(">")
+                    or self._is_link_reference(line)
+                )
+                if starts_other_block:
+                    break
+            end += 1
+        return lines[start:end], end
+
+    def _split_table_block(self, text: str, chunk_size: int) -> list[str]:
+        """Split a pipe table by rows, repeating the two header lines in every chunk.
+
+        A table with no body rows is returned whole. A single row is never
+        split, so a chunk holding one oversized row may exceed ``chunk_size``.
+        """
+        all_lines = text.splitlines()
+        if len(all_lines) <= 2:
+            return [text]
+
+        header, body = all_lines[:2], all_lines[2:]
+
+        def render(rows: list[str]) -> str:
+            return "\n".join([*header, *rows])
+
+        pieces: list[str] = []
+        batch: list[str] = []
+        for row in body:
+            if batch and len(render([*batch, row])) > chunk_size:
+                pieces.append(render(batch))
+                batch = [row]
+            else:
+                batch.append(row)
+        if batch:
+            pieces.append(render(batch))
+
+        return pieces or [text]
+
+    @staticmethod
+    def _is_math_block_start(line: str) -> bool:
+        r"""Return True when the line opens display math: ``$$``, ``\[`` or a supported ``\begin{...}``."""
+        content = line.strip()
+        if content.startswith(("$$", r"\[")):
+            return True
+        environment = re.match(
+            r"^\\begin\{(?:equation|align|gather|multline|cases)\*?\}", content
+        )
+        return environment is not None
+
+    @staticmethod
+    def _math_block_closer(line: str) -> str:
+        r"""Return the closing delimiter matching a math opener line, or ``""`` if none applies.
+
+        ``$$`` closes with ``$$``, ``\[`` with ``\]``, and ``\begin{env}`` with
+        ``\end{env}``.
+        """
+        content = line.strip()
+        for opener, closer in (("$$", "$$"), (r"\[", r"\]")):
+            if content.startswith(opener):
+                return closer
+
+        environment = re.match(r"^\\begin\{([^}]+)\}", content)
+        if environment is None:
+            return ""
+        return rf"\end{{{environment.group(1)}}}"
+
+ def _collect_math_block(
+ self, lines: list[str], start: int
+ ) -> tuple[list[str], int]:
+ """Collect a display-math block starting at ``lines[start]``.
+
+ Returns ``(block_lines, next_index)``. If no closer is known for the opener
+ line, or the opener line already contains its closer, the block is that
+ single line. Otherwise lines are taken through the first line containing
+ the closer, or to the end of ``lines`` if it never appears.
+ """
+ opener_line = lines[start]
+ closer = self._math_block_closer(opener_line)
+ block_lines = [opener_line]
+ if not closer:
+ return block_lines, start + 1
+
+ opener_stripped = opener_line.strip()
+ # Single-line check. For "$$" and "\]" the search skips the first
+ # len(closer) characters, i.e. the opener delimiter itself, which is the
+ # same length as its closer. For \begin{...} the whole line is searched.
+ if (
+ closer in opener_stripped[len(closer) :]
+ if closer in {"$$", r"\]"}
+ else closer in opener_stripped
+ ):
+ return block_lines, start + 1
+
+ i = start + 1
+ while i < len(lines):
+ block_lines.append(lines[i])
+ # The closer may appear anywhere on the line, not only on its own.
+ if closer in lines[i].strip():
+ i += 1
+ break
+ i += 1
+ return block_lines, i
+
+    @staticmethod
+    def _split_wrapped_line_block(text: str, chunk_size: int) -> list[str]:
+        r"""Split a delimiter-wrapped block by lines, re-wrapping every chunk.
+
+        The first line is repeated as the opener of each chunk. The last line
+        is repeated as the closer only when it actually contains a math closing
+        delimiter (``$$``, ``\]`` or ``\end{``). For an unterminated block the
+        last line is ordinary content and is emitted once, instead of being
+        duplicated into every chunk.
+
+        Args:
+            text: Block text whose first line is the opening delimiter.
+            chunk_size: Target maximum chunk length. A single body line is
+                never split, so a chunk may exceed it.
+
+        Returns:
+            The chunks, or ``[text]`` when the block has at most two lines.
+        """
+        lines = text.splitlines()
+        if len(lines) <= 2:
+            return [text]
+
+        opener = lines[0]
+        last = lines[-1]
+        terminated = any(token in last for token in ("$$", r"\]", r"\end{"))
+        tail = [last] if terminated else []
+        body = lines[1:-1] if terminated else lines[1:]
+        chunks: list[str] = []
+        current: list[str] = []
+
+        for line in body:
+            candidate = "\n".join([opener, *current, line, *tail])
+            if current and len(candidate) > chunk_size:
+                chunks.append("\n".join([opener, *current, *tail]))
+                current = [line]
+            else:
+                current.append(line)
+
+        if current:
+            chunks.append("\n".join([opener, *current, *tail]))
+
+        return chunks or [text]
+
+    @staticmethod
+    def _split_fenced_code_block(text: str, chunk_size: int) -> list[str]:
+        """Split a fenced code block by lines, re-wrapping every chunk in its fence.
+
+        The opening line is repeated in each chunk. The last line is repeated
+        as the closer only if it starts with a fence marker; otherwise it is
+        treated as body content. A single body line is never split.
+        """
+        rows = text.splitlines()
+        if len(rows) <= 2:
+            return [text]
+
+        opener = rows[0]
+        has_closer = rows[-1].lstrip().startswith(("```", "~~~"))
+        tail = [rows[-1]] if has_closer else []
+        body = rows[1:-1] if has_closer else rows[1:]
+
+        def wrap(part: list[str]) -> str:
+            return "\n".join([opener, *part, *tail])
+
+        pieces: list[str] = []
+        batch: list[str] = []
+        for row in body:
+            if batch and len(wrap([*batch, row])) > chunk_size:
+                pieces.append(wrap(batch))
+                batch = [row]
+            else:
+                batch.append(row)
+        if batch:
+            pieces.append(wrap(batch))
+
+        return pieces or [text]
+
+    @staticmethod
+    def _split_line_block(text: str, chunk_size: int) -> list[str]:
+        """Group whole lines into chunks of at most ``chunk_size`` characters.
+
+        Lines are never split, so a single long line forms an oversized chunk.
+        Text without any lines is returned as ``[text]``.
+        """
+        pieces: list[str] = []
+        batch: list[str] = []
+        size = 0  # length of "\n".join(batch)
+        for row in text.splitlines():
+            grown = size + 1 + len(row) if batch else len(row)
+            if batch and grown > chunk_size:
+                pieces.append("\n".join(batch))
+                batch = [row]
+                size = len(row)
+            else:
+                batch.append(row)
+                size = grown
+        if batch:
+            pieces.append("\n".join(batch))
+        return pieces or [text]
+
+ def _split_text_preserving_inline_spans(
+ self, text: str, chunk_size: int
) -> list[str]:
+ """Split text at token boundaries without cutting protected inline spans.
+
+ Tokens come from ``_tokenize_protected_inline_spans``; each protected span
+ is one token, so it is never divided between two chunks. Returned chunks
+ are stripped and empty chunks are dropped.
+ """
+ tokens = self._tokenize_protected_inline_spans(text)
+ chunks = []
+ current = ""
+ for token in tokens:
+ if not token:
+ continue
+ # Leading whitespace is dropped whenever a token starts a new chunk.
+ candidate = current + token if current else token.lstrip()
+ if current and len(candidate) > chunk_size:
+ chunks.append(current.strip())
+ current = token.lstrip()
+ else:
+ current = candidate
+
+ # An oversized run that is not recognised as a protected token is
+ # hard-split by _split_long_plain_token; the last piece stays in
+ # `current` so following tokens can still be appended to it.
+ if len(current) > chunk_size and not self._is_inline_protected_token(
+ current
+ ):
+ split_chunks = self._split_long_plain_token(current, chunk_size)
+ chunks.extend(split_chunks[:-1])
+ current = split_chunks[-1] if split_chunks else ""
+
+ if current.strip():
+ chunks.append(current.strip())
+ return [chunk for chunk in chunks if chunk]
+
+    def _tokenize_protected_inline_spans(self, text: str) -> list[str]:
+        """Tokenize text into words (with trailing whitespace) and whole protected spans.
+
+        Text outside the spans reported by ``_find_protected_inline_spans`` is
+        cut into ``word + trailing whitespace`` or pure-whitespace tokens; each
+        protected span becomes exactly one token. Joining the tokens
+        reproduces ``text``.
+        """
+        plain_pattern = r"\S+\s*|\s+"
+        pieces: list[str] = []
+        position = 0
+        for span_start, span_end in self._find_protected_inline_spans(text):
+            if position < span_start:
+                pieces += re.findall(plain_pattern, text[position:span_start])
+            pieces.append(text[span_start:span_end])
+            position = span_end
+        if position < len(text):
+            pieces += re.findall(plain_pattern, text[position:])
+        return pieces
+
+    def _find_protected_inline_spans(self, text: str) -> list[tuple[int, int]]:
+        """Locate inline spans (links, autolinks, inline math) that must stay intact.
+
+        At each position the matchers are tried in the order markdown link,
+        autolink, inline math; the first that matches wins and scanning resumes
+        at the end of that span.
+
+        Returns:
+            ``(start, end)`` index pairs in ascending order, ``end`` exclusive.
+        """
+        matchers = (
+            self._match_markdown_link,
+            self._match_autolink,
+            self._match_inline_math,
+        )
+        found: list[tuple[int, int]] = []
+        cursor = 0
+        while cursor < len(text):
+            stop = None
+            for matcher in matchers:
+                stop = matcher(text, cursor)
+                if stop is not None:
+                    break
+            if stop is None:
+                cursor += 1
+                continue
+            if not found or cursor >= found[-1][1]:
+                found.append((cursor, stop))
+            cursor = stop
+        return found
+
+    @staticmethod
+    def _match_markdown_link(text: str, start: int) -> int | None:
+        """Match an inline or reference-style link or image that begins at ``start``.
+
+        Recognised forms are ``[label](target)``, ``[label][ref]`` and the same
+        with a leading ``!`` for images. Nested brackets and parentheses are
+        not balanced: the first closing character found ends the span.
+
+        Args:
+            text: Text to scan.
+            start: Index at which the link must begin.
+
+        Returns:
+            The index just past the link, or None when no link starts at
+            ``start`` (including when ``start`` is past the end of ``text``).
+        """
+        if text.startswith("![", start):
+            start += 1  # skip the image marker; the label opens at "["
+        elif not text.startswith("[", start):
+            return None
+
+        label_end = text.find("]", start + 1)
+        if label_end == -1 or label_end + 1 >= len(text):
+            return None
+
+        next_char = text[label_end + 1]
+        if next_char == "(":
+            closer = ")"
+        elif next_char == "[":
+            closer = "]"
+        else:
+            # A bare "[label]" is not treated as a protected span.
+            return None
+
+        end = text.find(closer, label_end + 2)
+        return end + 1 if end != -1 else None
+
+ @staticmethod
+ def _match_autolink(text: str, start: int) -> int | None:
+ if text.startswith(("", start + 1)
+ return end + 1 if end != -1 else None
+
+ if not (
+ text.startswith("http://", start) or text.startswith("https://", start)
+ ):
+ return None
+
+ end = start
+ while end < len(text) and not text[end].isspace():
+ end += 1
+ while end > start and text[end - 1] in ".,;:!?)>]":
+ end -= 1
+ return end
+
+ @staticmethod
+ def _match_inline_math(text: str, start: int) -> int | None:
+ r"""Match inline math (``\(...\)`` or ``$...$``) beginning at ``start``.
+
+ Returns the index just past the closing delimiter, or None if no inline
+ math span starts at ``start``.
+ """
+ if text.startswith(r"\(", start):
+ end = text.find(r"\)", start + 2)
+ return end + 2 if end != -1 else None
+
+ if text[start] != "$":
+ return None
+ # "$$" opens display math, which is not matched here.
+ if text.startswith("$$", start):
+ return None
+ # A backslash-escaped dollar sign is a literal character.
+ if start > 0 and text[start - 1] == "\\":
+ return None
+ # The opening "$" must be followed by a non-whitespace character.
+ if start + 1 >= len(text) or text[start + 1].isspace():
+ return None
+
+ i = start + 1
+ while i < len(text):
+ # The first unescaped "$" decides: it closes the span only when the
+ # content is non-empty and does not end in whitespace.
+ if text[i] == "$" and text[i - 1] != "\\":
+ if i > start + 1 and not text[i - 1].isspace():
+ return i + 1
+ return None
+ i += 1
+ return None
+
+ @staticmethod
+ def _is_inline_protected_token(token: str) -> bool:
+ stripped = token.strip()
+ return (
+ stripped.startswith("[")
+ or stripped.startswith("![")
+ or stripped.startswith(" list[str]:
+ if chunk_size <= 0:
+ return [text]
+ return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
+
+ def _merge_heading_only_chunks(
+ self, raw_chunks: list[_ChunkDraft], chunk_size: int
+ ) -> list[MarkdownChunk]:
"""合并没有实质正文的 chunk 到下一个有正文的 chunk"""
- merged: list[str] = []
- pending = ""
+ merged: list[MarkdownChunk] = []
+ pending_text = ""
+ pending_title_path: list[str] | None = None
+ pending_section_index: int | None = None
- for chunk_text, has_body in raw_chunks:
+ for chunk in raw_chunks:
+ chunk_text = chunk.text
if not chunk_text:
continue
- if not has_body:
+ if not chunk.has_body:
# 纯标题节,暂存;但如果 pending 已经够长,先 flush
- if pending and len(pending) + len(chunk_text) + 2 > chunk_size:
- merged.append(pending.strip())
- pending = ""
- pending += chunk_text + "\n\n"
+ if (
+ pending_text
+ and len(pending_text) + len(chunk_text) + 2 > chunk_size
+ ):
+ merged.append(
+ MarkdownChunk(
+ text=pending_text.strip(),
+ title_path=pending_title_path,
+ section_index=pending_section_index,
+ )
+ )
+ pending_text = ""
+ pending_title_path = None
+ pending_section_index = None
+ pending_text += chunk_text + "\n\n"
+ pending_title_path = chunk.title_path or pending_title_path
+ pending_section_index = chunk.section_index
else:
- if pending:
- combined = pending + chunk_text
+ if pending_text:
+ combined = pending_text + chunk_text
if len(combined) <= chunk_size:
- merged.append(combined.strip())
+ merged.append(
+ MarkdownChunk(
+ text=combined.strip(),
+ title_path=chunk.title_path or pending_title_path,
+ section_index=chunk.section_index,
+ )
+ )
else:
- merged.append(pending.strip())
- merged.append(chunk_text.strip())
- pending = ""
+ merged.append(
+ MarkdownChunk(
+ text=pending_text.strip(),
+ title_path=pending_title_path,
+ section_index=pending_section_index,
+ )
+ )
+ merged.append(
+ MarkdownChunk(
+ text=chunk_text.strip(),
+ title_path=chunk.title_path,
+ section_index=chunk.section_index,
+ )
+ )
+ pending_text = ""
+ pending_title_path = None
+ pending_section_index = None
else:
- merged.append(chunk_text.strip())
+ merged.append(
+ MarkdownChunk(
+ text=chunk_text.strip(),
+ title_path=chunk.title_path,
+ section_index=chunk.section_index,
+ )
+ )
# 处理尾部残留的 pending
- if pending:
- pending_text = pending.strip()
- if merged and len(merged[-1] + "\n\n" + pending_text) <= chunk_size:
- merged[-1] = merged[-1] + "\n\n" + pending_text
+ if pending_text:
+ trailing_text = pending_text.strip()
+ if merged and len(merged[-1].text + "\n\n" + trailing_text) <= chunk_size:
+ merged[-1] = MarkdownChunk(
+ text=merged[-1].text + "\n\n" + trailing_text,
+ title_path=self._merge_title_paths(
+ [merged[-1].title_path, pending_title_path]
+ ),
+ section_index=self._merge_section_indexes(
+ [merged[-1].section_index, pending_section_index]
+ ),
+ )
else:
- merged.append(pending_text)
+ merged.append(
+ MarkdownChunk(
+ text=trailing_text,
+ title_path=pending_title_path,
+ section_index=pending_section_index,
+ )
+ )
- return [c for c in merged if c.strip()]
+ return [chunk for chunk in merged if chunk.text.strip()]
- def _merge_short_chunks(self, chunks: list[str], chunk_size: int) -> list[str]:
+ def _merge_short_chunks(
+ self, chunks: list[MarkdownChunk], chunk_size: int
+ ) -> list[MarkdownChunk]:
"""合并过短的相邻 chunk(低于 min_chunk_size)"""
if self.min_chunk_size <= 0 or len(chunks) <= 1:
return chunks
- final: list[str] = []
- buf = ""
+ final: list[MarkdownChunk] = []
+ buf: MarkdownChunk | None = None
- for c in chunks:
+ for chunk in chunks:
if buf:
- combined = buf + "\n\n" + c
+ combined = buf.text + "\n\n" + chunk.text
if len(combined) <= chunk_size:
- buf = combined
+ buf = MarkdownChunk(
+ text=combined,
+ title_path=self._merge_title_paths(
+ [buf.title_path, chunk.title_path]
+ ),
+ section_index=self._merge_section_indexes(
+ [buf.section_index, chunk.section_index]
+ ),
+ )
else:
final.append(buf)
- buf = c if len(c) < self.min_chunk_size else ""
- if len(c) >= self.min_chunk_size:
- final.append(c)
- elif len(c) < self.min_chunk_size:
- buf = c
+ if len(chunk.text) < self.min_chunk_size:
+ buf = chunk
+ else:
+ buf = None
+ final.append(chunk)
+ elif len(chunk.text) < self.min_chunk_size:
+ buf = chunk
else:
- final.append(c)
+ final.append(chunk)
if buf:
- if final and len(final[-1] + "\n\n" + buf) <= chunk_size:
- final[-1] = final[-1] + "\n\n" + buf
+ if final and len(final[-1].text + "\n\n" + buf.text) <= chunk_size:
+ final[-1] = MarkdownChunk(
+ text=final[-1].text + "\n\n" + buf.text,
+ title_path=self._merge_title_paths(
+ [final[-1].title_path, buf.title_path]
+ ),
+ section_index=self._merge_section_indexes(
+ [final[-1].section_index, buf.section_index]
+ ),
+ )
else:
final.append(buf)
return final
+    @staticmethod
+    def _normalize_title_path(title_path: list[str]) -> list[str] | None:
+        """Strip each title and drop empty ones; return None if nothing remains."""
+        cleaned: list[str] = []
+        for title in title_path:
+            if not title:
+                continue
+            trimmed = title.strip()
+            if trimmed:
+                cleaned.append(trimmed)
+        return cleaned if cleaned else None
+
+    @staticmethod
+    def _merge_title_paths(paths: list[list[str] | None]) -> list[str] | None:
+        """Return the longest common leading prefix of the non-empty title paths.
+
+        None and empty paths are ignored. The result is None when no path
+        remains or when the paths share no leading title.
+        """
+        candidates = [path for path in paths if path]
+        if not candidates:
+            return None
+
+        shared = list(candidates[0])
+        for other in candidates[1:]:
+            keep = 0
+            for left, right in zip(shared, other, strict=False):
+                if left != right:
+                    break
+                keep += 1
+            shared = shared[:keep]
+            if not shared:
+                return None
+        return shared
+
+    @staticmethod
+    def _merge_section_indexes(indexes: list[int | None]) -> int | None:
+        """Return the section index shared by all non-None entries, else None."""
+        distinct = {index for index in indexes if index is not None}
+        if len(distinct) != 1:
+            return None
+        return distinct.pop()
+
def _parse_sections(self, text: str) -> list[_Section]:
"""解析 Markdown 文本为章节列表
@@ -264,11 +1039,21 @@ def _parse_sections(self, text: str) -> list[_Section]:
return []
sections: list[_Section] = []
+ section_index = 0
# 处理第一个标题之前的内容(如果有)
preamble = text[: headings[0]["start"]].strip()
if preamble:
- sections.append(_Section(heading_path=[], text=preamble, has_body=True))
+ sections.append(
+ _Section(
+ heading_path=[],
+ title_path=[],
+ section_index=section_index,
+ text=preamble,
+ has_body=True,
+ )
+ )
+ section_index += 1
# 维护标题栈来追踪层级路径
heading_stack: list[dict] = []
@@ -297,14 +1082,18 @@ def _parse_sections(self, text: str) -> list[_Section]:
# 构建标题路径
heading_path = [h["title"] for h in heading_stack[:-1]]
+ title_path = [h["title"] for h in heading_stack]
sections.append(
_Section(
heading_path=heading_path,
+ title_path=title_path,
+ section_index=section_index,
text=section_text,
has_body=bool(body),
)
)
+ section_index += 1
return sections
diff --git a/astrbot/core/knowledge_base/document_metadata.py b/astrbot/core/knowledge_base/document_metadata.py
new file mode 100644
index 0000000000..4c78efe410
--- /dev/null
+++ b/astrbot/core/knowledge_base/document_metadata.py
@@ -0,0 +1,61 @@
+"""Helpers for knowledge-base document governance metadata."""
+
+import hashlib
+import re
+import uuid
+from pathlib import Path
+
+from .chunking.base import BaseChunker
+from .parsers.base import BaseParser
+
+DEFAULT_PARSER_VERSION = "1"
+DEFAULT_CHUNKER_VERSION = "1"
+
+
+def build_content_hash(content: bytes | str | list[str]) -> str:
+ """Return a stable SHA256 hash for source content.
+
+ Bytes are hashed as-is and a string is hashed as its UTF-8 encoding. For a
+ list, the items are fed to the digest in order as UTF-8, together with a
+ NUL separator byte.
+
+ Returns:
+ The digest as a hexadecimal string.
+ """
+ digest = hashlib.sha256()
+ if isinstance(content, bytes):
+ digest.update(content)
+ elif isinstance(content, str):
+ digest.update(content.encode("utf-8"))
+ else:
+ # Any other value is assumed to be an iterable of strings.
+ for chunk in content:
+ digest.update(chunk.encode("utf-8"))
+ digest.update(b"\x00")
+ return digest.hexdigest()
+
+
+def get_parser_name(parser: BaseParser | None) -> str | None:
+    """Return the parser's class name, or None when no parser is given."""
+    return None if parser is None else parser.__class__.__name__
+
+
+def get_chunker_name(chunker: BaseChunker | None) -> str | None:
+    """Return the chunker's class name, or None when no chunker is given."""
+    return None if chunker is None else chunker.__class__.__name__
+
+
+def sanitize_source_filename(file_name: str | None, fallback_suffix: str = "") -> str:
+    """Return a filename safe for storage inside a KB-owned directory.
+
+    Only the last path component is kept (both ``/`` and ``\\`` count as
+    separators), NUL bytes are removed, every character outside
+    ``A-Za-z0-9._ -`` becomes ``_``, and leading/trailing spaces and dots are
+    stripped. When nothing usable remains, a random ``document_<hex>`` name
+    with ``fallback_suffix`` appended is generated.
+
+    The result is limited to 255 characters. Over-long names are shortened in
+    the stem so that a short extension (at most 16 characters) survives.
+
+    Args:
+        file_name: Client-supplied file name or path; may be None or empty.
+        fallback_suffix: Suffix appended to the generated fallback name.
+
+    Returns:
+        The sanitized file name, at most 255 characters long.
+    """
+    raw = (file_name or "").replace("\\", "/").split("/")[-1].replace("\x00", "")
+    safe = re.sub(r"[^A-Za-z0-9._ -]", "_", raw).strip(" .")
+    if not safe:
+        safe = f"document_{uuid.uuid4().hex[:8]}{fallback_suffix}"
+    if len(safe) <= 255:
+        return safe
+
+    # Shorten the stem rather than cutting off the extension.
+    stem, dot, extension = safe.rpartition(".")
+    if dot and stem and len(extension) <= 16:
+        return stem[: 255 - len(extension) - 1] + "." + extension
+    return safe[:255]
+
+
+def build_stored_source_path(
+    files_dir: Path,
+    *,
+    doc_id: str,
+    file_name: str,
+    file_type: str,
+) -> Path:
+    """Build ``files_dir / doc_id / <sanitized file name>``.
+
+    When ``file_name`` has no extension, ``.{file_type}`` is used as the suffix
+    of the fallback name, should one need to be generated.
+    """
+    extension = Path(file_name).suffix or (f".{file_type}" if file_type else "")
+    stored_name = sanitize_source_filename(file_name, fallback_suffix=extension)
+    return files_dir / doc_id / stored_name
diff --git a/astrbot/core/knowledge_base/kb_db_sqlite.py b/astrbot/core/knowledge_base/kb_db_sqlite.py
index 2734ccb8d9..10f82e5635 100644
--- a/astrbot/core/knowledge_base/kb_db_sqlite.py
+++ b/astrbot/core/knowledge_base/kb_db_sqlite.py
@@ -1,8 +1,11 @@
+import asyncio
+import json
from contextlib import asynccontextmanager
+from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING
-from sqlalchemy import delete, event, func, select, text, update
+from sqlalchemy import delete, event, func, or_, select, text, update
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool
from sqlmodel import col, desc
@@ -11,6 +14,7 @@
from astrbot.core.knowledge_base.models import (
BaseKBModel,
KBDocument,
+ KBIngestionTask,
KBMedia,
KnowledgeBase,
)
@@ -19,6 +23,8 @@
if TYPE_CHECKING:
from astrbot.core.db.vec_db.faiss_impl import FaissVecDB
+_UNSET = object()
+
def _configure_sqlite_connection(dbapi_connection, connection_record) -> None:
cursor = dbapi_connection.cursor()
@@ -106,6 +112,15 @@ async def migrate_to_v1(self) -> None:
async with self.get_db() as session:
session: AsyncSession
async with session.begin():
+ await self._ensure_column(
+ session,
+ table_name="knowledge_bases",
+ column_name="index_type",
+ column_sql="index_type TEXT DEFAULT 'flat'",
+ )
+ await self._ensure_document_governance_columns(session)
+ await self._ensure_ingestion_task_table(session)
+
# 创建知识库表索引
await session.execute(
text(
@@ -157,6 +172,24 @@ async def migrate_to_v1(self) -> None:
"ON kb_documents(created_at)",
),
)
+ await session.execute(
+ text(
+ "CREATE INDEX IF NOT EXISTS idx_doc_content_hash "
+ "ON kb_documents(content_hash)",
+ ),
+ )
+ await session.execute(
+ text(
+ "CREATE INDEX IF NOT EXISTS idx_doc_status "
+ "ON kb_documents(status)",
+ ),
+ )
+ await session.execute(
+ text(
+ "CREATE INDEX IF NOT EXISTS idx_doc_parent_doc_id "
+ "ON kb_documents(parent_doc_id)",
+ ),
+ )
# 创建多媒体表索引
await session.execute(
@@ -182,9 +215,126 @@ async def migrate_to_v1(self) -> None:
"ON kb_media(media_type)",
),
)
+ await self._ensure_ingestion_task_indexes(session)
await session.commit()
+    async def _ensure_column(
+        self,
+        session: AsyncSession,
+        *,
+        table_name: str,
+        column_name: str,
+        column_sql: str,
+    ) -> None:
+        """Add ``column_name`` to ``table_name`` unless the table already has it.
+
+        Existing columns are read with ``PRAGMA table_xinfo``. ``table_name``
+        and ``column_sql`` are interpolated into the SQL text directly, so they
+        must be trusted, hard-coded identifiers and never user input.
+        """
+        pragma = await session.execute(text(f"PRAGMA table_xinfo({table_name})"))
+        existing = {row[1] for row in pragma.fetchall()}
+        if column_name not in existing:
+            logger.info(
+                f"知识库数据库迁移: 为表 {table_name} 添加列 {column_name}",
+            )
+            await session.execute(
+                text(f"ALTER TABLE {table_name} ADD COLUMN {column_sql}")
+            )
+
+ async def _ensure_document_governance_columns(
+ self,
+ session: AsyncSession,
+ ) -> None:
+ """Add any missing governance columns to the ``kb_documents`` table."""
+ # Column name -> column definition passed to ALTER TABLE ... ADD COLUMN.
+ # Every NOT NULL column carries a DEFAULT.
+ columns = {
+ "source_type": "source_type TEXT NOT NULL DEFAULT 'file'",
+ "source_uri": "source_uri TEXT",
+ "content_hash": "content_hash VARCHAR(64)",
+ "parser_name": "parser_name VARCHAR(100)",
+ "parser_version": "parser_version VARCHAR(50)",
+ "chunker_name": "chunker_name VARCHAR(100)",
+ "chunker_version": "chunker_version VARCHAR(50)",
+ "status": "status TEXT NOT NULL DEFAULT 'ready'",
+ "error_stage": "error_stage VARCHAR(50)",
+ "error_message": "error_message TEXT",
+ "version": "version INTEGER NOT NULL DEFAULT 1",
+ "parent_doc_id": "parent_doc_id VARCHAR(36)",
+ "indexed_at": "indexed_at DATETIME",
+ }
+ for column_name, column_sql in columns.items():
+ await self._ensure_column(
+ session,
+ table_name="kb_documents",
+ column_name=column_name,
+ column_sql=column_sql,
+ )
+
+ async def _ensure_ingestion_task_table(self, session: AsyncSession) -> None:
+ """Create the ``kb_ingestion_tasks`` table if it does not exist yet."""
+ # progress / result / error are TEXT columns; the task helpers in this
+ # class JSON-encode and decode the values stored in them.
+ await session.execute(
+ text(
+ """
+ CREATE TABLE IF NOT EXISTS kb_ingestion_tasks (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ task_id VARCHAR(36) NOT NULL UNIQUE,
+ kb_id VARCHAR(36) NOT NULL,
+ task_type VARCHAR(30) NOT NULL,
+ status VARCHAR(20) NOT NULL DEFAULT 'pending',
+ progress_stage VARCHAR(50),
+ progress_current INTEGER NOT NULL DEFAULT 0,
+ progress_total INTEGER NOT NULL DEFAULT 100,
+ progress TEXT,
+ result TEXT,
+ error TEXT,
+ created_at DATETIME NOT NULL,
+ updated_at DATETIME NOT NULL
+ )
+ """,
+ ),
+ )
+
+    async def _ensure_ingestion_task_indexes(self, session: AsyncSession) -> None:
+        """Create the single-column indexes on ``kb_ingestion_tasks`` if missing."""
+        index_columns = (
+            ("idx_task_task_id", "task_id"),
+            ("idx_task_kb_id", "kb_id"),
+            ("idx_task_type", "task_type"),
+            ("idx_task_status", "status"),
+            ("idx_task_created_at", "created_at"),
+        )
+        for index_name, column_name in index_columns:
+            statement = text(
+                f"CREATE INDEX IF NOT EXISTS {index_name} "
+                f"ON kb_ingestion_tasks({column_name})",
+            )
+            await session.execute(statement)
+
+    @staticmethod
+    def _encode_json(value) -> str | None:
+        """Serialize ``value`` to JSON text, passing None through unchanged.
+
+        Non-ASCII characters are kept as-is and objects that are not JSON
+        serializable are converted with ``str``.
+        """
+        return (
+            None
+            if value is None
+            else json.dumps(value, ensure_ascii=False, default=str)
+        )
+
+    @staticmethod
+    def _decode_json(value: str | None):
+        """Parse JSON text; return None for None and the raw string if it is not valid JSON."""
+        if value is None:
+            return None
+        try:
+            decoded = json.loads(value)
+        except json.JSONDecodeError:
+            return value
+        return decoded
+
+    @classmethod
+    def _task_to_dict(cls, task: KBIngestionTask) -> dict:
+        """Convert an ingestion-task row into a plain dict.
+
+        ``progress``, ``result`` and ``error`` are JSON-decoded and the two
+        timestamps are rendered with ``isoformat()``.
+        """
+        plain_fields = (
+            "task_id",
+            "kb_id",
+            "task_type",
+            "status",
+            "progress_stage",
+            "progress_current",
+            "progress_total",
+        )
+        payload = {name: getattr(task, name) for name in plain_fields}
+        for name in ("progress", "result", "error"):
+            payload[name] = cls._decode_json(getattr(task, name))
+        payload["created_at"] = task.created_at.isoformat()
+        payload["updated_at"] = task.updated_at.isoformat()
+        return payload
+
async def close(self) -> None:
"""关闭数据库连接"""
await self.engine.dispose()
@@ -204,15 +354,22 @@ async def get_kb_by_name(self, kb_name: str) -> KnowledgeBase | None:
result = await session.execute(stmt)
return result.scalar_one_or_none()
- async def list_kbs(self, offset: int = 0, limit: int = 100) -> list[KnowledgeBase]:
+ async def list_kbs(
+ self,
+ offset: int = 0,
+ limit: int | None = None,
+ ) -> list[KnowledgeBase]:
"""列出所有知识库"""
async with self.get_db() as session:
stmt = (
select(KnowledgeBase)
.offset(offset)
- .limit(limit)
- .order_by(desc(KnowledgeBase.created_at))
+ .order_by(
+ desc(KnowledgeBase.created_at),
+ )
)
+ if limit is not None:
+ stmt = stmt.limit(limit)
result = await session.execute(stmt)
return list(result.scalars().all())
@@ -223,6 +380,146 @@ async def count_kbs(self) -> int:
result = await session.execute(stmt)
return result.scalar() or 0
+ # ===== 任务查询 =====
+
+    async def create_ingestion_task(
+        self,
+        *,
+        task_id: str,
+        kb_id: str,
+        task_type: str,
+        status: str = "pending",
+        progress_stage: str | None = None,
+        progress_current: int = 0,
+        progress_total: int = 100,
+        progress: dict | None = None,
+    ) -> dict:  # inserts one ingestion task row and returns it via _task_to_dict
+        task = KBIngestionTask(
+            task_id=task_id,
+            kb_id=kb_id,
+            task_type=task_type,
+            status=status,
+            progress_stage=progress_stage,
+            progress_current=progress_current,
+            progress_total=progress_total,
+            progress=self._encode_json(progress),  # dict is stored as JSON text
+        )
+        async with self.get_db() as session:
+            session.add(task)
+            await session.commit()
+            await session.refresh(task)  # reload after commit; presumably fills created_at/updated_at - verify on the model
+            return self._task_to_dict(task)
+
+    async def update_ingestion_task(
+        self,
+        task_id: str,
+        *,
+        status: str | object = _UNSET,
+        progress_stage: str | None | object = _UNSET,
+        progress_current: int | object = _UNSET,
+        progress_total: int | object = _UNSET,
+        progress: dict | None | object = _UNSET,
+        result: dict | None | object = _UNSET,
+        error: str | None | object = _UNSET,
+    ) -> dict | None:  # updated task as a dict, or None when task_id matches no row
+        async with self.get_db() as session:
+            stmt = select(KBIngestionTask).where(
+                col(KBIngestionTask.task_id) == task_id,
+            )
+            query_result = await session.execute(stmt)
+            task = query_result.scalar_one_or_none()
+            if task is None:
+                return None
+            # Only arguments that were passed (are not _UNSET) are written, so None can be stored.
+            if status is not _UNSET:
+                task.status = status  # type: ignore[assignment]
+            if progress_stage is not _UNSET:
+                task.progress_stage = progress_stage  # type: ignore[assignment]
+            if progress_current is not _UNSET:
+                task.progress_current = progress_current  # type: ignore[assignment]
+            if progress_total is not _UNSET:
+                task.progress_total = progress_total  # type: ignore[assignment]
+            if progress is not _UNSET:
+                task.progress = self._encode_json(progress)
+            if result is not _UNSET:
+                task.result = self._encode_json(result)
+            if error is not _UNSET:
+                task.error = self._encode_json(error)  # the error string is stored JSON-encoded too
+            task.updated_at = datetime.now(timezone.utc)  # bumped on every call, even with no field passed
+
+            session.add(task)
+            await session.commit()
+            await session.refresh(task)
+            return self._task_to_dict(task)
+
+    async def get_ingestion_task(self, task_id: str) -> dict | None:  # look up one task by task_id
+        async with self.get_db() as session:
+            stmt = select(KBIngestionTask).where(
+                col(KBIngestionTask.task_id) == task_id,
+            )
+            result = await session.execute(stmt)
+            task = result.scalar_one_or_none()
+            return self._task_to_dict(task) if task is not None else None  # None when nothing matches
+
+    @staticmethod
+    def _build_ingestion_task_conditions(
+        *,
+        kb_id: str | None = None,
+        status: str | None = None,
+        task_type: str | None = None,
+    ) -> list:  # equality filters shared by list_ingestion_tasks and count_ingestion_tasks
+        conditions = []  # stays empty (no filtering) when every argument is None
+        if kb_id is not None:  # only None skips a filter; "" would still be compared
+            conditions.append(col(KBIngestionTask.kb_id) == kb_id)
+        if status is not None:
+            conditions.append(col(KBIngestionTask.status) == status)
+        if task_type is not None:
+            conditions.append(col(KBIngestionTask.task_type) == task_type)
+        return conditions
+
+    async def list_ingestion_tasks(
+        self,
+        *,
+        kb_id: str | None = None,
+        status: str | None = None,
+        task_type: str | None = None,
+        offset: int = 0,
+        limit: int = 100,
+    ) -> list[dict]:  # one page of tasks as dicts, optionally filtered
+        conditions = self._build_ingestion_task_conditions(
+            kb_id=kb_id,
+            status=status,
+            task_type=task_type,
+        )
+
+        async with self.get_db() as session:
+            stmt = (
+                select(KBIngestionTask)
+                .where(*conditions)
+                .offset(offset)
+                .limit(limit)
+                .order_by(desc(KBIngestionTask.created_at))  # newest first
+            )
+            result = await session.execute(stmt)
+            return [self._task_to_dict(task) for task in result.scalars().all()]
+
+    async def count_ingestion_tasks(
+        self,
+        *,
+        kb_id: str | None = None,
+        status: str | None = None,
+        task_type: str | None = None,
+    ) -> int:  # number of tasks matching the same filters as list_ingestion_tasks
+        conditions = self._build_ingestion_task_conditions(
+            kb_id=kb_id,
+            status=status,
+            task_type=task_type,
+        )
+        async with self.get_db() as session:
+            stmt = select(func.count(col(KBIngestionTask.id))).where(*conditions)
+            result = await session.execute(stmt)
+            return result.scalar() or 0  # a None scalar is reported as 0
+
# ===== 文档查询 =====
async def get_document_by_id(self, doc_id: str) -> KBDocument | None:
@@ -232,17 +529,70 @@ async def get_document_by_id(self, doc_id: str) -> KBDocument | None:
result = await session.execute(stmt)
return result.scalar_one_or_none()
+    async def get_document_by_content_hash(
+        self,
+        *,
+        kb_id: str,
+        content_hash: str,
+    ) -> KBDocument | None:
+        """Return an existing active document with the same source content hash."""
+        async with self.get_db() as session:
+            stmt = (
+                select(KBDocument)
+                .where(
+                    col(KBDocument.kb_id) == kb_id,  # the lookup is scoped to one knowledge base
+                    col(KBDocument.content_hash) == content_hash,
+                    col(KBDocument.status) != "failed",  # "active" here means any status except "failed"
+                )
+                .order_by(desc(KBDocument.created_at))  # newest match wins
+                .limit(1)
+            )
+            result = await session.execute(stmt)
+            return result.scalar_one_or_none()
+
+    @staticmethod
+    def _build_document_filters(
+        *,
+        kb_id: str,
+        search: str | None = None,
+        status: str | None = None,
+        source_type: str | None = None,
+    ) -> list:  # WHERE conditions shared by list_documents_by_kb and count_documents_by_kb
+        conditions = [col(KBDocument.kb_id) == kb_id]  # kb_id is always applied
+        if search:  # truthiness test: "" is treated like None (no filter)
+            pattern = f"%{search}%"  # NOTE(review): "%" and "_" in search are not escaped, so they act as LIKE wildcards
+            conditions.append(
+                or_(
+                    col(KBDocument.doc_name).ilike(pattern),
+                    col(KBDocument.file_type).ilike(pattern),
+                ),
+            )
+        if status:
+            conditions.append(col(KBDocument.status) == status)
+        if source_type:
+            conditions.append(col(KBDocument.source_type) == source_type)
+        return conditions
+
async def list_documents_by_kb(
self,
kb_id: str,
offset: int = 0,
limit: int = 100,
+ search: str | None = None,
+ status: str | None = None,
+ source_type: str | None = None,
) -> list[KBDocument]:
"""列出知识库的所有文档"""
async with self.get_db() as session:
+ conditions = self._build_document_filters(
+ kb_id=kb_id,
+ search=search,
+ status=status,
+ source_type=source_type,
+ )
stmt = (
select(KBDocument)
- .where(col(KBDocument.kb_id) == kb_id)
+ .where(*conditions)
.offset(offset)
.limit(limit)
.order_by(desc(KBDocument.created_at))
@@ -250,12 +600,22 @@ async def list_documents_by_kb(
result = await session.execute(stmt)
return list(result.scalars().all())
- async def count_documents_by_kb(self, kb_id: str) -> int:
+ async def count_documents_by_kb(
+ self,
+ kb_id: str,
+ search: str | None = None,
+ status: str | None = None,
+ source_type: str | None = None,
+ ) -> int:
"""统计知识库的文档数量"""
async with self.get_db() as session:
- stmt = select(func.count(col(KBDocument.id))).where(
- col(KBDocument.kb_id) == kb_id,
+ conditions = self._build_document_filters(
+ kb_id=kb_id,
+ search=search,
+ status=status,
+ source_type=source_type,
)
+ stmt = select(func.count(col(KBDocument.id))).where(*conditions)
result = await session.execute(stmt)
return result.scalar() or 0
@@ -317,17 +677,98 @@ async def get_documents_with_metadata_batch(
return metadata_map
- async def delete_document_by_id(self, doc_id: str, vec_db: "FaissVecDB") -> None:
+ async def delete_document_by_id(
+ self,
+ doc_id: str,
+ vec_db: "FaissVecDB",
+ kb_id: str | None = None,
+ ) -> bool:
"""删除单个文档及其相关数据"""
- # 在知识库表中删除
+ doc = await self.get_document_by_id(doc_id)
+ if not doc or (kb_id is not None and doc.kb_id != kb_id):
+ return False
+
+ metadata_filters = {"kb_doc_id": doc_id}
+ if kb_id is not None:
+ metadata_filters["kb_id"] = kb_id
+
+ # 先删向量库;如果失败,保留 metadata 以便重试/修复。
+ await vec_db.delete_documents(metadata_filters=metadata_filters)
+
async with self.get_db() as session, session.begin():
- # 删除文档记录
delete_stmt = delete(KBDocument).where(col(KBDocument.doc_id) == doc_id)
+ if kb_id is not None:
+ delete_stmt = delete_stmt.where(col(KBDocument.kb_id) == kb_id)
await session.execute(delete_stmt)
- await session.commit()
+ await session.execute(delete(KBMedia).where(col(KBMedia.doc_id) == doc_id))
+
+ return True
+
+    async def delete_documents_by_ids(
+        self,
+        doc_ids: list[str],
+        vec_db: "FaissVecDB",
+        kb_id: str | None = None,
+    ) -> dict[str, bool]:
+        """Delete multiple documents together with their vector data.
+
+        Vector data is deleted first, then metadata. A vec_db failure for one
+        document does not block the others (best-effort); its metadata is kept for retry.
+        """
+        if not doc_ids:
+            return {}
+
+        requested_doc_ids = list(dict.fromkeys(doc_ids))  # de-duplicate, keeping first-seen order
+        results = dict.fromkeys(requested_doc_ids, False)  # every requested id starts as "not deleted"
+
+        candidates = requested_doc_ids  # NOTE(review): without kb_id, ids are not checked for existence
+        if kb_id is not None:  # keep only the ids that belong to kb_id
+            async with self.get_db() as session:
+                stmt = select(KBDocument.doc_id).where(
+                    col(KBDocument.doc_id).in_(requested_doc_ids),
+                    col(KBDocument.kb_id) == kb_id,
+                )
+                result = await session.execute(stmt)
+                candidates = [row[0] for row in result.fetchall()]
+
+        if not candidates:
+            return results
+
+        async def _delete_one(doc_id: str) -> tuple[str, bool]:  # (doc_id, vector deletion succeeded)
+            metadata_filters = {"kb_doc_id": doc_id}
+            if kb_id is not None:
+                metadata_filters["kb_id"] = kb_id
+            try:
+                await vec_db.delete_documents(metadata_filters=metadata_filters)
+                return doc_id, True
+            except Exception as e:  # best-effort: log and mark only this id as failed
+                logger.error(
+                    f"删除文档 {doc_id} 的向量数据失败: {e}",
+                )
+                return doc_id, False
+
+        vec_results = await asyncio.gather(  # one vector-deletion coroutine per candidate
+            *[_delete_one(doc_id) for doc_id in candidates],
+        )
+        successful_doc_ids = []
+        for doc_id, success in vec_results:
+            results[doc_id] = success
+            if success:
+                successful_doc_ids.append(doc_id)
+
+        if successful_doc_ids:  # metadata rows are removed only where the vector step succeeded
+            async with self.get_db() as session, session.begin():
+                delete_stmt = delete(KBDocument).where(
+                    col(KBDocument.doc_id).in_(successful_doc_ids),
+                )
+                if kb_id is not None:
+                    delete_stmt = delete_stmt.where(col(KBDocument.kb_id) == kb_id)
+                await session.execute(delete_stmt)
+                await session.execute(
+                    delete(KBMedia).where(col(KBMedia.doc_id).in_(successful_doc_ids)),
+                )
-        # 在 vec db 中删除相关向量
-        await vec_db.delete_documents(metadata_filters={"kb_doc_id": doc_id})
+        return results
# ===== 多媒体查询 =====
@@ -347,7 +788,7 @@ async def get_media_by_id(self, media_id: str) -> KBMedia | None:
async def update_kb_stats(self, kb_id: str, vec_db: "FaissVecDB") -> None:
"""更新知识库统计信息"""
- chunk_cnt = await vec_db.count_documents()
+ chunk_cnt = await vec_db.count_documents(metadata_filter={"kb_id": kb_id})
async with self.get_db() as session, session.begin():
update_stmt = (
@@ -363,3 +804,84 @@ async def update_kb_stats(self, kb_id: str, vec_db: "FaissVecDB") -> None:
await session.execute(update_stmt)
await session.commit()
+
+    async def get_kb_stats(self, kb_id: str) -> dict | None:
+        """Return persisted document statistics for a knowledge base."""
+        async with self.get_db() as session:
+            kb_result = await session.execute(
+                select(KnowledgeBase).where(col(KnowledgeBase.kb_id) == kb_id),
+            )
+            kb = kb_result.scalar_one_or_none()
+            if kb is None:  # unknown kb_id: no statistics at all
+                return None
+
+            status_result = await session.execute(  # document count per status value
+                select(KBDocument.status, func.count(col(KBDocument.id)))
+                .where(col(KBDocument.kb_id) == kb_id)
+                .group_by(KBDocument.status),
+            )
+            status_counts = {  # a NULL/empty status is reported under "unknown"
+                status or "unknown": count for status, count in status_result.all()
+            }
+
+            chunk_result = await session.execute(  # sum of per-document chunk_count, 0 when there are no rows
+                select(func.coalesce(func.sum(col(KBDocument.chunk_count)), 0)).where(
+                    col(KBDocument.kb_id) == kb_id,
+                ),
+            )
+            document_chunk_count = int(chunk_result.scalar() or 0)
+
+            media_result = await session.execute(
+                select(func.count(col(KBMedia.id))).where(col(KBMedia.kb_id) == kb_id),
+            )
+            media_count = int(media_result.scalar() or 0)
+            source_file_count_result = await session.execute(  # "file" documents with a non-empty file_path
+                select(func.count(col(KBDocument.id))).where(
+                    col(KBDocument.kb_id) == kb_id,
+                    col(KBDocument.source_type) == "file",
+                    col(KBDocument.file_path) != "",
+                ),
+            )
+            source_file_count = int(source_file_count_result.scalar() or 0)
+            document_storage_result = await session.execute(  # file_size summed over documents with a file_path
+                select(func.coalesce(func.sum(col(KBDocument.file_size)), 0)).where(
+                    col(KBDocument.kb_id) == kb_id,
+                    col(KBDocument.file_path) != "",
+                ),
+            )
+            document_storage_bytes = int(document_storage_result.scalar() or 0)
+            media_storage_result = await session.execute(
+                select(func.coalesce(func.sum(col(KBMedia.file_size)), 0)).where(
+                    col(KBMedia.kb_id) == kb_id,
+                ),
+            )
+            media_storage_bytes = int(media_storage_result.scalar() or 0)
+
+        document_count = sum(status_counts.values())  # all documents, whatever their status
+        ready_document_count = status_counts.get("ready", 0)
+        failed_document_count = status_counts.get("failed", 0)
+        pending_document_count = status_counts.get("pending", 0)
+        processing_document_count = sum(  # "processing" = parsing + chunking + embedding
+            status_counts.get(status, 0)
+            for status in ("parsing", "chunking", "embedding")
+        )
+
+        return {
+            "kb_id": kb.kb_id,
+            "kb_name": kb.kb_name,
+            "doc_count": kb.doc_count,  # value stored on the KnowledgeBase row
+            "chunk_count": kb.chunk_count,
+            "document_count": document_count,  # value counted from KBDocument rows above
+            "ready_document_count": ready_document_count,
+            "failed_document_count": failed_document_count,
+            "pending_document_count": pending_document_count,
+            "processing_document_count": processing_document_count,
+            "indexed_chunk_count": kb.chunk_count,  # same stored value as "chunk_count"
+            "document_chunk_count": document_chunk_count,
+            "media_count": media_count,
+            "source_file_count": source_file_count,
+            "storage_bytes": document_storage_bytes + media_storage_bytes,
+            "status_counts": status_counts,
+            "created_at": kb.created_at.isoformat(),
+            "updated_at": kb.updated_at.isoformat(),
+        }
diff --git a/astrbot/core/knowledge_base/kb_helper.py b/astrbot/core/knowledge_base/kb_helper.py
index c29e45876d..36a597bdf0 100644
--- a/astrbot/core/knowledge_base/kb_helper.py
+++ b/astrbot/core/knowledge_base/kb_helper.py
@@ -3,6 +3,7 @@
import re
import time
import uuid
+from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING
@@ -11,7 +12,6 @@
from astrbot.core import logger
from astrbot.core.db.vec_db.base import BaseVecDB
from astrbot.core.exceptions import KnowledgeBaseUploadError
-from astrbot.core.provider.manager import ProviderManager
from astrbot.core.provider.provider import (
EmbeddingProvider,
RerankProvider,
@@ -20,17 +20,59 @@
Provider as LLMProvider,
)
+from .capabilities import (
+ DEFAULT_CHUNK_OVERLAP,
+ DEFAULT_CHUNK_SIZE,
+ DEFAULT_UPLOAD_BATCH_SIZE,
+ DEFAULT_UPLOAD_MAX_RETRIES,
+ DEFAULT_UPLOAD_TASKS_LIMIT,
+)
from .chunking.base import BaseChunker
from .chunking.markdown import MarkdownChunker
from .chunking.recursive import RecursiveCharacterChunker
+from .document_metadata import (
+ DEFAULT_CHUNKER_VERSION,
+ DEFAULT_PARSER_VERSION,
+ build_content_hash,
+ build_stored_source_path,
+ get_chunker_name,
+ get_parser_name,
+)
from .kb_db_sqlite import KBSQLiteDatabase
from .models import KBDocument, KBMedia, KnowledgeBase
-from .parsers.url_parser import extract_text_from_url
+from .parsers.base import TextSegment
+from .parsers.url_parser import URLExtractor, extract_text_from_url
from .parsers.util import select_parser
from .prompts import TEXT_REPAIR_SYSTEM_PROMPT
if TYPE_CHECKING:
from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB
+ from astrbot.core.provider.manager import ProviderManager
+
+
+DOCUMENT_REBUILD_PAGE_SIZE = 100  # presumably a paging size for rebuilds; usage is outside this hunk - verify
+CONSISTENCY_CHECK_PAGE_SIZE = 1000  # presumably a paging size for consistency checks - verify
+CONSISTENCY_REPAIR_TYPES = frozenset(
+    {
+        "orphan_vectors",
+        "chunk_count_mismatches",
+    },
+)
+NON_PERSISTED_FAILURE_STAGES = frozenset({"deduplication"})  # stages _persist_failed_document never records
+MARKDOWN_AWARE_EXTENSIONS = frozenset(  # suffixes for which upload_document switches to MarkdownChunker
+    {
+        ".adoc",
+        ".docx",
+        ".epub",
+        ".md",
+        ".markdown",
+        ".mdx",
+        ".mkd",
+        ".rst",
+        ".xls",
+        ".xlsx",
+    },
+)
class RateLimiter:
@@ -40,18 +82,20 @@ def __init__(self, max_rpm: int) -> None:
self.max_per_minute = max_rpm
self.interval = 60.0 / max_rpm if max_rpm > 0 else 0
self.last_call_time = 0
+ self._lock = asyncio.Lock()
async def __aenter__(self):
if self.interval == 0:
return
- now = time.monotonic()
- elapsed = now - self.last_call_time
+ async with self._lock:
+ now = time.monotonic()
+ elapsed = now - self.last_call_time
- if elapsed < self.interval:
- await asyncio.sleep(self.interval - elapsed)
+ if elapsed < self.interval:
+ await asyncio.sleep(self.interval - elapsed)
- self.last_call_time = time.monotonic()
+ self.last_call_time = time.monotonic()
async def __aexit__(self, exc_type, exc_val, exc_tb):
pass
@@ -114,6 +158,114 @@ def _compact_chunks(chunks: list[str]) -> list[str]:
return [chunk.strip() for chunk in chunks if chunk and chunk.strip()]
+def _estimate_text_tokens(text: str) -> int:  # rough token estimate from character counts
+    chinese_count = sum(1 for char in text if "\u4e00" <= char <= "\u9fff")  # chars in U+4E00..U+9FFF
+    other_count = len(text) - chinese_count
+    return int(chinese_count * 0.6 + other_count * 0.3)  # heuristic weights; int() truncates
+
+
+def _build_chunk_metadata(
+    *,
+    kb_id: str,
+    doc_id: str,
+    chunks_text: list[str],
+    chunk_ids: list[str],
+    chunk_extra_metadatas: list[dict] | None = None,
+) -> list[dict]:  # one metadata dict per chunk, in chunk order
+    if chunk_extra_metadatas is not None and len(chunk_extra_metadatas) != len(
+        chunks_text
+    ):
+        raise ValueError("chunk_extra_metadatas length must match chunks_text length")
+
+    metadatas = []
+    start_offset = 0  # NOTE(review): offsets are running sums of chunk lengths, not positions looked up in the source text
+    for idx, chunk_text in enumerate(chunks_text):
+        end_offset = start_offset + len(chunk_text)
+        metadata = {
+            "kb_id": kb_id,
+            "kb_doc_id": doc_id,
+            "chunk_index": idx,
+            "section_index": idx,  # default; replaced below when the extra metadata has its own
+            "content_hash": build_content_hash(chunk_text),
+            "char_count": len(chunk_text),
+            "token_count_estimate": _estimate_text_tokens(chunk_text),
+            "start_offset": start_offset,
+            "end_offset": end_offset,
+            "previous_chunk_id": chunk_ids[idx - 1] if idx > 0 else None,  # None for the first chunk
+            "next_chunk_id": chunk_ids[idx + 1] if idx < len(chunk_ids) - 1 else None,
+        }
+        if chunk_extra_metadatas is not None:
+            metadata.update(chunk_extra_metadatas[idx])  # extra keys override the defaults above
+        metadatas.append(metadata)
+        start_offset = end_offset
+    return metadatas
+
+
+async def _chunk_text_with_metadata(
+    *,
+    chunker: BaseChunker,
+    text: str,
+    chunk_size: int,
+    chunk_overlap: int,
+    extra_metadata: dict | None = None,
+) -> tuple[list[str], list[dict] | None]:  # (chunks, per-chunk extra metadata or None)
+    chunks_text = await chunker.chunk(
+        text,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+    )
+    chunks_text = _compact_chunks(chunks_text)  # strips each chunk and drops empty ones
+    if not chunks_text:
+        return [], [] if extra_metadata is not None else None  # parses as ([], ([] if ... else None))
+    if extra_metadata is None:
+        return chunks_text, None
+    return chunks_text, [dict(extra_metadata) for _ in chunks_text]  # each chunk gets its own shallow copy
+
+
+async def _chunk_text_segments_with_metadata(
+    *,
+    chunker: BaseChunker,
+    text_segments: list[TextSegment],
+    chunk_size: int,
+    chunk_overlap: int,
+) -> tuple[list[str], list[dict]]:  # chunks of all segments in order, with parallel metadata list
+    chunks_text: list[str] = []
+    chunk_extra_metadatas: list[dict] = []
+    for segment in text_segments:  # each segment is chunked on its own, so no chunk spans two segments
+        segment_text = getattr(segment, "text", "")  # a segment without .text contributes no chunks
+        segment_metadata = getattr(segment, "metadata", None) or {}  # never None, so a metadata list always comes back
+        segment_chunks, segment_metadatas = await _chunk_text_with_metadata(
+            chunker=chunker,
+            text=segment_text,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            extra_metadata=segment_metadata,
+        )
+        chunks_text.extend(segment_chunks)
+        chunk_extra_metadatas.extend(segment_metadatas or [])
+    return chunks_text, chunk_extra_metadatas
+
+
+def _build_duplicate_document_error(
+    *,
+    file_name: str,
+    content_hash: str,
+    existing_doc: KBDocument,
+) -> KnowledgeBaseUploadError:  # builds the error; the caller raises it
+    return KnowledgeBaseUploadError(
+        stage="deduplication",  # this stage is listed in NON_PERSISTED_FAILURE_STAGES
+        user_message=(
+            f"重复文档:{file_name} 与已存在文档 {existing_doc.doc_name} 内容相同。"
+        ),
+        details={  # identifies the existing document that has the same content hash
+            "file_name": file_name,
+            "content_hash": content_hash,
+            "existing_doc_id": existing_doc.doc_id,
+            "existing_doc_name": existing_doc.doc_name,
+        },
+    )
+
+
class KBHelper:
vec_db: BaseVecDB
kb: KnowledgeBase
@@ -123,7 +275,7 @@ def __init__(
self,
kb_db: KBSQLiteDatabase,
kb: KnowledgeBase,
- provider_manager: ProviderManager,
+ provider_manager: "ProviderManager",
kb_root_dir: str,
chunker: BaseChunker,
) -> None:
@@ -133,6 +285,8 @@ def __init__(
self.kb_root_dir = kb_root_dir
self.chunker = chunker
self.init_error = None
+ self.init_retry_count = 0
+ self.last_init_retry_at = 0.0
self.kb_dir = Path(self.kb_root_dir) / self.kb.kb_id
self.kb_medias_dir = Path(self.kb_dir) / "medias" / self.kb.kb_id
@@ -189,6 +343,7 @@ async def _ensure_vec_db(self) -> "FaissVecDB":
index_store_path=str(self.kb_dir / "index.faiss"),
embedding_provider=ep,
rerank_provider=rp,
+ index_type=self.kb.index_type or "flat",
)
await vec_db.initialize()
self.vec_db = vec_db
@@ -208,18 +363,162 @@ async def terminate(self) -> None:
if hasattr(self, "vec_db") and self.vec_db:
await self.vec_db.close()
+    async def _ensure_not_duplicate_document(
+        self,
+        *,
+        file_name: str,
+        content_hash: str | None,
+    ) -> None:  # raises KnowledgeBaseUploadError(stage="deduplication") on a duplicate or a failed lookup
+        if not content_hash:  # None or "": nothing to compare, so the check is skipped
+            return
+        try:
+            existing_doc = await self.kb_db.get_document_by_content_hash(
+                kb_id=self.kb.kb_id,
+                content_hash=content_hash,
+            )
+        except KnowledgeBaseUploadError:  # already an upload error: pass it through unchanged
+            raise
+        except Exception as exc:  # any other lookup failure is wrapped, keeping exc as the cause
+            raise KnowledgeBaseUploadError(
+                stage="deduplication",
+                user_message=("重复检测失败:无法确认文档是否已存在,请稍后重试。"),
+                details={"file_name": file_name, "content_hash": content_hash},
+            ) from exc
+        if existing_doc is not None:
+            raise _build_duplicate_document_error(
+                file_name=file_name,
+                content_hash=content_hash,
+                existing_doc=existing_doc,
+            )
+
+    @staticmethod
+    def _get_upload_failure_stage(error: Exception) -> str:  # stage label stored with a failed document
+        if isinstance(error, KnowledgeBaseUploadError):
+            return error.stage
+        return "unknown"  # any other exception type carries no stage
+
+    async def _persist_failed_document(
+        self,
+        *,
+        doc_id: str,
+        file_name: str,
+        file_type: str,
+        file_size: int,
+        stored_file_path: Path | None,
+        source_type: str,
+        source_uri: str,
+        content_hash: str | None,
+        parser_name: str | None,
+        chunker_name: str | None,
+        parent_doc_id: str | None,
+        document_version: int,
+        error: Exception,
+    ) -> bool:  # True only when the failed-document row was saved
+        """Persist a failed document record for ingestion diagnostics."""
+        error_stage = self._get_upload_failure_stage(error)
+        if error_stage in NON_PERSISTED_FAILURE_STAGES:  # these stages are deliberately not recorded
+            return False
+
+        failed_doc = KBDocument(
+            doc_id=doc_id,
+            kb_id=self.kb.kb_id,
+            doc_name=file_name,
+            file_type=file_type,
+            file_size=file_size,
+            file_path=str(stored_file_path) if stored_file_path else "",  # "" when no source file was stored
+            source_type=source_type,
+            source_uri=source_uri,
+            content_hash=content_hash,
+            parser_name=parser_name,
+            parser_version=DEFAULT_PARSER_VERSION if parser_name else None,  # version only alongside a name
+            chunker_name=chunker_name,
+            chunker_version=DEFAULT_CHUNKER_VERSION if chunker_name else None,
+            status="failed",
+            error_stage=error_stage,
+            error_message=str(error).strip() or error.__class__.__name__,  # class name when the message is empty
+            version=document_version,
+            parent_doc_id=parent_doc_id,
+        )
+
+        try:
+            async with self.kb_db.get_db() as session:
+                async with session.begin():
+                    session.add(failed_doc)
+                    await session.commit()
+                    await session.refresh(failed_doc)
+        except Exception as persist_err:  # best-effort: a failure to record is logged, never raised
+            logger.warning(
+                f"记录失败文档 {doc_id} 的元数据失败: {persist_err}",
+            )
+            return False
+
+        try:
+            await self.kb_db.update_kb_stats(
+                kb_id=self.kb.kb_id,
+                vec_db=self.vec_db,  # type: ignore[arg-type]
+            )
+            await self.refresh_kb()
+            await self.refresh_document(doc_id)
+        except Exception as stats_err:  # the row is already saved; a stats refresh failure is only logged
+            logger.warning(
+                f"刷新失败文档 {doc_id} 的知识库统计失败: {stats_err}",
+            )
+        return True
+
+    @staticmethod
+    def _build_url_file_name(url: str) -> str:  # display name derived from a URL
+        file_name = url.split("/")[-1] or f"document_from_{url}"  # NOTE(review): last segment may include "?query"/"#fragment"
+        if not Path(file_name).suffix:
+            file_name += ".url"  # names without an extension get ".url"
+        return file_name
+
+    async def _persist_failed_url_document(
+        self,
+        *,
+        url: str,
+        text_content: str | None,
+        parent_doc_id: str | None,
+        document_version: int,
+        error: Exception,
+    ) -> bool:  # URL-specific wrapper around _persist_failed_document; returns its result
+        return await self._persist_failed_document(
+            doc_id=str(uuid.uuid4()),  # the failed record gets a fresh id
+            file_name=self._build_url_file_name(url),
+            file_type="url",
+            file_size=len(text_content) if text_content else 0,  # len() of a str: characters, not bytes
+            stored_file_path=None,  # no stored source file is passed for URL documents
+            source_type="url",
+            source_uri=url,
+            content_hash=(
+                build_content_hash(text_content) if text_content is not None else None
+            ),
+            parser_name=URLExtractor.__name__,
+            chunker_name=get_chunker_name(self.chunker),
+            parent_doc_id=parent_doc_id,
+            document_version=document_version,
+            error=error,
+        )
+
async def upload_document(
self,
file_name: str,
file_content: bytes | None,
file_type: str,
- chunk_size: int = 512,
- chunk_overlap: int = 50,
- batch_size: int = 32,
- tasks_limit: int = 3,
- max_retries: int = 3,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+ chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+ batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE,
+ tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT,
+ max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
progress_callback=None,
pre_chunked_text: list[str] | None = None,
+ source_type: str | None = None,
+ source_uri: str | None = None,
+ source_content_hash: str | None = None,
+ source_parser_name: str | None = None,
+ source_chunker_name: str | None = None,
+ parent_doc_id: str | None = None,
+ document_version: int = 1,
+ skip_duplicate_check: bool = False,
) -> KBDocument:
"""上传并处理文档(带原子性保证和失败清理)
@@ -242,20 +541,37 @@ async def upload_document(
await self._ensure_vec_db()
doc_id = str(uuid.uuid4())
media_paths: list[Path] = []
+ stored_file_path: Path | None = None
file_size = 0
-
- # file_path = self.kb_files_dir / f"{doc_id}.{file_type}"
- # async with aiofiles.open(file_path, "wb") as f:
- # await f.write(file_content)
+ vectors_stored = False # 标记向量是否已写入, 用于失败回滚
+ metadata_stored = False
+ failed_metadata_stored = False
+ effective_source_type = source_type or (
+ "import" if pre_chunked_text is not None else "file"
+ )
+ effective_source_uri = source_uri or file_name
+ content_hash: str | None = source_content_hash
+ parser_name: str | None = source_parser_name
+ chunker_name: str | None = source_chunker_name
try:
chunks_text = []
+ chunk_extra_metadatas: list[dict] | None = None
saved_media = []
if pre_chunked_text is not None:
# 如果提供了预分块文本,直接使用
chunks_text = _compact_chunks(pre_chunked_text)
file_size = sum(len(chunk) for chunk in chunks_text)
+ if content_hash is None:
+ content_hash = build_content_hash(chunks_text)
+ if chunker_name is None:
+ chunker_name = "pre_chunked"
+ if not skip_duplicate_check:
+ await self._ensure_not_duplicate_document(
+ file_name=file_name,
+ content_hash=content_hash,
+ )
logger.info(f"使用预分块文本进行上传,共 {len(chunks_text)} 个块。")
else:
# 否则,执行标准的文件解析和分块流程
@@ -265,6 +581,22 @@ async def upload_document(
)
file_size = len(file_content)
+ content_hash = build_content_hash(file_content)
+ if not skip_duplicate_check:
+ await self._ensure_not_duplicate_document(
+ file_name=file_name,
+ content_hash=content_hash,
+ )
+
+ stored_file_path = build_stored_source_path(
+ self.kb_files_dir,
+ doc_id=doc_id,
+ file_name=file_name,
+ file_type=file_type,
+ )
+ stored_file_path.parent.mkdir(parents=True, exist_ok=True)
+ async with aiofiles.open(stored_file_path, "wb") as f:
+ await f.write(file_content)
# 阶段1: 解析文档
if progress_callback:
@@ -272,6 +604,7 @@ async def upload_document(
try:
parser = await select_parser(f".{file_type}")
+ parser_name = get_parser_name(parser)
parse_result = await parser.parse(file_content, file_name)
except KnowledgeBaseUploadError:
raise
@@ -286,6 +619,7 @@ async def upload_document(
) from exc
text_content = parse_result.text
media_items = parse_result.media
+ text_segments = getattr(parse_result, "text_segments", None)
if not text_content or not text_content.strip():
raise KnowledgeBaseUploadError(
stage="parsing",
@@ -316,24 +650,58 @@ async def upload_document(
await progress_callback("chunking", 0, 100)
try:
- # 根据文件类型选择分块器:Markdown 文件使用结构感知分块
+ # Use structure-aware chunking for Markdown and MarkItDown output.
effective_chunker = self.chunker
file_ext = Path(file_name).suffix.lower() if file_name else ""
- if file_ext in (".md", ".markdown", ".mkd", ".mdx"):
+ if file_ext in MARKDOWN_AWARE_EXTENSIONS:
effective_chunker = MarkdownChunker(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
logger.info(
- f"检测到 Markdown 文件 '{file_name}',使用 MarkdownChunker 进行结构化分块"
+ f"检测到 Markdown 兼容文档 '{file_name}',使用 MarkdownChunker 进行结构化分块"
)
- chunks_text = await effective_chunker.chunk(
- text_content,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
- )
- chunks_text = _compact_chunks(chunks_text)
+ chunker_name = get_chunker_name(effective_chunker)
+ if isinstance(effective_chunker, MarkdownChunker):
+ structured_chunks = await effective_chunker.chunk_with_metadata(
+ text_content,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ )
+ chunks_text = []
+ chunk_extra_metadatas = []
+ for chunk in structured_chunks:
+ chunk_text = chunk.text.strip()
+ if not chunk_text:
+ continue
+ chunks_text.append(chunk_text)
+ chunk_extra_metadatas.append(
+ {
+ "title_path": chunk.title_path,
+ "section_index": chunk.section_index,
+ }
+ )
+ elif text_segments:
+ (
+ chunks_text,
+ chunk_extra_metadatas,
+ ) = await _chunk_text_segments_with_metadata(
+ chunker=effective_chunker,
+ text_segments=text_segments,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ )
+ else:
+ (
+ chunks_text,
+ chunk_extra_metadatas,
+ ) = await _chunk_text_with_metadata(
+ chunker=effective_chunker,
+ text=text_content,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ )
except KnowledgeBaseUploadError:
raise
except Exception as exc:
@@ -363,16 +731,16 @@ async def upload_document(
)
contents = []
- metadatas = []
for idx, chunk_text in enumerate(chunks_text):
contents.append(chunk_text)
- metadatas.append(
- {
- "kb_id": self.kb.kb_id,
- "kb_doc_id": doc_id,
- "chunk_index": idx,
- },
- )
+ chunk_ids = [str(uuid.uuid4()) for _ in chunks_text]
+ metadatas = _build_chunk_metadata(
+ kb_id=self.kb.kb_id,
+ doc_id=doc_id,
+ chunks_text=chunks_text,
+ chunk_ids=chunk_ids,
+ chunk_extra_metadatas=chunk_extra_metadatas,
+ )
if progress_callback:
await progress_callback("chunking", 100, 100)
@@ -386,11 +754,13 @@ async def embedding_progress_callback(current, total) -> None:
await self.vec_db.insert_batch(
contents=contents,
metadatas=metadatas,
+ ids=chunk_ids,
batch_size=batch_size,
tasks_limit=tasks_limit,
max_retries=max_retries,
progress_callback=embedding_progress_callback,
)
+ vectors_stored = True
except KnowledgeBaseUploadError:
raise
except Exception as exc:
@@ -407,10 +777,20 @@ async def embedding_progress_callback(current, total) -> None:
doc_name=file_name,
file_type=file_type,
file_size=file_size,
- # file_path=str(file_path),
- file_path="",
+ file_path=str(stored_file_path) if stored_file_path else "",
+ source_type=effective_source_type,
+ source_uri=effective_source_uri,
+ content_hash=content_hash,
+ parser_name=parser_name,
+ parser_version=DEFAULT_PARSER_VERSION if parser_name else None,
+ chunker_name=chunker_name,
+ chunker_version=DEFAULT_CHUNKER_VERSION if chunker_name else None,
+ status="ready",
+ indexed_at=datetime.now(timezone.utc),
+ version=document_version,
+ parent_doc_id=parent_doc_id,
chunk_count=len(chunks_text),
- media_count=0,
+ media_count=len(saved_media),
)
try:
async with self.kb_db.get_db() as session:
@@ -419,6 +799,7 @@ async def embedding_progress_callback(current, total) -> None:
for media in saved_media:
session.add(media)
await session.commit()
+ metadata_stored = True
await session.refresh(doc)
except KnowledgeBaseUploadError:
@@ -453,15 +834,57 @@ async def embedding_progress_callback(current, total) -> None:
logger.warning(f"上传文档失败: {e}", extra={"details": e.details})
else:
logger.error(f"上传文档失败: {e}", exc_info=True)
- # if file_path.exists():
- # file_path.unlink()
- for media_path in media_paths:
+ # 回滚已写入的向量, 防止孤数据
+ if vectors_stored and not metadata_stored:
try:
- if media_path.exists():
- media_path.unlink()
- except Exception as me:
- logger.warning(f"清理多媒体文件失败 {media_path}: {me}")
+ vec_db: FaissVecDB = self.vec_db # type: ignore
+ await vec_db.delete_documents(
+ metadata_filters={"kb_doc_id": doc_id},
+ )
+ logger.info(f"已清理文档 {doc_id} 的孤数据向量")
+ except Exception as cleanup_err:
+ logger.error(
+ f"清理文档 {doc_id} 向量回滚失败: {cleanup_err}",
+ )
+
+ if not metadata_stored:
+ failed_metadata_stored = await self._persist_failed_document(
+ doc_id=doc_id,
+ file_name=file_name,
+ file_type=file_type,
+ file_size=file_size,
+ stored_file_path=stored_file_path,
+ source_type=effective_source_type,
+ source_uri=effective_source_uri,
+ content_hash=content_hash,
+ parser_name=parser_name,
+ chunker_name=chunker_name,
+ parent_doc_id=parent_doc_id,
+ document_version=document_version,
+ error=e,
+ )
+
+ if (
+ stored_file_path
+ and stored_file_path.exists()
+ and not metadata_stored
+ and not failed_metadata_stored
+ ):
+ try:
+ stored_file_path.unlink()
+ if stored_file_path.parent != self.kb_files_dir:
+ stored_file_path.parent.rmdir()
+ except Exception as fe:
+ logger.warning(f"清理原始文件失败 {stored_file_path}: {fe}")
+
+ if not metadata_stored:
+ for media_path in media_paths:
+ try:
+ if media_path.exists():
+ media_path.unlink()
+ except Exception as me:
+ logger.warning(f"清理多媒体文件失败 {media_path}: {me}")
raise
@@ -469,32 +892,379 @@ async def list_documents(
self,
offset: int = 0,
limit: int = 100,
+ search: str | None = None,
+ status: str | None = None,
+ source_type: str | None = None,
) -> list[KBDocument]:
"""列出知识库的所有文档"""
- docs = await self.kb_db.list_documents_by_kb(self.kb.kb_id, offset, limit)
+ docs = await self.kb_db.list_documents_by_kb(
+ self.kb.kb_id,
+ offset,
+ limit,
+ search,
+ status=status,
+ source_type=source_type,
+ )
return docs
+ async def count_documents(
+ self,
+ search: str | None = None,
+ status: str | None = None,
+ source_type: str | None = None,
+ ) -> int:
+ """Count the documents in this knowledge base.
+
+ The optional ``search``, ``status`` and ``source_type`` filters are
+ forwarded unchanged to ``kb_db.count_documents_by_kb`` together with
+ ``self.kb.kb_id``.
+ """
+ return await self.kb_db.count_documents_by_kb(
+ self.kb.kb_id,
+ search,
+ status=status,
+ source_type=source_type,
+ )
+
async def get_document(self, doc_id: str) -> KBDocument | None:
"""获取单个文档"""
doc = await self.kb_db.get_document_by_id(doc_id)
+ if doc and doc.kb_id != self.kb.kb_id:
+ return None
return doc
async def delete_document(self, doc_id: str) -> None:
"""删除单个文档及其相关数据"""
- await self.kb_db.delete_document_by_id(
+ doc = await self.get_document(doc_id)
+ if not doc:
+ raise ValueError(f"无法找到 ID 为 {doc_id} 的文档")
+ media_items = await self.kb_db.list_media_by_doc(doc_id)
+ deleted = await self.kb_db.delete_document_by_id(
doc_id=doc_id,
vec_db=self.vec_db, # type: ignore
+ kb_id=self.kb.kb_id,
)
+ if not deleted:
+ raise ValueError(f"无法找到 ID 为 {doc_id} 的文档")
+ self._cleanup_document_files(doc, media_items)
await self.kb_db.update_kb_stats(
kb_id=self.kb.kb_id,
vec_db=self.vec_db, # type: ignore
)
await self.refresh_kb()
+ async def delete_documents(self, doc_ids: list[str]) -> dict[str, bool]:
+ """Delete several documents in one call and refresh statistics once.
+
+ Original author's note: a vec_db deletion failure for one document does
+ not block the others (best-effort). That behaviour would live in
+ ``kb_db.delete_documents_by_ids`` and is not visible in this method.
+
+ Returns the mapping produced by ``kb_db.delete_documents_by_ids``, which
+ is iterated here as ``doc_id -> deleted`` pairs.
+ """
+ # Look up documents and their media before deleting so the files can still
+ # be located afterwards. dict.fromkeys() drops duplicate ids while keeping
+ # order; ids for which get_document() returns None are left out.
+ docs_by_id = {
+ doc_id: doc
+ for doc_id in dict.fromkeys(doc_ids)
+ if (doc := await self.get_document(doc_id)) is not None
+ }
+ media_by_doc_id = {
+ doc_id: await self.kb_db.list_media_by_doc(doc_id) for doc_id in docs_by_id
+ }
+ results = await self.kb_db.delete_documents_by_ids(
+ doc_ids=doc_ids,
+ vec_db=self.vec_db, # type: ignore
+ kb_id=self.kb.kb_id,
+ )
+ # Remove on-disk files only for documents that were reported as deleted.
+ for doc_id, deleted in results.items():
+ if deleted and doc_id in docs_by_id:
+ self._cleanup_document_files(
+ docs_by_id[doc_id],
+ media_by_doc_id.get(doc_id, []),
+ )
+ # Statistics are recomputed once for the whole batch.
+ await self.kb_db.update_kb_stats(
+ kb_id=self.kb.kb_id,
+ vec_db=self.vec_db, # type: ignore
+ )
+ await self.refresh_kb()
+ return results
+
+ async def rebuild_document(
+ self,
+ doc_id: str,
+ *,
+ chunk_size: int | None = None,
+ chunk_overlap: int | None = None,
+ batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE,
+ tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT,
+ max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
+ progress_callback=None,
+ ) -> KBDocument:
+ """Rebuild one document by re-ingesting it from its recorded source.
+
+ The new version is uploaded first and the old document is deleted only
+ after that upload has returned. If deleting the old document fails, the
+ freshly uploaded one is deleted again (best-effort) and a
+ ``KnowledgeBaseUploadError`` is raised.
+
+ Supported sources:
+ - ``file``: re-read from ``doc.file_path``, which must exist inside
+ ``kb_files_dir``.
+ - ``url``: re-fetched from ``doc.source_uri`` via ``upload_from_url``.
+ - ``import``: re-uses the chunk texts currently stored for the document.
+
+ Args:
+ doc_id: ID of the document to rebuild.
+ chunk_size: Chunk size override; when ``None`` the knowledge base
+ setting is used, falling back to ``DEFAULT_CHUNK_SIZE``.
+ chunk_overlap: Chunk overlap override; when ``None`` the knowledge
+ base setting is used, falling back to ``DEFAULT_CHUNK_OVERLAP``.
+ batch_size: Forwarded to the upload call.
+ tasks_limit: Forwarded to the upload call.
+ max_retries: Forwarded to the upload call.
+ progress_callback: Forwarded to the upload call.
+
+ Returns:
+ The document returned by the upload call (the new version).
+
+ Raises:
+ ValueError: The document does not exist, its source is unavailable,
+ or its source type is not supported.
+ KnowledgeBaseUploadError: With ``stage="rebuild"`` when the old
+ document could not be deleted after the new one was created.
+ """
+ doc = await self.get_document(doc_id)
+ if not doc:
+ raise ValueError(f"无法找到 ID 为 {doc_id} 的文档")
+ # A missing/zero version is treated as version 1 before incrementing.
+ next_version = (doc.version or 1) + 1
+ # Keep pointing at the original ancestor when the document already has one.
+ parent_doc_id = doc.parent_doc_id or doc.doc_id
+ # `or` also replaces a falsy (e.g. 0) knowledge base setting with the default.
+ effective_chunk_size = (
+ chunk_size
+ if chunk_size is not None
+ else self.kb.chunk_size or DEFAULT_CHUNK_SIZE
+ )
+ effective_chunk_overlap = (
+ chunk_overlap
+ if chunk_overlap is not None
+ else self.kb.chunk_overlap or DEFAULT_CHUNK_OVERLAP
+ )
+
+ if doc.source_type == "file" and doc.file_path:
+ # Resolve both paths so the containment check cannot be bypassed with
+ # relative segments; only files under kb_files_dir are re-read.
+ source_path = Path(doc.file_path).resolve(strict=False)
+ files_root = self.kb_files_dir.resolve(strict=False)
+ if not source_path.is_relative_to(files_root) or not source_path.exists():
+ raise ValueError("无法找到可用于重建的原始文件")
+
+ rebuilt_doc = await self.upload_document(
+ file_name=doc.doc_name,
+ file_content=source_path.read_bytes(),
+ file_type=doc.file_type,
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ progress_callback=progress_callback,
+ source_type=doc.source_type,
+ source_uri=doc.source_uri or doc.doc_name,
+ parent_doc_id=parent_doc_id,
+ document_version=next_version,
+ skip_duplicate_check=True,
+ )
+ elif doc.source_type == "url":
+ if not doc.source_uri:
+ raise ValueError("无法找到可用于重建的 URL 来源")
+ rebuilt_doc = await self.upload_from_url(
+ url=doc.source_uri,
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ progress_callback=progress_callback,
+ parent_doc_id=parent_doc_id,
+ document_version=next_version,
+ skip_duplicate_check=True,
+ )
+ elif doc.source_type == "import":
+ # Imported documents have no original file; their stored chunk texts
+ # are fed back in as pre-chunked text.
+ imported_chunks = await self._get_import_rebuild_chunks(doc.doc_id)
+ if not imported_chunks:
+ raise ValueError("无法找到可用于重建的导入文本块")
+ rebuilt_doc = await self.upload_document(
+ file_name=doc.doc_name,
+ file_content=None,
+ file_type=doc.file_type,
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ progress_callback=progress_callback,
+ pre_chunked_text=imported_chunks,
+ source_type="import",
+ source_uri=doc.source_uri or doc.doc_name,
+ source_content_hash=build_content_hash(imported_chunks),
+ source_chunker_name=doc.chunker_name or "pre_chunked",
+ parent_doc_id=parent_doc_id,
+ document_version=next_version,
+ skip_duplicate_check=True,
+ )
+ else:
+ raise ValueError("当前仅支持重建已保存原始文件、URL 或导入来源的文档")
+
+ # Replace step: drop the old document now that the new one exists. On
+ # failure, try to remove the new version so only one copy remains.
+ try:
+ await self.delete_document(doc_id)
+ except Exception as exc:
+ try:
+ await self.delete_document(rebuilt_doc.doc_id)
+ except Exception as cleanup_exc:
+ logger.error(
+ f"重建文档 {doc_id} 后清理新版本失败: {cleanup_exc}",
+ )
+ raise KnowledgeBaseUploadError(
+ stage="rebuild",
+ user_message=(
+ "重建失败:新版本已生成,但替换旧文档时失败,已尝试回滚新版本。"
+ ),
+ details={
+ "doc_id": doc_id,
+ "new_doc_id": rebuilt_doc.doc_id,
+ },
+ ) from exc
+ return rebuilt_doc
+
+ async def _get_import_rebuild_chunks(self, doc_id: str) -> list[str]:
+ """Return the stored chunk texts of a document, ordered by chunk index.
+
+ Pages through ``get_chunks_by_doc_id`` in steps of
+ ``DOCUMENT_REBUILD_PAGE_SIZE``. Chunks whose ``content`` is not a string
+ or is blank after stripping are left out of the result.
+ """
+ chunks: list[dict] = []
+ offset = 0
+ while True:
+ page = await self.get_chunks_by_doc_id(
+ doc_id,
+ offset=offset,
+ limit=DOCUMENT_REBUILD_PAGE_SIZE,
+ )
+ if not page:
+ break
+ chunks.extend(page)
+ # A short page means the last page has been reached.
+ if len(page) < DOCUMENT_REBUILD_PAGE_SIZE:
+ break
+ offset += DOCUMENT_REBUILD_PAGE_SIZE
+
+ # A missing or None chunk_index sorts as 0.
+ chunks.sort(key=lambda chunk: int(chunk.get("chunk_index") or 0))
+ return [
+ chunk["content"]
+ for chunk in chunks
+ if isinstance(chunk.get("content"), str) and chunk["content"].strip()
+ ]
+
+ async def rebuild_all_documents(
+ self,
+ *,
+ chunk_size: int | None = None,
+ chunk_overlap: int | None = None,
+ batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE,
+ tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT,
+ max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
+ progress_callback=None,
+ ) -> dict:
+ """Rebuild every document listed for this knowledge base, one at a time.
+
+ The full document list is collected first (paged by
+ ``DOCUMENT_REBUILD_PAGE_SIZE``), then each document is passed to
+ ``rebuild_document``. A failure on one document is logged and recorded
+ and does not stop the remaining ones.
+
+ ``progress_callback``, when given, is awaited as
+ ``progress_callback("rebuilding", done, total)`` before each document and
+ once at the end; it is also forwarded to ``rebuild_document``.
+
+ Returns:
+ A dict with ``rebuilt`` (``model_dump()`` of each new document),
+ ``failed`` (``doc_id``, ``doc_name``, ``error`` per failure),
+ ``total``, ``success_count`` and ``failed_count``.
+ """
+ docs: list[KBDocument] = []
+ offset = 0
+ while True:
+ page = await self.list_documents(
+ offset=offset,
+ limit=DOCUMENT_REBUILD_PAGE_SIZE,
+ )
+ docs.extend(page)
+ # A short page means the last page has been reached.
+ if len(page) < DOCUMENT_REBUILD_PAGE_SIZE:
+ break
+ offset += DOCUMENT_REBUILD_PAGE_SIZE
+
+ rebuilt_docs = []
+ failed_docs = []
+
+ total = len(docs)
+ for index, doc in enumerate(docs, start=1):
+ # Report the number of documents already processed (index - 1).
+ if progress_callback:
+ await progress_callback("rebuilding", index - 1, total)
+ try:
+ rebuilt = await self.rebuild_document(
+ doc.doc_id,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ progress_callback=progress_callback,
+ )
+ rebuilt_docs.append(rebuilt.model_dump())
+ except Exception as e:
+ logger.error(f"重建文档 {doc.doc_id} 失败: {e}")
+ failed_docs.append(
+ {
+ "doc_id": doc.doc_id,
+ "doc_name": doc.doc_name,
+ "error": str(e),
+ },
+ )
+
+ if progress_callback:
+ await progress_callback("rebuilding", total, total)
+
+ return {
+ "rebuilt": rebuilt_docs,
+ "failed": failed_docs,
+ "total": total,
+ "success_count": len(rebuilt_docs),
+ "failed_count": len(failed_docs),
+ }
+
+ async def rebuild_documents(
+ self,
+ doc_ids: list[str],
+ *,
+ chunk_size: int | None = None,
+ chunk_overlap: int | None = None,
+ batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE,
+ tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT,
+ max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
+ progress_callback=None,
+ ) -> dict:
+ """Rebuild the given documents one at a time.
+
+ Duplicate ids are dropped (first occurrence wins, order preserved). Each
+ id is passed to ``rebuild_document``; a failure is logged and recorded
+ and does not stop the remaining ones.
+
+ ``progress_callback``, when given, is awaited as
+ ``progress_callback("rebuilding", done, total)`` before each document and
+ once at the end; it is also forwarded to ``rebuild_document``.
+
+ Returns:
+ A dict with ``rebuilt`` (``model_dump()`` of each new document),
+ ``failed`` (``doc_id``, ``doc_name``, ``error`` per failure),
+ ``total``, ``success_count`` and ``failed_count``.
+ """
+ rebuilt_docs = []
+ failed_docs = []
+ normalized_doc_ids = list(dict.fromkeys(doc_ids))
+
+ total = len(normalized_doc_ids)
+ for index, doc_id in enumerate(normalized_doc_ids, start=1):
+ # Report the number of documents already processed (index - 1).
+ if progress_callback:
+ await progress_callback("rebuilding", index - 1, total)
+ try:
+ rebuilt = await self.rebuild_document(
+ doc_id,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ progress_callback=progress_callback,
+ )
+ rebuilt_docs.append(rebuilt.model_dump())
+ except Exception as e:
+ logger.error(f"重建文档 {doc_id} 失败: {e}")
+ # Look the document up again only to get a display name; the id
+ # itself is used when the document cannot be found.
+ failed_doc = await self.get_document(doc_id)
+ failed_docs.append(
+ {
+ "doc_id": doc_id,
+ "doc_name": failed_doc.doc_name if failed_doc else doc_id,
+ "error": str(e),
+ },
+ )
+
+ if progress_callback:
+ await progress_callback("rebuilding", total, total)
+
+ return {
+ "rebuilt": rebuilt_docs,
+ "failed": failed_docs,
+ "total": total,
+ "success_count": len(rebuilt_docs),
+ "failed_count": len(failed_docs),
+ }
+
+ def _cleanup_document_files(
+ self,
+ doc: KBDocument,
+ media_items: list[KBMedia],
+ ) -> None:
+ """Remove a document's source file and media files from disk.
+
+ Only paths that resolve inside ``kb_files_dir`` or ``kb_medias_dir`` are
+ touched; anything else is logged and skipped. The parent directory of a
+ cleaned-up file is removed as well when it is empty, but the two storage
+ root directories themselves are always kept. Failures are logged as
+ warnings and never raised.
+
+ Args:
+ doc: The document whose ``file_path`` (if any) should be removed.
+ media_items: Media records whose ``file_path`` should be removed.
+ """
+ file_paths: list[Path] = []
+ if doc.file_path:
+ file_paths.append(Path(doc.file_path))
+ file_paths.extend(Path(media.file_path) for media in media_items)
+
+ cleanup_roots = (
+ self.kb_files_dir.resolve(strict=False),
+ self.kb_medias_dir.resolve(strict=False),
+ )
+ for file_path in file_paths:
+ # Resolve first so relative segments or symlinks cannot point the
+ # cleanup at a location outside the knowledge base directories.
+ resolved_path = file_path.resolve(strict=False)
+ if not any(resolved_path.is_relative_to(root) for root in cleanup_roots):
+ logger.warning(
+ f"跳过清理知识库目录外文件: {resolved_path}",
+ )
+ continue
+ try:
+ if resolved_path.exists():
+ resolved_path.unlink()
+ parent = resolved_path.parent
+ # Never remove the storage roots themselves: a file stored directly in
+ # a root would otherwise take the (now empty) root directory with it.
+ if parent not in cleanup_roots and any(
+ parent.is_relative_to(root) for root in cleanup_roots
+ ):
+ try:
+ parent.rmdir()
+ except OSError:
+ # rmdir() fails on a non-empty directory; leaving it is intended.
+ pass
+ except Exception as e:
+ logger.warning(f"清理知识库文件失败 {resolved_path}: {e}")
+
async def delete_chunk(self, chunk_id: str, doc_id: str) -> None:
"""删除单个文本块及其相关数据"""
vec_db: FaissVecDB = self.vec_db # type: ignore
- await vec_db.delete(chunk_id)
+ deleted = await vec_db.delete(chunk_id)
+ if not deleted:
+ raise ValueError(f"无法找到 ID 为 {chunk_id} 的文本块")
await self.kb_db.update_kb_stats(
kb_id=self.kb.kb_id,
vec_db=self.vec_db, # type: ignore
@@ -534,20 +1304,102 @@ async def get_chunks_by_doc_id(
offset=offset,
limit=limit,
)
- result = []
- for chunk in chunks:
- chunk_md = json.loads(chunk["metadata"])
- result.append(
- {
- "chunk_id": chunk["doc_id"],
- "doc_id": chunk_md["kb_doc_id"],
- "kb_id": chunk_md["kb_id"],
- "chunk_index": chunk_md["chunk_index"],
- "content": chunk["text"],
- "char_count": len(chunk["text"]),
- },
+ return [self._format_chunk_response(chunk) for chunk in chunks]
+
+ async def search_chunks_by_doc_id(
+ self,
+ doc_id: str,
+ search: str | None = None,
+ offset: int = 0,
+ limit: int = 100,
+ ) -> tuple[list[dict], int]:
+ """Search or list chunks for one document with a matching total.
+
+ With an empty ``search`` the chunks are listed via
+ ``get_chunks_by_doc_id`` and the total comes from
+ ``get_chunk_count_by_doc_id``. Otherwise the storage backend's
+ ``search_documents`` is called, filtered by ``kb_doc_id``.
+
+ Returns:
+ ``(chunks, total)`` where each chunk is formatted by
+ ``_format_chunk_response``. ``([], 0)`` is returned when the backend
+ has no ``search_documents`` or when it returns ``None``.
+ """
+ if not search:
+ chunks = await self.get_chunks_by_doc_id(
+ doc_id=doc_id,
+ offset=offset,
+ limit=limit,
+ )
+ return chunks, await self.get_chunk_count_by_doc_id(doc_id)
+
+ vec_db: FaissVecDB = self.vec_db # type: ignore
+ # Searching is optional on the storage object; probe for it instead of
+ # assuming it exists.
+ search_documents = getattr(vec_db.document_storage, "search_documents", None)
+ if search_documents is None:
+ return [], 0
+
+ result = await search_documents(
+ search,
+ metadata_filters={"kb_doc_id": doc_id},
+ offset=offset,
+ limit=limit,
+ )
+ if result is None:
+ return [], 0
+ chunks, total = result
+ return [self._format_chunk_response(chunk) for chunk in chunks], total
+
+ @staticmethod
+ def _format_chunk_response(chunk: dict) -> dict:
+ """Convert a stored chunk row into the chunk response dict.
+
+ ``chunk["metadata"]` is parsed with ``json.loads``. The metadata keys
+ ``kb_doc_id``, ``kb_id`` and ``chunk_index`` are required (a missing key
+ raises ``KeyError``); all other metadata fields are optional and default
+ to ``None``.
+ """
+ chunk_md = json.loads(chunk["metadata"])
+ # Fall back to the text length when the metadata carries no char_count.
+ char_count = chunk_md.get("char_count", len(chunk["text"]))
+ return {
+ "chunk_id": chunk["doc_id"],
+ "doc_id": chunk_md["kb_doc_id"],
+ "kb_id": chunk_md["kb_id"],
+ "chunk_index": chunk_md["chunk_index"],
+ "section_index": chunk_md.get("section_index"),
+ "content": chunk["text"],
+ "char_count": char_count,
+ "token_count_estimate": chunk_md.get("token_count_estimate"),
+ "content_hash": chunk_md.get("content_hash"),
+ "start_offset": chunk_md.get("start_offset"),
+ "end_offset": chunk_md.get("end_offset"),
+ "previous_chunk_id": chunk_md.get("previous_chunk_id"),
+ "next_chunk_id": chunk_md.get("next_chunk_id"),
+ "title_path": chunk_md.get("title_path"),
+ "page_number": chunk_md.get("page_number"),
+ "parent_chunk_id": chunk_md.get("parent_chunk_id"),
+ }
+
+ async def get_chunk_by_id(
+ self,
+ chunk_id: str,
+ doc_id: str | None = None,
+ ) -> dict | None:
+ """Fetch a single chunk together with its metadata.
+
+ Returns the chunk formatted by ``_format_chunk_response``, or ``None``
+ when the chunk does not exist or, if ``doc_id`` is given, when the chunk
+ belongs to a different document.
+ """
+ vec_db: FaissVecDB = self.vec_db # type: ignore
+ chunk = await vec_db.document_storage.get_document_by_doc_id(chunk_id)
+ if not chunk:
+ return None
+ formatted_chunk = self._format_chunk_response(chunk)
+ # Hide chunks that exist but are owned by another document.
+ if doc_id and formatted_chunk["doc_id"] != doc_id:
+ return None
+ return formatted_chunk
+
+ async def get_chunk_context(self, chunk_id: str, doc_id: str) -> dict:
+ """获取文本块和相邻上下文块"""
+ current = await self.get_chunk_by_id(chunk_id, doc_id)
+ if not current:
+ raise ValueError(f"无法找到 ID 为 {chunk_id} 的文本块")
+
+ previous_chunk = None
+ next_chunk = None
+ if current.get("previous_chunk_id"):
+ previous_chunk = await self.get_chunk_by_id(
+ current["previous_chunk_id"],
+ doc_id,
+ )
+ if current.get("next_chunk_id"):
+ next_chunk = await self.get_chunk_by_id(
+ current["next_chunk_id"],
+ doc_id,
)
- return result
+
+ return {
+ "previous": previous_chunk,
+ "current": current,
+ "next": next_chunk,
+ }
async def get_chunk_count_by_doc_id(self, doc_id: str) -> int:
"""获取文档的块数量"""
@@ -555,6 +1407,434 @@ async def get_chunk_count_by_doc_id(self, doc_id: str) -> int:
count = await vec_db.count_documents(metadata_filter={"kb_doc_id": doc_id})
return count
+ async def check_consistency(self) -> dict:
+ """Return a read-only consistency report for document metadata and chunks.
+
+ Compares the document records of this knowledge base with the chunks
+ stored for it and with the source files on disk. Nothing is modified.
+
+ Returns:
+ A dict with ``kb_id``, ``kb_name``, ``checked_at`` (UTC, ISO 8601),
+ ``summary`` (counts plus a ``healthy`` flag) and ``issues``, which
+ maps each issue type to a list of issue dicts: ``missing_vectors``,
+ ``orphan_vectors``, ``missing_source_files``,
+ ``chunk_count_mismatches``, ``invalid_vector_metadata`` and
+ ``unsafe_source_paths``.
+ """
+ docs = await self._list_all_documents_for_consistency()
+ doc_by_id = {doc.doc_id: doc for doc in docs}
+ stored_chunks = await self._list_all_chunks_for_consistency()
+
+ chunks_by_doc_id: dict[str, list[dict]] = {}
+ orphan_vectors: list[dict] = []
+ invalid_vector_metadata: list[dict] = []
+
+ # Sort every stored chunk into exactly one bucket: unreadable metadata,
+ # orphan (its document record is gone), or grouped under its document.
+ for chunk in stored_chunks:
+ try:
+ metadata = self._parse_stored_chunk_metadata(chunk)
+ except ValueError as exc:
+ invalid_vector_metadata.append(
+ self._format_vector_issue(chunk, metadata_error=str(exc)),
+ )
+ continue
+
+ doc_id = metadata.get("kb_doc_id")
+ if not isinstance(doc_id, str) or not doc_id:
+ invalid_vector_metadata.append(
+ self._format_vector_issue(
+ chunk,
+ metadata=metadata,
+ metadata_error="missing kb_doc_id",
+ ),
+ )
+ continue
+
+ if doc_id not in doc_by_id:
+ orphan_vectors.append(
+ self._format_vector_issue(chunk, metadata=metadata),
+ )
+ continue
+
+ chunks_by_doc_id.setdefault(doc_id, []).append(chunk)
+
+ # Compare the chunk count recorded on each document with what is stored.
+ # A document that expects chunks but has none is listed under
+ # missing_vectors and, since the counts differ, under
+ # chunk_count_mismatches as well.
+ missing_vectors: list[dict] = []
+ chunk_count_mismatches: list[dict] = []
+ for doc in docs:
+ expected_chunk_count = int(doc.chunk_count or 0)
+ actual_chunk_count = len(chunks_by_doc_id.get(doc.doc_id, []))
+ if expected_chunk_count > 0 and actual_chunk_count == 0:
+ missing_vectors.append(
+ self._format_document_issue(
+ doc,
+ expected_chunk_count=expected_chunk_count,
+ actual_chunk_count=actual_chunk_count,
+ ),
+ )
+ if expected_chunk_count != actual_chunk_count:
+ chunk_count_mismatches.append(
+ self._format_document_issue(
+ doc,
+ expected_chunk_count=expected_chunk_count,
+ actual_chunk_count=actual_chunk_count,
+ ),
+ )
+
+ missing_source_files, unsafe_source_paths, source_file_count = (
+ self._check_source_file_consistency(docs)
+ )
+
+ # Tally documents per status; a falsy status is counted as "unknown".
+ status_counts: dict[str, int] = {}
+ for doc in docs:
+ status = doc.status or "unknown"
+ status_counts[status] = status_counts.get(status, 0) + 1
+
+ issues = {
+ "missing_vectors": missing_vectors,
+ "orphan_vectors": orphan_vectors,
+ "missing_source_files": missing_source_files,
+ "chunk_count_mismatches": chunk_count_mismatches,
+ "invalid_vector_metadata": invalid_vector_metadata,
+ "unsafe_source_paths": unsafe_source_paths,
+ }
+ issue_counts = {name: len(items) for name, items in issues.items()}
+
+ return {
+ "kb_id": self.kb.kb_id,
+ "kb_name": self.kb.kb_name,
+ "checked_at": datetime.now(timezone.utc).isoformat(),
+ "summary": {
+ "sqlite_document_count": len(docs),
+ "ready_document_count": status_counts.get("ready", 0),
+ "failed_document_count": status_counts.get("failed", 0),
+ "document_chunk_count": sum(int(doc.chunk_count or 0) for doc in docs),
+ "indexed_chunk_count": len(stored_chunks),
+ "source_file_count": source_file_count,
+ "status_counts": status_counts,
+ **issue_counts,
+ # Healthy only when every issue list is empty.
+ "healthy": all(count == 0 for count in issue_counts.values()),
+ },
+ "issues": issues,
+ }
+
+ async def repair_consistency(
+ self,
+ repair_types: list[str] | None = None,
+ ) -> dict:
+ """Repair low-risk consistency issues and report skipped unsafe issues.
+
+ Runs ``check_consistency`` first, applies the selected repairs, then runs
+ ``check_consistency`` again so the result shows the state before and
+ after.
+
+ Two repairs are implemented:
+ - ``orphan_vectors``: the stored vectors of each orphaned ``kb_doc_id``
+ are deleted.
+ - ``chunk_count_mismatches``: ``refresh_document`` is called for the
+ document, unless the record expects more chunks than are stored, in
+ which case the issue is skipped because it needs a rebuild.
+
+ Issues of type ``missing_vectors``, ``missing_source_files``,
+ ``invalid_vector_metadata`` and ``unsafe_source_paths`` are never
+ repaired here; they are always listed under ``skipped``.
+
+ Args:
+ repair_types: Repair types to apply; ``None`` selects every type in
+ ``CONSISTENCY_REPAIR_TYPES``.
+
+ Returns:
+ A dict with ``kb_id``, ``kb_name``, ``repaired_at`` (UTC, ISO 8601),
+ ``repair_types``, ``summary``, ``actions`` (``repaired``,
+ ``skipped``, ``failed``), ``pre_check`` and ``post_check``.
+
+ Raises:
+ ValueError: ``repair_types`` contains an unsupported type (raised by
+ ``_normalize_consistency_repair_types``).
+ """
+ selected_repair_types = self._normalize_consistency_repair_types(repair_types)
+ pre_check = await self.check_consistency()
+
+ repaired: list[dict] = []
+ skipped: list[dict] = []
+ failed: list[dict] = []
+
+ if "orphan_vectors" in selected_repair_types:
+ orphan_vectors = pre_check["issues"].get("orphan_vectors", [])
+ # Orphans are removed per owning document id, so collect the distinct
+ # ids; sorting keeps the processing order deterministic.
+ orphan_doc_ids = sorted(
+ {
+ issue.get("doc_id")
+ for issue in orphan_vectors
+ if isinstance(issue.get("doc_id"), str) and issue.get("doc_id")
+ },
+ )
+ for doc_id in orphan_doc_ids:
+ issue_count = sum(
+ 1 for issue in orphan_vectors if issue.get("doc_id") == doc_id
+ )
+ try:
+ await self.vec_db.delete_documents( # type: ignore[attr-defined]
+ metadata_filters={
+ "kb_id": self.kb.kb_id,
+ "kb_doc_id": doc_id,
+ },
+ )
+ repaired.append(
+ {
+ "type": "orphan_vectors",
+ "doc_id": doc_id,
+ "count": issue_count,
+ "action": "deleted_vectors",
+ },
+ )
+ except Exception as exc:
+ failed.append(
+ {
+ "type": "orphan_vectors",
+ "doc_id": doc_id,
+ "count": issue_count,
+ "action": "delete_vectors",
+ "error": str(exc),
+ },
+ )
+
+ if "chunk_count_mismatches" in selected_repair_types:
+ for issue in pre_check["issues"].get("chunk_count_mismatches", []):
+ doc_id = issue.get("doc_id")
+ expected_count = int(issue.get("expected_chunk_count") or 0)
+ actual_count = int(issue.get("actual_chunk_count") or 0)
+ if not isinstance(doc_id, str) or not doc_id:
+ skipped.append(
+ {
+ "type": "chunk_count_mismatches",
+ "reason": "missing_doc_id",
+ "issue": issue,
+ },
+ )
+ continue
+
+ # Fewer stored chunks than recorded means vectors are missing;
+ # that is not fixed here and is reported as needing a rebuild.
+ if expected_count > actual_count:
+ skipped.append(
+ {
+ "type": "chunk_count_mismatches",
+ "doc_id": doc_id,
+ "reason": "missing_vectors_require_rebuild",
+ "expected_chunk_count": expected_count,
+ "actual_chunk_count": actual_count,
+ },
+ )
+ continue
+
+ # NOTE(review): refresh_document is defined outside this view; the
+ # action label below suggests it re-syncs the recorded chunk count
+ # - confirm.
+ try:
+ await self.refresh_document(doc_id)
+ repaired.append(
+ {
+ "type": "chunk_count_mismatches",
+ "doc_id": doc_id,
+ "action": "refreshed_document_chunk_count",
+ "expected_chunk_count": expected_count,
+ "actual_chunk_count": actual_count,
+ },
+ )
+ except Exception as exc:
+ failed.append(
+ {
+ "type": "chunk_count_mismatches",
+ "doc_id": doc_id,
+ "action": "refresh_document",
+ "expected_chunk_count": expected_count,
+ "actual_chunk_count": actual_count,
+ "error": str(exc),
+ },
+ )
+
+ # These issue types have no automatic repair; they are reported as
+ # skipped regardless of which repair types were selected.
+ for issue_type in (
+ "missing_vectors",
+ "missing_source_files",
+ "invalid_vector_metadata",
+ "unsafe_source_paths",
+ ):
+ for issue in pre_check["issues"].get(issue_type, []):
+ skipped.append(
+ {
+ "type": issue_type,
+ "doc_id": issue.get("doc_id"),
+ "chunk_id": issue.get("chunk_id"),
+ "reason": self._get_consistency_repair_skip_reason(
+ issue_type,
+ ),
+ "issue": issue,
+ },
+ )
+
+ # Recompute statistics whenever a repair was attempted, even if it failed.
+ if repaired or failed:
+ await self.kb_db.update_kb_stats(
+ kb_id=self.kb.kb_id,
+ vec_db=self.vec_db, # type: ignore
+ )
+ await self.refresh_kb()
+
+ post_check = await self.check_consistency()
+ return {
+ "kb_id": self.kb.kb_id,
+ "kb_name": self.kb.kb_name,
+ "repaired_at": datetime.now(timezone.utc).isoformat(),
+ "repair_types": selected_repair_types,
+ "summary": {
+ "repaired_count": len(repaired),
+ "skipped_count": len(skipped),
+ "failed_count": len(failed),
+ "healthy_after_repair": post_check["summary"]["healthy"],
+ },
+ "actions": {
+ "repaired": repaired,
+ "skipped": skipped,
+ "failed": failed,
+ },
+ "pre_check": pre_check,
+ "post_check": post_check,
+ }
+
+ @staticmethod
+ def _normalize_consistency_repair_types(
+ repair_types: list[str] | None,
+ ) -> list[str]:
+ """Validate and normalise the requested consistency repair types.
+
+ ``None`` selects every supported type (sorted). Otherwise entries are
+ stripped, non-string and blank entries are dropped, and duplicates are
+ removed while keeping first-seen order.
+
+ Raises:
+ ValueError: An entry is not in ``CONSISTENCY_REPAIR_TYPES``.
+ """
+ if repair_types is None:
+ return sorted(CONSISTENCY_REPAIR_TYPES)
+
+ normalized = list(
+ dict.fromkeys(
+ repair_type.strip()
+ for repair_type in repair_types
+ if isinstance(repair_type, str) and repair_type.strip()
+ ),
+ )
+ invalid_types = sorted(set(normalized) - CONSISTENCY_REPAIR_TYPES)
+ if invalid_types:
+ raise ValueError(
+ f"不支持的一致性修复类型: {', '.join(invalid_types)}",
+ )
+ return normalized
+
+ @staticmethod
+ def _get_consistency_repair_skip_reason(issue_type: str) -> str:
+ """Return the reason code reported for an issue type that is not repaired.
+
+ Unknown issue types map to ``"manual_action_required"``.
+ """
+ skip_reasons = {
+ "missing_vectors": "document_rebuild_required",
+ "missing_source_files": "source_file_missing_manual_action_required",
+ "invalid_vector_metadata": "invalid_metadata_manual_action_required",
+ "unsafe_source_paths": "unsafe_source_path_manual_action_required",
+ }
+ return skip_reasons.get(issue_type, "manual_action_required")
+
+ async def _list_all_documents_for_consistency(self) -> list[KBDocument]:
+ """Return every listed document, paged by ``CONSISTENCY_CHECK_PAGE_SIZE``."""
+ return await self._collect_paginated_documents(
+ page_size=CONSISTENCY_CHECK_PAGE_SIZE,
+ )
+
+ async def _list_all_chunks_for_consistency(self) -> list[dict]:
+ """Return every stored chunk row of this knowledge base.
+
+ Pages by ``CONSISTENCY_CHECK_PAGE_SIZE``; ``ValueError`` is raised by
+ ``_collect_paginated_vector_documents`` when the storage backend cannot
+ list documents.
+ """
+ return await self._collect_paginated_vector_documents(
+ page_size=CONSISTENCY_CHECK_PAGE_SIZE,
+ unsupported_message="当前知识库存储后端不支持一致性检查",
+ )
+
+ @staticmethod
+ def _parse_stored_chunk_metadata(chunk: dict) -> dict:
+ """Return the metadata of a stored chunk as a dict.
+
+ A missing or ``None`` ``metadata`` value yields ``{}``; a dict is
+ returned as-is; anything else is parsed as JSON.
+
+ Raises:
+ ValueError: The value is not valid JSON, or the parsed JSON is not an
+ object.
+ """
+ raw_metadata = chunk.get("metadata")
+ if raw_metadata is None:
+ return {}
+ if isinstance(raw_metadata, dict):
+ return raw_metadata
+ try:
+ metadata = json.loads(raw_metadata)
+ except (TypeError, json.JSONDecodeError) as exc:
+ raise ValueError("invalid metadata JSON") from exc
+ if not isinstance(metadata, dict):
+ raise ValueError("metadata must be a JSON object")
+ return metadata
+
+ @staticmethod
+ def _format_vector_issue(
+ chunk: dict,
+ *,
+ metadata: dict | None = None,
+ metadata_error: str | None = None,
+ ) -> dict:
+ """Build the issue dict that describes one problematic stored chunk.
+
+ Always contains ``chunk_id`` and ``storage_id``. ``doc_id``, ``kb_id`` and
+ ``chunk_index`` are added only when ``metadata`` is a non-empty dict, and
+ ``metadata_error`` only when an error message is given.
+ """
+ issue = {
+ "chunk_id": chunk.get("doc_id"),
+ "storage_id": chunk.get("id"),
+ }
+ if metadata:
+ issue.update(
+ {
+ "doc_id": metadata.get("kb_doc_id"),
+ "kb_id": metadata.get("kb_id"),
+ "chunk_index": metadata.get("chunk_index"),
+ },
+ )
+ if metadata_error:
+ issue["metadata_error"] = metadata_error
+ return issue
+
+ @staticmethod
+ def _format_document_issue(
+ doc: KBDocument,
+ *,
+ expected_chunk_count: int | None = None,
+ actual_chunk_count: int | None = None,
+ reason: str | None = None,
+ ) -> dict:
+ """Build the issue dict that describes one problematic document.
+
+ Always contains ``doc_id``, ``doc_name``, ``status``, ``source_type`` and
+ ``file_path``. The chunk counts are added when they are not ``None``
+ (so a count of 0 is kept); ``reason`` is added only when non-empty.
+ """
+ issue = {
+ "doc_id": doc.doc_id,
+ "doc_name": doc.doc_name,
+ "status": doc.status,
+ "source_type": doc.source_type,
+ "file_path": doc.file_path,
+ }
+ if expected_chunk_count is not None:
+ issue["expected_chunk_count"] = expected_chunk_count
+ if actual_chunk_count is not None:
+ issue["actual_chunk_count"] = actual_chunk_count
+ if reason:
+ issue["reason"] = reason
+ return issue
+
+ def _check_source_file_consistency(
+ self,
+ docs: list[KBDocument],
+ ) -> tuple[list[dict], list[dict], int]:
+ """Check the on-disk source files of documents with source type ``file``.
+
+ Documents of any other source type are ignored.
+
+ Returns:
+ ``(missing_source_files, unsafe_source_paths, source_file_count)``:
+ issue dicts for files that are missing (reason ``empty_file_path``
+ or ``not_found``), issue dicts for paths that resolve outside
+ ``kb_files_dir`` (reason ``outside_kb_files_dir``), and the number
+ of source files that were found on disk.
+ """
+ missing_source_files: list[dict] = []
+ unsafe_source_paths: list[dict] = []
+ source_file_count = 0
+ files_root = self.kb_files_dir.resolve(strict=False)
+
+ for doc in docs:
+ if doc.source_type != "file":
+ continue
+
+ # An empty file_path is reported for documents whose status is "ready".
+ if not doc.file_path:
+ if doc.status == "ready":
+ missing_source_files.append(
+ self._format_document_issue(doc, reason="empty_file_path"),
+ )
+ continue
+
+ # Resolve before the containment check so relative segments cannot
+ # make an outside path look like it is inside kb_files_dir.
+ file_path = Path(doc.file_path).resolve(strict=False)
+ if not file_path.is_relative_to(files_root):
+ unsafe_source_paths.append(
+ self._format_document_issue(
+ doc,
+ reason="outside_kb_files_dir",
+ ),
+ )
+ continue
+ if file_path.exists():
+ source_file_count += 1
+ else:
+ missing_source_files.append(
+ self._format_document_issue(doc, reason="not_found"),
+ )
+
+ return missing_source_files, unsafe_source_paths, source_file_count
+
+ async def _collect_paginated_documents(self, *, page_size: int) -> list[KBDocument]:
+ """Collect all documents by calling ``list_documents`` page by page.
+
+ Paging stops at the first page that holds fewer than ``page_size``
+ entries.
+ """
+ docs: list[KBDocument] = []
+ offset = 0
+ while True:
+ page = await self.list_documents(
+ offset=offset,
+ limit=page_size,
+ )
+ docs.extend(page)
+ if len(page) < page_size:
+ break
+ offset += page_size
+ return docs
+
+ async def _collect_paginated_vector_documents(
+ self,
+ *,
+ page_size: int,
+ unsupported_message: str,
+ ) -> list[dict]:
+ """Collect all stored chunk rows of this knowledge base page by page.
+
+ Calls ``vec_db.document_storage.get_documents`` filtered by ``kb_id``
+ until a page holds fewer than ``page_size`` entries.
+
+ Raises:
+ ValueError: With ``unsupported_message`` when the backend has no
+ ``get_documents`` or when calling it does not return an awaitable.
+ """
+ document_storage = getattr(self.vec_db, "document_storage", None)
+ get_documents = getattr(document_storage, "get_documents", None)
+ if get_documents is None:
+ raise ValueError(unsupported_message)
+
+ chunks: list[dict] = []
+ offset = 0
+ while True:
+ page_result = get_documents(
+ metadata_filters={"kb_id": self.kb.kb_id},
+ offset=offset,
+ limit=page_size,
+ )
+ # Only backends whose get_documents returns an awaitable are supported.
+ if not hasattr(page_result, "__await__"):
+ raise ValueError(unsupported_message)
+ page = await page_result
+ chunks.extend(page)
+ if len(page) < page_size:
+ break
+ offset += page_size
+ return chunks
+
async def _save_media(
self,
doc_id: str,
@@ -589,14 +1869,17 @@ async def _save_media(
async def upload_from_url(
self,
url: str,
- chunk_size: int = 512,
- chunk_overlap: int = 50,
- batch_size: int = 32,
- tasks_limit: int = 3,
- max_retries: int = 3,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+ chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+ batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE,
+ tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT,
+ max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
progress_callback=None,
enable_cleaning: bool = False,
cleaning_provider_id: str | None = None,
+ parent_doc_id: str | None = None,
+ document_version: int = 1,
+ skip_duplicate_check: bool = False,
) -> KBDocument:
"""从 URL 上传并处理文档(带原子性保证和失败清理)
Args:
@@ -616,52 +1899,100 @@ async def upload_from_url(
ValueError: 如果 URL 为空或无法提取内容
IOError: 如果网络请求失败
"""
- # 获取 Tavily API 密钥
- config = self.prov_mgr.acm.default_conf
- tavily_keys = config.get("provider_settings", {}).get(
- "websearch_tavily_key", []
- )
- if not tavily_keys:
- raise ValueError(
- "Error: Tavily API key is not configured in provider_settings."
+ text_content: str | None = None
+ try:
+ # 获取 Tavily API 密钥
+ config = self.prov_mgr.acm.default_conf
+ tavily_keys = config.get("provider_settings", {}).get(
+ "websearch_tavily_key", []
)
+ if not tavily_keys:
+ raise KnowledgeBaseUploadError(
+ stage="configuration",
+ user_message=(
+ "URL 导入失败:Tavily API key 未配置。"
+ "请先在 provider_settings 中配置 websearch_tavily_key。"
+ ),
+ details={"url": url},
+ )
- # 阶段1: 从 URL 提取内容
- if progress_callback:
- await progress_callback("extracting", 0, 100)
+ # 阶段1: 从 URL 提取内容
+ if progress_callback:
+ await progress_callback("extracting", 0, 100)
- try:
- text_content = await extract_text_from_url(url, tavily_keys)
- except Exception as e:
- logger.error(f"Failed to extract content from URL {url}: {e}")
- raise OSError(f"Failed to extract content from URL {url}: {e}") from e
+ try:
+ text_content = await extract_text_from_url(url, tavily_keys)
+ except KnowledgeBaseUploadError:
+ raise
+ except Exception as e:
+ logger.error(f"Failed to extract content from URL {url}: {e}")
+ raise KnowledgeBaseUploadError(
+ stage="extracting",
+ user_message=(
+ "URL 导入失败:无法提取网页内容。"
+ "请确认 URL 可访问且 Tavily 配置有效。"
+ ),
+ details={"url": url},
+ ) from e
- if not text_content:
- raise ValueError(f"No content extracted from URL: {url}")
+ if not text_content or not text_content.strip():
+ raise KnowledgeBaseUploadError(
+ stage="extracting",
+ user_message=(
+ "URL 导入失败:未能从网页中提取可索引文本。"
+ "请确认页面存在正文内容,或尝试更换 URL。"
+ ),
+ details={"url": url},
+ )
- if progress_callback:
- await progress_callback("extracting", 100, 100)
+ if progress_callback:
+ await progress_callback("extracting", 100, 100)
- # 阶段2: (可选)清洗内容并分块
- final_chunks = await self._clean_and_rechunk_content(
- content=text_content,
- url=url,
- progress_callback=progress_callback,
- enable_cleaning=enable_cleaning,
- cleaning_provider_id=cleaning_provider_id,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
- )
+ # 阶段2: (可选)清洗内容并分块
+ try:
+ final_chunks = await self._clean_and_rechunk_content(
+ content=text_content,
+ url=url,
+ progress_callback=progress_callback,
+ enable_cleaning=enable_cleaning,
+ cleaning_provider_id=cleaning_provider_id,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ )
+ except KnowledgeBaseUploadError:
+ raise
+ except Exception as e:
+ stage = "cleaning" if enable_cleaning else "chunking"
+ raise KnowledgeBaseUploadError(
+ stage=stage,
+ user_message=(
+ "URL 导入失败:网页内容切分失败。"
+ "请稍后重试,或调整分块参数后再次导入。"
+ ),
+ details={"url": url},
+ ) from e
- if enable_cleaning and not final_chunks:
- raise ValueError(
- "内容清洗后未提取到有效文本。请尝试关闭内容清洗功能,或更换更高性能的LLM模型后重试。"
+ if enable_cleaning and not final_chunks:
+ raise KnowledgeBaseUploadError(
+ stage="cleaning",
+ user_message=(
+ "URL 导入失败:内容清洗后未提取到有效文本。"
+ "请尝试关闭内容清洗功能,或更换更高性能的 LLM 模型后重试。"
+ ),
+ details={"url": url},
+ )
+ except Exception as e:
+ await self._persist_failed_url_document(
+ url=url,
+ text_content=text_content,
+ parent_doc_id=parent_doc_id,
+ document_version=document_version,
+ error=e,
)
+ raise
# 创建一个虚拟文件名
- file_name = url.split("/")[-1] or f"document_from_{url}"
- if not Path(file_name).suffix:
- file_name += ".url"
+ file_name = self._build_url_file_name(url)
# 复用现有的 upload_document 方法,但传入预分块文本
return await self.upload_document(
@@ -675,6 +2006,14 @@ async def upload_from_url(
max_retries=max_retries,
progress_callback=progress_callback,
pre_chunked_text=final_chunks,
+ source_type="url",
+ source_uri=url,
+ source_content_hash=build_content_hash(text_content),
+ source_parser_name=URLExtractor.__name__,
+ source_chunker_name=get_chunker_name(self.chunker),
+ parent_doc_id=parent_doc_id,
+ document_version=document_version,
+ skip_duplicate_check=skip_duplicate_check,
)
async def _clean_and_rechunk_content(
@@ -685,8 +2024,8 @@ async def _clean_and_rechunk_content(
enable_cleaning: bool = False,
cleaning_provider_id: str | None = None,
repair_max_rpm: int = 60,
- chunk_size: int = 512,
- chunk_overlap: int = 50,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+ chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> list[str]:
"""
对从 URL 获取的内容进行清洗、修复、翻译和重新分块。
diff --git a/astrbot/core/knowledge_base/kb_mgr.py b/astrbot/core/knowledge_base/kb_mgr.py
index 3285d42c79..dc1dab016e 100644
--- a/astrbot/core/knowledge_base/kb_mgr.py
+++ b/astrbot/core/knowledge_base/kb_mgr.py
@@ -1,22 +1,80 @@
+import asyncio
+import time
from pathlib import Path
+from typing import TYPE_CHECKING
+
+from sqlalchemy import delete
+from sqlmodel import col
from astrbot.core import logger
-from astrbot.core.provider.manager import ProviderManager
from astrbot.core.utils.astrbot_path import get_astrbot_knowledge_base_path
# from .chunking.fixed_size import FixedSizeChunker
+from .capabilities import (
+ DEFAULT_CHUNK_OVERLAP,
+ DEFAULT_CHUNK_SIZE,
+ DEFAULT_INDEX_TYPE,
+ DEFAULT_TOP_K_DENSE,
+ DEFAULT_TOP_K_SPARSE,
+ DEFAULT_TOP_M_FINAL,
+ DEFAULT_UPLOAD_BATCH_SIZE,
+ DEFAULT_UPLOAD_MAX_RETRIES,
+ DEFAULT_UPLOAD_TASKS_LIMIT,
+)
from .chunking.recursive import RecursiveCharacterChunker
from .kb_db_sqlite import KBSQLiteDatabase
from .kb_helper import KBHelper
-from .models import KBDocument, KnowledgeBase
+from .models import (
+ KBDocument,
+ KBMedia,
+ KnowledgeBase,
+)
from .retrieval.manager import RetrievalManager, RetrievalResult
from .retrieval.rank_fusion import RankFusion
from .retrieval.sparse_retriever import SparseRetriever
+if TYPE_CHECKING:
+ from astrbot.core.provider.manager import ProviderManager
+
FILES_PATH = get_astrbot_knowledge_base_path()
DB_PATH = Path(FILES_PATH) / "kb.db"
"""Knowledge Base storage root directory"""
CHUNKER = RecursiveCharacterChunker()
+_UNSET = object()  # sentinel: lets update_kb tell "argument omitted" apart from an explicit None
+INIT_RETRY_COOLDOWN_SECONDS = 60.0  # minimum time between two re-initialization attempts of a failed helper
+INIT_RETRY_MAX_ATTEMPTS = 3  # re-initialization is no longer attempted once this many retries were made
+VALID_INDEX_TYPES = {"flat", "hnsw"}  # allow-list checked by _validate_kb_options
+
+
+def _validate_kb_options(  # Validate KB chunking/retrieval options; a None value means "not supplied" and is skipped.
+ *,
+ chunk_size: int | None,
+ chunk_overlap: int | None,
+ top_k_dense: int | None,
+ top_k_sparse: int | None,
+ top_m_final: int | None,
+ index_type: str | None,
+) -> None:  # returns nothing; raises ValueError on the first rule that is violated
+ if chunk_size is not None and chunk_size <= 0:
+ raise ValueError("chunk_size 必须大于 0")
+ if chunk_overlap is not None and chunk_overlap < 0:  # zero overlap is allowed
+ raise ValueError("chunk_overlap 不能为负数")
+ if (  # cross-field rule, only checked when both values are present
+ chunk_size is not None
+ and chunk_overlap is not None
+ and chunk_overlap >= chunk_size
+ ):
+ raise ValueError("chunk_overlap 必须小于 chunk_size")
+ if top_k_dense is not None and top_k_dense <= 0:
+ raise ValueError("top_k_dense 必须大于 0")
+ if top_k_sparse is not None and top_k_sparse <= 0:
+ raise ValueError("top_k_sparse 必须大于 0")
+ if top_m_final is not None and top_m_final <= 0:
+ raise ValueError("top_m_final 必须大于 0")
+ if index_type is not None and index_type not in VALID_INDEX_TYPES:
+ raise ValueError(
+ f"index_type 必须是 {', '.join(sorted(VALID_INDEX_TYPES))} 之一"  # sorted() keeps the message stable across runs
+ )
class KnowledgeBaseManager:
@@ -25,13 +83,86 @@ class KnowledgeBaseManager:
def __init__(
self,
- provider_manager: ProviderManager,
+ provider_manager: "ProviderManager",  # quoted annotation: ProviderManager is imported only under TYPE_CHECKING
) -> None:
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
self.provider_manager = provider_manager
self._session_deleted_callback_registered = False
self.kb_insts: dict[str, KBHelper] = {}
+ self._kb_name_index: dict[str, str] = {}  # kb_name -> kb_id lookup used by get_kb_by_name
+ self._kb_instances_lock = asyncio.Lock()  # acquired (via _ensure_kb_instances_lock) by create/get/delete/list/update
+
+ def _ensure_kb_name_index(self) -> None:  # Bring the kb_name -> kb_id index in line with kb_insts.
+ if not hasattr(self, "kb_insts"):  # attributes are created lazily if absent — presumably for instances built without __init__ (TODO confirm)
+ self.kb_insts = {}
+ if not hasattr(self, "_kb_name_index"):
+ self._kb_name_index = {}
+ known_ids = set(self.kb_insts)
+ self._kb_name_index = {  # drop entries whose kb_id no longer has a helper
+ name: kb_id
+ for name, kb_id in self._kb_name_index.items()
+ if kb_id in known_ids
+ }
+ for kb_id, kb_helper in self.kb_insts.items():  # (re)register every helper under its current name
+ self._kb_name_index[kb_helper.kb.kb_name] = kb_id  # NOTE(review): an older name of a still-known kb_id is not removed here — confirm intended
+
+ def _ensure_kb_instances_lock(self) -> asyncio.Lock:  # Return the instances lock, creating it on first use if missing.
+ if not hasattr(self, "_kb_instances_lock"):  # covers objects on which __init__ did not set the attribute
+ self._kb_instances_lock = asyncio.Lock()
+ return self._kb_instances_lock
+
+ def _set_kb_instance(self, kb_id: str, kb_helper: KBHelper) -> None:  # Register/replace the helper for kb_id and re-point the name index at it.
+ self._ensure_kb_name_index()
+ self.kb_insts[kb_id] = kb_helper
+ self._kb_name_index = {  # remove every name currently mapped to this kb_id
+ name: indexed_kb_id
+ for name, indexed_kb_id in self._kb_name_index.items()
+ if indexed_kb_id != kb_id
+ }
+ self._kb_name_index[kb_helper.kb.kb_name] = kb_id  # then index only the helper's current name
+
+ def _get_kb_unlocked(self, kb_id: str) -> KBHelper | None:  # Plain dict lookup without taking the lock; used by callers that already hold it.
+ if not hasattr(self, "kb_insts"):
+ self.kb_insts = {}
+ return self.kb_insts.get(kb_id)  # None when kb_id is unknown
+
+ def _can_retry_helper_init(self, kb_helper: KBHelper) -> bool:  # True when a failed helper may be re-initialized right now.
+ if not kb_helper.init_error:  # no recorded failure -> nothing to retry
+ return False
+ retry_count = getattr(kb_helper, "init_retry_count", 0)  # getattr default: the attribute may not have been set on the helper
+ if retry_count >= INIT_RETRY_MAX_ATTEMPTS:
+ return False
+ last_retry_at = getattr(kb_helper, "last_init_retry_at", 0.0)
+ return time.monotonic() - last_retry_at >= INIT_RETRY_COOLDOWN_SECONDS  # cooldown is measured on the monotonic clock
+
+ async def _retry_helper_init_if_due(self, kb_helper: KBHelper) -> None:  # Re-run kb_helper.initialize() when _can_retry_helper_init allows it.
+ if not self._can_retry_helper_init(kb_helper):
+ return
+
+ kb_helper.init_retry_count = getattr(kb_helper, "init_retry_count", 0) + 1  # count the attempt before awaiting
+ kb_helper.last_init_retry_at = time.monotonic()  # starts the cooldown window for the next attempt
+ try:
+ await kb_helper.initialize()
+ kb_helper.init_error = None  # success: clear the error and reset the retry bookkeeping
+ kb_helper.init_retry_count = 0
+ kb_helper.last_init_retry_at = 0.0
+ except Exception as e:  # failure is recorded and logged, not re-raised
+ kb_helper.init_error = str(e)
+ logger.warning(
+ f"知识库 {kb_helper.kb.kb_name}({kb_helper.kb.kb_id}) "
+ f"第 {kb_helper.init_retry_count} 次重新初始化失败: {e}",
+ exc_info=True,
+ )
+
+ def _remove_kb_instance(self, kb_id: str) -> None:  # Forget the helper for kb_id and every name that maps to it.
+ self._ensure_kb_name_index()
+ self.kb_insts.pop(kb_id, None)  # default None: an unknown kb_id is not an error
+ self._kb_name_index = {
+ name: indexed_kb_id
+ for name, indexed_kb_id in self._kb_name_index.items()
+ if indexed_kb_id != kb_id
+ }
async def initialize(self) -> None:
"""初始化知识库模块"""
@@ -76,11 +207,13 @@ async def load_kbs(self) -> None:
await kb_helper.initialize()
except Exception as e:
kb_helper.init_error = str(e)
+ kb_helper.init_retry_count = 0
+ kb_helper.last_init_retry_at = time.monotonic()
logger.error(
f"知识库 {record.kb_name}({record.kb_id}) 初始化失败: {e}",
exc_info=True,
)
- self.kb_insts[record.kb_id] = kb_helper
+ self._set_kb_instance(record.kb_id, kb_helper)
async def create_kb(
self,
@@ -94,206 +227,335 @@ async def create_kb(
top_k_dense: int | None = None,
top_k_sparse: int | None = None,
top_m_final: int | None = None,
+ index_type: str | None = None,
) -> KBHelper:
"""创建新的知识库实例"""
if embedding_provider_id is None:
raise ValueError("创建知识库时必须提供embedding_provider_id")
+ effective_chunk_size = (
+ chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE
+ )
+ effective_chunk_overlap = (
+ chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+ )
+ effective_top_k_dense = (
+ top_k_dense if top_k_dense is not None else DEFAULT_TOP_K_DENSE
+ )
+ effective_top_k_sparse = (
+ top_k_sparse if top_k_sparse is not None else DEFAULT_TOP_K_SPARSE
+ )
+ effective_top_m_final = (
+ top_m_final if top_m_final is not None else DEFAULT_TOP_M_FINAL
+ )
+ effective_index_type = (
+ index_type if index_type is not None else DEFAULT_INDEX_TYPE
+ )
+ _validate_kb_options(
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ top_k_dense=effective_top_k_dense,
+ top_k_sparse=effective_top_k_sparse,
+ top_m_final=effective_top_m_final,
+ index_type=effective_index_type,
+ )
kb = KnowledgeBase(
kb_name=kb_name,
description=description,
emoji=emoji or "📚",
embedding_provider_id=embedding_provider_id,
rerank_provider_id=rerank_provider_id,
- chunk_size=chunk_size if chunk_size is not None else 512,
- chunk_overlap=chunk_overlap if chunk_overlap is not None else 50,
- top_k_dense=top_k_dense if top_k_dense is not None else 50,
- top_k_sparse=top_k_sparse if top_k_sparse is not None else 50,
- top_m_final=top_m_final if top_m_final is not None else 5,
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ top_k_dense=effective_top_k_dense,
+ top_k_sparse=effective_top_k_sparse,
+ top_m_final=effective_top_m_final,
+ index_type=effective_index_type,
)
+ kb_helper: KBHelper | None = None
try:
- async with self.kb_db.get_db() as session:
- session.add(kb)
- await session.flush()
-
- kb_helper = KBHelper(
- kb_db=self.kb_db,
- kb=kb,
- provider_manager=self.provider_manager,
- kb_root_dir=FILES_PATH,
- chunker=CHUNKER,
- )
- await kb_helper.initialize()
- await session.commit()
- self.kb_insts[kb.kb_id] = kb_helper
- return kb_helper
+ async with self._ensure_kb_instances_lock():
+ async with self.kb_db.get_db() as session:
+ session.add(kb)
+ await session.flush()
+
+ kb_helper = KBHelper(
+ kb_db=self.kb_db,
+ kb=kb,
+ provider_manager=self.provider_manager,
+ kb_root_dir=FILES_PATH,
+ chunker=CHUNKER,
+ )
+ await kb_helper.initialize()
+ await session.commit()
+ self._set_kb_instance(kb.kb_id, kb_helper)
+ return kb_helper
except Exception as e:
+ if kb_helper is not None:
+ try:
+ await kb_helper.delete_vec_db()
+ except Exception as cleanup_err:
+ logger.warning(
+ f"创建知识库 {kb_name} 失败后清理文件目录失败: {cleanup_err}",
+ )
if "kb_name" in str(e):
raise ValueError(f"知识库名称 '{kb_name}' 已存在")
raise
async def get_kb(self, kb_id: str) -> KBHelper | None:
"""获取知识库实例"""
- if kb_id in self.kb_insts:
- return self.kb_insts[kb_id]
+ async with self._ensure_kb_instances_lock():  # look up the helper for kb_id under the instances lock
+ kb_helper = self._get_kb_unlocked(kb_id)
+ if kb_helper is not None:
+ await self._retry_helper_init_if_due(kb_helper)  # a helper with init_error may be re-initialized here
+ return kb_helper  # None when kb_id is unknown
async def get_kb_by_name(self, kb_name: str) -> KBHelper | None:
"""通过名称获取知识库实例"""
- for kb_helper in self.kb_insts.values():
- if kb_helper.kb.kb_name == kb_name:
- return kb_helper
- return None
+ async with self._ensure_kb_instances_lock():  # resolve kb_name through the name index instead of scanning all helpers
+ self._ensure_kb_name_index()
+ kb_id = self._kb_name_index.get(kb_name)
+ if kb_id:
+ return self.kb_insts.get(kb_id)  # NOTE(review): unlike get_kb, no init retry is attempted on this path — confirm intended
+ return None
async def delete_kb(self, kb_id: str) -> bool:
"""删除知识库实例"""
- kb_helper = await self.get_kb(kb_id)
- if not kb_helper:
- return False
+ async with self._ensure_kb_instances_lock():
+ kb_helper = self._get_kb_unlocked(kb_id)  # unlocked lookup: the lock is already held here
+ if not kb_helper:
+ return False
- await kb_helper.delete_vec_db()
- async with self.kb_db.get_db() as session:
- await session.delete(kb_helper.kb)
- await session.commit()
+ async with self.kb_db.get_db() as session:  # database rows are removed first: media, documents, then the KB row
+ await session.execute(
+ delete(KBMedia).where(col(KBMedia.kb_id) == kb_id)
+ )
+ await session.execute(
+ delete(KBDocument).where(col(KBDocument.kb_id) == kb_id)
+ )
+ await session.execute(
+ delete(KnowledgeBase).where(col(KnowledgeBase.kb_id) == kb_id)
+ )
+ await session.commit()
- self.kb_insts.pop(kb_id, None)
- return True
+ try:
+ await kb_helper.delete_vec_db()
+ except Exception as e:  # cleanup failure after the commit is only logged; deletion still reports success
+ logger.warning(
+ f"知识库 {kb_id} 数据库记录已删除,但文件目录清理失败: {e}"
+ )
+
+ self._remove_kb_instance(kb_id)
+ return True
async def list_kbs(self) -> list[KnowledgeBase]:
"""列出所有知识库实例"""
- kbs = [kb_helper.kb for kb_helper in self.kb_insts.values()]
- return kbs
+ async with self._ensure_kb_instances_lock():  # snapshot the KB records while holding the instances lock
+ kbs = [kb_helper.kb for kb_helper in self.kb_insts.values()]
+ return kbs
async def update_kb(
self,
kb_id: str,
- kb_name: str,
+ kb_name: str | None = None,
description: str | None = None,
emoji: str | None = None,
embedding_provider_id: str | None = None,
- rerank_provider_id: str | None = None,
+ rerank_provider_id: str | None | object = _UNSET,  # _UNSET = leave unchanged; an explicit None is stored as the new value
chunk_size: int | None = None,
chunk_overlap: int | None = None,
top_k_dense: int | None = None,
top_k_sparse: int | None = None,
top_m_final: int | None = None,
+ index_type: str | None = None,
) -> KBHelper | None:
"""更新知识库实例"""
- kb_helper = await self.get_kb(kb_id)
- if not kb_helper:
- return None
-
- kb = kb_helper.kb
- previous_state = {
- "kb_name": kb.kb_name,
- "description": kb.description,
- "emoji": kb.emoji,
- "embedding_provider_id": kb.embedding_provider_id,
- "rerank_provider_id": kb.rerank_provider_id,
- "chunk_size": kb.chunk_size,
- "chunk_overlap": kb.chunk_overlap,
- "top_k_dense": kb.top_k_dense,
- "top_k_sparse": kb.top_k_sparse,
- "top_m_final": kb.top_m_final,
- }
- previous_init_error = kb_helper.init_error
-
- if kb_name is not None:
- kb.kb_name = kb_name
- if description is not None:
- kb.description = description
- if emoji is not None:
- kb.emoji = emoji
- if embedding_provider_id is not None:
- kb.embedding_provider_id = embedding_provider_id
- kb.rerank_provider_id = rerank_provider_id # 允许设置为 None
- if chunk_size is not None:
- kb.chunk_size = chunk_size
- if chunk_overlap is not None:
- kb.chunk_overlap = chunk_overlap
- if top_k_dense is not None:
- kb.top_k_dense = top_k_dense
- if top_k_sparse is not None:
- kb.top_k_sparse = top_k_sparse
- if top_m_final is not None:
- kb.top_m_final = top_m_final
-
- # Build a new helper first. Keep current vec_db alive until new init succeeds.
- new_helper = KBHelper(
- kb_db=self.kb_db,
- kb=kb,
- provider_manager=self.provider_manager,
- kb_root_dir=FILES_PATH,
- chunker=CHUNKER,
- )
-
- try:
- await new_helper.initialize()
- except Exception as e:
- # Roll back in-memory settings and keep current helper available.
- kb.kb_name = previous_state["kb_name"]
- kb.description = previous_state["description"]
- kb.emoji = previous_state["emoji"]
- kb.embedding_provider_id = previous_state["embedding_provider_id"]
- kb.rerank_provider_id = previous_state["rerank_provider_id"]
- kb.chunk_size = previous_state["chunk_size"]
- kb.chunk_overlap = previous_state["chunk_overlap"]
- kb.top_k_dense = previous_state["top_k_dense"]
- kb.top_k_sparse = previous_state["top_k_sparse"]
- kb.top_m_final = previous_state["top_m_final"]
- kb_helper.init_error = previous_init_error
- logger.error(
- f"知识库 {kb.kb_name}({kb.kb_id}) 重新初始化失败,继续使用旧实例: {e}",
- exc_info=True,
+ async with self._ensure_kb_instances_lock():  # the whole update runs while holding the instances lock
+ kb_helper = self._get_kb_unlocked(kb_id)
+ if not kb_helper:
+ return None
+
+ kb = kb_helper.kb
+ previous_state = {
+ "kb_name": kb.kb_name,
+ "description": kb.description,
+ "emoji": kb.emoji,
+ "embedding_provider_id": kb.embedding_provider_id,
+ "rerank_provider_id": kb.rerank_provider_id,
+ "chunk_size": kb.chunk_size,
+ "chunk_overlap": kb.chunk_overlap,
+ "top_k_dense": kb.top_k_dense,
+ "top_k_sparse": kb.top_k_sparse,
+ "top_m_final": kb.top_m_final,
+ "index_type": kb.index_type,
+ }
+ previous_init_error = kb_helper.init_error
+
+ candidate_state = previous_state.copy()  # changes are staged on a copy so validation happens before kb is mutated
+ if kb_name is not None:
+ candidate_state["kb_name"] = kb_name
+ if description is not None:
+ candidate_state["description"] = description
+ if emoji is not None:
+ candidate_state["emoji"] = emoji
+ if embedding_provider_id is not None:
+ candidate_state["embedding_provider_id"] = embedding_provider_id
+ if rerank_provider_id is not _UNSET:  # identity check against the sentinel; None passes through as a value
+ candidate_state["rerank_provider_id"] = rerank_provider_id
+ if chunk_size is not None:
+ candidate_state["chunk_size"] = chunk_size
+ if chunk_overlap is not None:
+ candidate_state["chunk_overlap"] = chunk_overlap
+ if top_k_dense is not None:
+ candidate_state["top_k_dense"] = top_k_dense
+ if top_k_sparse is not None:
+ candidate_state["top_k_sparse"] = top_k_sparse
+ if top_m_final is not None:
+ candidate_state["top_m_final"] = top_m_final
+ if index_type is not None:
+ candidate_state["index_type"] = index_type
+ _validate_kb_options(  # validates the merged values; raises ValueError before any field of kb is changed
+ chunk_size=candidate_state["chunk_size"],
+ chunk_overlap=candidate_state["chunk_overlap"],
+ top_k_dense=candidate_state["top_k_dense"],
+ top_k_sparse=candidate_state["top_k_sparse"],
+ top_m_final=candidate_state["top_m_final"],
+ index_type=candidate_state["index_type"],
+ )
+ kb.kb_name = candidate_state["kb_name"]
+ kb.description = candidate_state["description"]
+ kb.emoji = candidate_state["emoji"]
+ kb.embedding_provider_id = candidate_state["embedding_provider_id"]
+ kb.rerank_provider_id = candidate_state["rerank_provider_id"] # type: ignore[assignment]
+ kb.chunk_size = candidate_state["chunk_size"]
+ kb.chunk_overlap = candidate_state["chunk_overlap"]
+ kb.top_k_dense = candidate_state["top_k_dense"]
+ kb.top_k_sparse = candidate_state["top_k_sparse"]
+ kb.top_m_final = candidate_state["top_m_final"]
+ kb.index_type = candidate_state["index_type"]
+
+ # Build a new helper first. Keep current vec_db alive until new init succeeds.
+ new_helper = KBHelper(
+ kb_db=self.kb_db,
+ kb=kb,
+ provider_manager=self.provider_manager,
+ kb_root_dir=FILES_PATH,
+ chunker=CHUNKER,
)
- return kb_helper
- async with self.kb_db.get_db() as session:
- session.add(kb)
- await session.commit()
- await session.refresh(kb)
+ try:
+ await new_helper.initialize()
+ except Exception as e:
+ # Roll back in-memory settings and keep current helper available.
+ kb.kb_name = previous_state["kb_name"]
+ kb.description = previous_state["description"]
+ kb.emoji = previous_state["emoji"]
+ kb.embedding_provider_id = previous_state["embedding_provider_id"]
+ kb.rerank_provider_id = previous_state["rerank_provider_id"]
+ kb.chunk_size = previous_state["chunk_size"]
+ kb.chunk_overlap = previous_state["chunk_overlap"]
+ kb.top_k_dense = previous_state["top_k_dense"]
+ kb.top_k_sparse = previous_state["top_k_sparse"]
+ kb.top_m_final = previous_state["top_m_final"]
+ kb.index_type = previous_state["index_type"]
+ kb_helper.init_error = previous_init_error
+ logger.error(
+ f"知识库 {kb.kb_name}({kb.kb_id}) 重新初始化失败,继续使用旧实例: {e}",
+ exc_info=True,
+ )
+ return kb_helper  # the old helper is returned; the failure is logged, not raised
+
+ async with self.kb_db.get_db() as session:  # persisted only after the new helper initialized successfully
+ session.add(kb)
+ await session.commit()
+ await session.refresh(kb)
- old_helper = kb_helper
- self.kb_insts[kb_id] = new_helper
- await old_helper.terminate()
- new_helper.init_error = None
- return new_helper
+ old_helper = kb_helper
+ self._set_kb_instance(kb_id, new_helper)  # swaps the helper and re-points the name index
+ await old_helper.terminate()
+ new_helper.init_error = None
+ return new_helper
async def retrieve(
self,
query: str,
- kb_names: list[str],
+ kb_names: list[str] | None = None,
+ kb_ids: list[str] | None = None,
top_k_fusion: int = 20,
- top_m_final: int = 5,
+ top_m_final: int = DEFAULT_TOP_M_FINAL,
+ include_trace: bool = False,
+ retrieval_overrides: dict | None = None,
) -> dict | None:
"""从指定知识库中检索相关内容"""
- kb_ids = []
+ resolved_kb_ids = []
kb_id_helper_map = {}
unavailable_kbs = []
- for kb_name in kb_names:
- if kb_helper := await self.get_kb_by_name(kb_name):
- if kb_helper.init_error:
- unavailable_kbs.append((kb_name, kb_helper.init_error))
- logger.warning(f"知识库 {kb_name} 不可用: {kb_helper.init_error}")
- continue
- kb_ids.append(kb_helper.kb.kb_id)
- kb_id_helper_map[kb_helper.kb.kb_id] = kb_helper
+ if kb_ids:
+ for kb_id in kb_ids:
+ if kb_helper := await self.get_kb(kb_id):
+ if kb_helper.init_error:
+ unavailable_kbs.append((kb_id, kb_helper.init_error))
+ logger.warning(f"知识库 {kb_id} 不可用: {kb_helper.init_error}")
+ continue
+ resolved_kb_ids.append(kb_helper.kb.kb_id)
+ kb_id_helper_map[kb_helper.kb.kb_id] = kb_helper
+ elif kb_names:
+ for kb_name in kb_names:
+ if kb_helper := await self.get_kb_by_name(kb_name):
+ if kb_helper.init_error:
+ unavailable_kbs.append((kb_name, kb_helper.init_error))
+ logger.warning(
+ f"知识库 {kb_name} 不可用: {kb_helper.init_error}",
+ )
+ continue
+ resolved_kb_ids.append(kb_helper.kb.kb_id)
+ kb_id_helper_map[kb_helper.kb.kb_id] = kb_helper
+ else:
+ return {}
# all requested KBs are unavailable
- if not kb_ids and unavailable_kbs:
+ if not resolved_kb_ids and unavailable_kbs:
errors = "; ".join(f"{n}: {e}" for n, e in unavailable_kbs)
raise ValueError(f"所有请求的知识库均不可用: {errors}")
- if not kb_ids:
+ if not resolved_kb_ids:
return {}
- results = await self.retrieval_manager.retrieve(
- query=query,
- kb_ids=kb_ids,
- kb_id_helper_map=kb_id_helper_map,
- top_k_fusion=top_k_fusion,
- top_m_final=top_m_final,
- )
+ trace_payload = None
+ if include_trace:
+ retrieval_response = await self.retrieval_manager.retrieve_with_trace(
+ query=query,
+ kb_ids=resolved_kb_ids,
+ kb_id_helper_map=kb_id_helper_map,
+ top_k_fusion=top_k_fusion,
+ top_m_final=top_m_final,
+ retrieval_overrides=retrieval_overrides,
+ )
+ results = retrieval_response.results
+ trace_payload = retrieval_response.trace.to_dict()
+ else:
+ results = await self.retrieval_manager.retrieve(
+ query=query,
+ kb_ids=resolved_kb_ids,
+ kb_id_helper_map=kb_id_helper_map,
+ top_k_fusion=top_k_fusion,
+ top_m_final=top_m_final,
+ retrieval_overrides=retrieval_overrides,
+ )
if not results:
- return None
+ empty_response = {
+ "context_text": "",
+ "results": [],
+ }
+ if include_trace:
+ empty_response["trace"] = trace_payload or {
+ "dense": [],
+ "sparse": [],
+ "fusion": [],
+ "dedup": [],
+ "rerank": [],
+ "final": [],
+ }
+ return empty_response if include_trace else None
context_text = self._format_context(results)
@@ -305,6 +567,7 @@ async def retrieve(
"kb_name": r.kb_name,
"doc_name": r.doc_name,
"chunk_index": r.metadata.get("chunk_index", 0),
+ "source": self._format_result_source(r),
"content": r.content,
"score": r.score,
"char_count": r.metadata.get("char_count", 0),
@@ -312,10 +575,40 @@ async def retrieve(
for r in results
]
- return {
+ response = {
"context_text": context_text,
"results": results_dict,
}
+ if include_trace:
+ response["trace"] = trace_payload
+ return response
+
+ def _format_result_source(self, result: RetrievalResult) -> dict:  # Collect the source-location fields of one retrieval result into a dict.
+ return {
+ "kb_name": result.kb_name,
+ "document_name": result.doc_name,
+ "chunk_index": result.metadata.get("chunk_index", 0),  # defaults to 0 when the metadata has no chunk_index
+ "section_index": result.metadata.get("section_index"),  # the remaining keys are None when absent from metadata
+ "title_path": result.metadata.get("title_path"),
+ "page_number": result.metadata.get("page_number"),
+ "parent_chunk_id": result.metadata.get("parent_chunk_id"),
+ }
+
+ def _format_source_label(self, result: RetrievalResult) -> str:  # Build the readable source string "kb / doc" plus optional details in parentheses.
+ source = self._format_result_source(result)
+ details = []
+ title_path = source.get("title_path")
+ if isinstance(title_path, list) and title_path:  # only a non-empty list is rendered, joined with " > "
+ details.append(" > ".join(str(title) for title in title_path))
+ if source.get("page_number") is not None:  # "is not None" so a value of 0 is still shown
+ details.append(f"第 {source['page_number']} 页")
+ if source.get("section_index") is not None:
+ details.append(f"章节 {source['section_index']}")
+
+ base = f"{result.kb_name} / {result.doc_name}"
+ if details:
+ return f"{base} ({'; '.join(details)})"
+ return base  # no location details available
def _format_context(self, results: list[RetrievalResult]) -> str:
"""格式化知识上下文
@@ -331,7 +624,7 @@ def _format_context(self, results: list[RetrievalResult]) -> str:
for i, result in enumerate(results, 1):
lines.append(f"【知识 {i}】")
- lines.append(f"来源: {result.kb_name} / {result.doc_name}")
+ lines.append(f"来源: {self._format_source_label(result)}")
lines.append(f"内容: {result.content}")
lines.append(f"相关度: {result.score:.2f}")
lines.append("")
@@ -359,11 +652,11 @@ async def upload_from_url(
self,
kb_id: str,
url: str,
- chunk_size: int = 512,
- chunk_overlap: int = 50,
- batch_size: int = 32,
- tasks_limit: int = 3,
- max_retries: int = 3,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+ chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+ batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE,
+ tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT,
+ max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES,
progress_callback=None,
) -> KBDocument:
"""从 URL 上传文档到指定的知识库
diff --git a/astrbot/core/knowledge_base/models.py b/astrbot/core/knowledge_base/models.py
index da919a384a..cd0e8290f0 100644
--- a/astrbot/core/knowledge_base/models.py
+++ b/astrbot/core/knowledge_base/models.py
@@ -3,6 +3,15 @@
from sqlmodel import Field, MetaData, SQLModel, Text, UniqueConstraint
+from .capabilities import (
+ DEFAULT_CHUNK_OVERLAP,
+ DEFAULT_CHUNK_SIZE,
+ DEFAULT_INDEX_TYPE,
+ DEFAULT_TOP_K_DENSE,
+ DEFAULT_TOP_K_SPARSE,
+ DEFAULT_TOP_M_FINAL,
+)
+
class BaseKBModel(SQLModel, table=False):
metadata = MetaData()
@@ -34,12 +43,14 @@ class KnowledgeBase(BaseKBModel, table=True):
embedding_provider_id: str | None = Field(default=None, max_length=100)
rerank_provider_id: str | None = Field(default=None, max_length=100)
# 分块配置参数
- chunk_size: int | None = Field(default=512, nullable=True)
- chunk_overlap: int | None = Field(default=50, nullable=True)
+ chunk_size: int | None = Field(default=DEFAULT_CHUNK_SIZE, nullable=True)
+ chunk_overlap: int | None = Field(default=DEFAULT_CHUNK_OVERLAP, nullable=True)
+ # 索引类型: "flat" (精确) 或 "hnsw" (近似最近邻,适合大规模)
+ index_type: str | None = Field(default=DEFAULT_INDEX_TYPE, max_length=10)
# 检索配置参数
- top_k_dense: int | None = Field(default=50, nullable=True)
- top_k_sparse: int | None = Field(default=50, nullable=True)
- top_m_final: int | None = Field(default=5, nullable=True)
+ top_k_dense: int | None = Field(default=DEFAULT_TOP_K_DENSE, nullable=True)
+ top_k_sparse: int | None = Field(default=DEFAULT_TOP_K_SPARSE, nullable=True)
+ top_m_final: int | None = Field(default=DEFAULT_TOP_M_FINAL, nullable=True)
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
@@ -81,6 +92,18 @@ class KBDocument(BaseKBModel, table=True):
file_type: str = Field(max_length=20, nullable=False)
file_size: int = Field(nullable=False)
file_path: str = Field(max_length=512, nullable=False)
+ source_type: str = Field(default="file", max_length=20, nullable=False)
+ source_uri: str | None = Field(default=None, sa_type=Text)
+ content_hash: str | None = Field(default=None, max_length=64, index=True)
+ parser_name: str | None = Field(default=None, max_length=100)
+ parser_version: str | None = Field(default=None, max_length=50)
+ chunker_name: str | None = Field(default=None, max_length=100)
+ chunker_version: str | None = Field(default=None, max_length=50)
+ status: str = Field(default="ready", max_length=20, nullable=False, index=True)
+ error_stage: str | None = Field(default=None, max_length=50)
+ error_message: str | None = Field(default=None, sa_type=Text)
+ version: int = Field(default=1, nullable=False)
+ parent_doc_id: str | None = Field(default=None, max_length=36, index=True)
chunk_count: int = Field(default=0, nullable=False)
media_count: int = Field(default=0, nullable=False)
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
@@ -88,6 +111,7 @@ class KBDocument(BaseKBModel, table=True):
default_factory=lambda: datetime.now(timezone.utc),
sa_column_kwargs={"onupdate": datetime.now(timezone.utc)},
)
+ indexed_at: datetime | None = Field(default=None)
class KBMedia(BaseKBModel, table=True):
@@ -118,3 +142,36 @@ class KBMedia(BaseKBModel, table=True):
file_size: int = Field(nullable=False)
mime_type: str = Field(max_length=100, nullable=False)
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+class KBIngestionTask(BaseKBModel, table=True):
+ """Persistent knowledge-base ingestion task state."""
+
+ __tablename__ = "kb_ingestion_tasks" # type: ignore
+
+ id: int | None = Field(  # autoincrement primary key; None until a value is assigned
+ primary_key=True,
+ sa_column_kwargs={"autoincrement": True},
+ default=None,
+ )
+ task_id: str = Field(  # unique, indexed identifier; defaults to a fresh UUID4 string per instance
+ max_length=36,
+ nullable=False,
+ unique=True,
+ default_factory=lambda: str(uuid.uuid4()),
+ index=True,
+ )
+ kb_id: str = Field(max_length=36, nullable=False, index=True)
+ task_type: str = Field(max_length=30, nullable=False, index=True)
+ status: str = Field(default="pending", max_length=20, nullable=False, index=True)
+ progress_stage: str | None = Field(default=None, max_length=50)
+ progress_current: int = Field(default=0, nullable=False)
+ progress_total: int = Field(default=100, nullable=False)
+ progress: str | None = Field(default=None, sa_type=Text)  # free-form text column; payload format is defined by the writer (not visible here)
+ result: str | None = Field(default=None, sa_type=Text)
+ error: str | None = Field(default=None, sa_type=Text)
+ created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+ updated_at: datetime = Field(
+ default_factory=lambda: datetime.now(timezone.utc),
+ sa_column_kwargs={"onupdate": lambda: datetime.now(timezone.utc)},  # callable, so the timestamp is computed per UPDATE instead of once at class definition
+ )
diff --git a/astrbot/core/knowledge_base/parsers/base.py b/astrbot/core/knowledge_base/parsers/base.py
index 4ffca9c6f2..c204adcfeb 100644
--- a/astrbot/core/knowledge_base/parsers/base.py
+++ b/astrbot/core/knowledge_base/parsers/base.py
@@ -20,6 +20,14 @@ class MediaItem:
mime_type: str
+@dataclass
+class TextSegment:
+ """Parsed text segment with optional source location metadata."""
+
+ text: str  # the text of this segment
+ metadata: dict  # location info for the segment, e.g. {"page_number": n} as filled in by the PDF parser
+
+
@dataclass
class ParseResult:
"""解析结果
@@ -29,6 +37,7 @@ class ParseResult:
text: str
media: list[MediaItem]
+ text_segments: list[TextSegment] | None = None
class BaseParser(ABC):
diff --git a/astrbot/core/knowledge_base/parsers/pdf_parser.py b/astrbot/core/knowledge_base/parsers/pdf_parser.py
index aeeea930a2..811341f25c 100644
--- a/astrbot/core/knowledge_base/parsers/pdf_parser.py
+++ b/astrbot/core/knowledge_base/parsers/pdf_parser.py
@@ -11,6 +11,7 @@
BaseParser,
MediaItem,
ParseResult,
+ TextSegment,
)
@@ -35,13 +36,20 @@ async def parse(self, file_content: bytes, file_name: str) -> ParseResult:
reader = PdfReader(pdf_file)
text_parts = []
+ text_segments = []
media_items = []
# 提取文本
- for page in reader.pages:
+ for page_number, page in enumerate(reader.pages, start=1):
text = page.extract_text()
if text:
text_parts.append(text)
+ text_segments.append(
+ TextSegment(
+ text=text,
+ metadata={"page_number": page_number},
+ )
+ )
# 提取图片
image_counter = 0
@@ -98,4 +106,8 @@ async def parse(self, file_content: bytes, file_name: str) -> ParseResult:
continue
full_text = "\n\n".join(text_parts)
- return ParseResult(text=full_text, media=media_items)
+ return ParseResult(
+ text=full_text,
+ media=media_items,
+ text_segments=text_segments,
+ )
diff --git a/astrbot/core/knowledge_base/retrieval/__init__.py b/astrbot/core/knowledge_base/retrieval/__init__.py
index b7c88075d5..26508c31f2 100644
--- a/astrbot/core/knowledge_base/retrieval/__init__.py
+++ b/astrbot/core/knowledge_base/retrieval/__init__.py
@@ -3,7 +3,12 @@
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- from .manager import RetrievalManager, RetrievalResult
+ from .manager import (
+ RetrievalManager,
+ RetrievalResult,
+ RetrievalTrace,
+ RetrievalWithTrace,
+ )
from .rank_fusion import FusedResult, RankFusion
from .sparse_retriever import SparseResult, SparseRetriever
@@ -12,18 +17,32 @@
"RankFusion",
"RetrievalManager",
"RetrievalResult",
+ "RetrievalTrace",
+ "RetrievalWithTrace",
"SparseResult",
"SparseRetriever",
]
def __getattr__(name: str):
- if name in {"RetrievalManager", "RetrievalResult"}:
- from .manager import RetrievalManager, RetrievalResult
+ if name in {
+ "RetrievalManager",
+ "RetrievalResult",
+ "RetrievalTrace",
+ "RetrievalWithTrace",
+ }:
+ from .manager import (
+ RetrievalManager,
+ RetrievalResult,
+ RetrievalTrace,
+ RetrievalWithTrace,
+ )
return {
"RetrievalManager": RetrievalManager,
"RetrievalResult": RetrievalResult,
+ "RetrievalTrace": RetrievalTrace,
+ "RetrievalWithTrace": RetrievalWithTrace,
}[name]
if name in {"FusedResult", "RankFusion"}:
diff --git a/astrbot/core/knowledge_base/retrieval/manager.py b/astrbot/core/knowledge_base/retrieval/manager.py
index 1d65401ce5..dbb5a483c9 100644
--- a/astrbot/core/knowledge_base/retrieval/manager.py
+++ b/astrbot/core/knowledge_base/retrieval/manager.py
@@ -3,14 +3,20 @@
协调稠密检索、稀疏检索和 Rerank,提供统一的检索接口
"""
+import json
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING
from astrbot import logger
from astrbot.core.db.vec_db.base import Result
+from astrbot.core.knowledge_base.capabilities import (
+ DEFAULT_TOP_K_DENSE,
+ DEFAULT_TOP_K_SPARSE,
+ DEFAULT_TOP_M_FINAL,
+)
from astrbot.core.knowledge_base.kb_db_sqlite import KBSQLiteDatabase
-from astrbot.core.knowledge_base.retrieval.rank_fusion import RankFusion
+from astrbot.core.knowledge_base.retrieval.rank_fusion import FusedResult, RankFusion
from astrbot.core.knowledge_base.retrieval.sparse_retriever import SparseRetriever
from astrbot.core.provider.provider import RerankProvider
@@ -20,6 +26,13 @@
from astrbot.core.db.vec_db.faiss_impl import FaissVecDB
+RetrievalOverrideValue = int | str | None  # value type allowed for a single retrieval override
+RetrievalOverrides = dict[str, RetrievalOverrideValue]  # type of the retrieval_overrides argument of retrieve / retrieve_with_trace
+
+DEDUP_SHINGLE_SIZE = 5  # presumably the shingle length used when de-duplicating fused results — TODO confirm against the dedup helper
+DEDUP_JACCARD_THRESHOLD = 0.92  # presumably the Jaccard similarity at or above which two results count as duplicates — TODO confirm
+
+
@dataclass
class RetrievalResult:
"""检索结果"""
@@ -34,6 +47,38 @@ class RetrievalResult:
metadata: dict
+@dataclass
+class RetrievalTrace:
+ """Detailed retrieval pipeline trace for diagnostics."""
+
+ dense: list[dict]  # one list of dict entries per pipeline stage; the entry schema is set by the code that fills the trace (not visible here)
+ sparse: list[dict]
+ fusion: list[dict]
+ dedup: list[dict]
+ dedup_removed: list[dict]
+ rerank: list[dict]
+ final: list[dict]
+
+ def to_dict(self) -> dict:  # Return the stage lists keyed by field name; the lists are referenced, not copied.
+ return {
+ "dense": self.dense,
+ "sparse": self.sparse,
+ "fusion": self.fusion,
+ "dedup": self.dedup,
+ "dedup_removed": self.dedup_removed,
+ "rerank": self.rerank,
+ "final": self.final,
+ }
+
+
+@dataclass
+class RetrievalWithTrace:
+ """Retrieval results with optional pipeline diagnostics."""
+
+ results: list[RetrievalResult]  # the retrieval results themselves
+ trace: RetrievalTrace  # note: a required field (no default), despite "optional" in the docstring
+
+
class RetrievalManager:
"""检索管理器
@@ -67,7 +112,8 @@ async def retrieve(
kb_ids: list[str],
kb_id_helper_map: dict[str, KBHelper],
top_k_fusion: int = 20,
- top_m_final: int = 5,
+ top_m_final: int = DEFAULT_TOP_M_FINAL,
+ retrieval_overrides: RetrievalOverrides | None = None,
) -> list[RetrievalResult]:
"""混合检索
@@ -90,24 +136,11 @@ async def retrieve(
if not kb_ids:
return []
- kb_options: dict = {}
- new_kb_ids = []
- for kb_id in kb_ids:
- kb_helper = kb_id_helper_map.get(kb_id)
- if kb_helper:
- kb = kb_helper.kb
- kb_options[kb_id] = {
- "top_k_dense": kb.top_k_dense or 50,
- "top_k_sparse": kb.top_k_sparse or 50,
- "top_m_final": kb.top_m_final or 5,
- "vec_db": kb_helper.vec_db,
- "rerank_provider_id": kb.rerank_provider_id,
- }
- new_kb_ids.append(kb_id)
- else:
- logger.warning(f"知识库 ID {kb_id} 实例未找到, 已跳过该知识库的检索")
-
- kb_ids = new_kb_ids
+ kb_ids, kb_options = self._build_kb_options(
+ kb_ids,
+ kb_id_helper_map,
+ retrieval_overrides=retrieval_overrides,
+ )
# 1. 稠密检索
time_start = time.time()
@@ -140,15 +173,302 @@ async def retrieve(
sparse_results=sparse_results,
top_k=top_k_fusion,
)
+ deduped_results = self._deduplicate_fused_results(fused_results)
time_end = time.time()
logger.debug(
- f"Rank fusion took {time_end - time_start:.2f}s and returned {len(fused_results)} results.",
+ f"Rank fusion took {time_end - time_start:.2f}s and returned "
+ f"{len(fused_results)} results; dedup kept {len(deduped_results)}.",
)
# 4. 转换为 RetrievalResult (批量获取元数据)
- doc_ids = {fr.doc_id for fr in fused_results}
+ doc_ids = {fr.doc_id for fr in deduped_results}
+ metadata_map = await self.kb_db.get_documents_with_metadata_batch(doc_ids)
+ retrieval_results = self._build_retrieval_results(
+ fused_results=deduped_results,
+ metadata_map=metadata_map,
+ )
+
+ # 5. Rerank
+ first_rerank = self._get_first_rerank_provider(kb_ids, kb_options)
+ if first_rerank and retrieval_results:
+ try:
+ retrieval_results = await self._rerank(
+ query=query,
+ results=retrieval_results,
+ top_k=top_m_final,
+ rerank_provider=first_rerank,
+ )
+ except Exception as e:
+ logger.warning(f"Rerank 执行失败,已跳过重排序并使用融合结果: {e}")
+
+ return retrieval_results[:top_m_final]
+
+ async def retrieve_with_trace(
+ self,
+ query: str,
+ kb_ids: list[str],
+ kb_id_helper_map: dict[str, KBHelper],
+ top_k_fusion: int = 20,
+ top_m_final: int = DEFAULT_TOP_M_FINAL,
+ retrieval_overrides: RetrievalOverrides | None = None,
+ ) -> RetrievalWithTrace:
+ """Hybrid retrieval with detailed stage diagnostics."""
+ if not kb_ids:
+ return RetrievalWithTrace(
+ results=[],
+ trace=RetrievalTrace(
+ dense=[],
+ sparse=[],
+ fusion=[],
+ dedup=[],
+ dedup_removed=[],
+ rerank=[],
+ final=[],
+ ),
+ )
+
+ kb_ids, kb_options = self._build_kb_options(
+ kb_ids,
+ kb_id_helper_map,
+ retrieval_overrides=retrieval_overrides,
+ )
+
+ dense_results = await self._dense_retrieve(
+ query=query,
+ kb_ids=kb_ids,
+ kb_options=kb_options,
+ )
+ sparse_results = await self.sparse_retriever.retrieve(
+ query=query,
+ kb_ids=kb_ids,
+ kb_options=kb_options,
+ )
+ fused_results = await self.rank_fusion.fuse(
+ dense_results=dense_results,
+ sparse_results=sparse_results,
+ top_k=top_k_fusion,
+ )
+ deduped_results, dedup_removed_results = (
+ self._deduplicate_fused_results_with_trace(
+ fused_results,
+ )
+ )
+
+ doc_ids = self._collect_trace_doc_ids(
+ dense_results=dense_results,
+ sparse_results=sparse_results,
+ fused_results=fused_results,
+ )
metadata_map = await self.kb_db.get_documents_with_metadata_batch(doc_ids)
+ doc_lookup = {
+ doc_id: {
+ "doc_name": metadata["document"].doc_name,
+ "kb_name": metadata["knowledge_base"].kb_name,
+ }
+ for doc_id, metadata in metadata_map.items()
+ }
+
+ retrieval_results = self._build_retrieval_results(
+ fused_results=deduped_results,
+ metadata_map=metadata_map,
+ )
+ rerank_results: list[RetrievalResult] = []
+ first_rerank = self._get_first_rerank_provider(kb_ids, kb_options)
+ if first_rerank and retrieval_results:
+ try:
+ retrieval_results = await self._rerank(
+ query=query,
+ results=retrieval_results,
+ top_k=top_m_final,
+ rerank_provider=first_rerank,
+ )
+ rerank_results = retrieval_results
+ except Exception as e:
+ logger.warning(f"Rerank 执行失败,已跳过重排序并使用融合结果: {e}")
+
+ final_results = retrieval_results[:top_m_final]
+ trace = RetrievalTrace(
+ dense=self._serialize_dense_trace(dense_results, doc_lookup),
+ sparse=self._serialize_sparse_trace(sparse_results, doc_lookup),
+ fusion=self._serialize_fusion_trace(fused_results, doc_lookup),
+ dedup=self._serialize_fusion_trace(deduped_results, doc_lookup),
+ dedup_removed=self._serialize_dedup_removed_trace(
+ dedup_removed_results,
+ doc_lookup,
+ ),
+ rerank=self._serialize_retrieval_trace(rerank_results, "rerank"),
+ final=self._serialize_retrieval_trace(final_results, "final"),
+ )
+ return RetrievalWithTrace(results=final_results, trace=trace)
+
+ def _build_kb_options(
+ self,
+ kb_ids: list[str],
+ kb_id_helper_map: dict[str, KBHelper],
+ *,
+ retrieval_overrides: RetrievalOverrides | None = None,
+ ) -> tuple[list[str], dict]:
+ kb_options: dict = {}
+ valid_kb_ids = []
+ for kb_id in kb_ids:
+ kb_helper = kb_id_helper_map.get(kb_id)
+ if not kb_helper:
+ logger.warning(f"知识库 ID {kb_id} 实例未找到, 已跳过该知识库的检索")
+ continue
+ kb = kb_helper.kb
+ kb_option = {
+ "top_k_dense": kb.top_k_dense or DEFAULT_TOP_K_DENSE,
+ "top_k_sparse": kb.top_k_sparse or DEFAULT_TOP_K_SPARSE,
+ "top_m_final": kb.top_m_final or DEFAULT_TOP_M_FINAL,
+ "vec_db": kb_helper.vec_db,
+ "rerank_provider_id": kb.rerank_provider_id,
+ }
+ if retrieval_overrides:
+ for field_name in (
+ "top_k_dense",
+ "top_k_sparse",
+ "top_m_final",
+ "rerank_provider_id",
+ ):
+ if field_name in retrieval_overrides:
+ kb_option[field_name] = retrieval_overrides[field_name]
+ kb_options[kb_id] = kb_option
+ valid_kb_ids.append(kb_id)
+ return valid_kb_ids, kb_options
+
+ def _collect_trace_doc_ids(
+ self,
+ *,
+ dense_results: list[Result],
+ sparse_results,
+ fused_results,
+ ) -> set[str]:
+ doc_ids = {result.doc_id for result in sparse_results}
+ doc_ids.update(result.doc_id for result in fused_results)
+ for result in dense_results:
+ metadata = self._safe_metadata(result.data.get("metadata"))
+ doc_id = metadata.get("kb_doc_id")
+ if doc_id:
+ doc_ids.add(doc_id)
+ return doc_ids
+
+ def _deduplicate_fused_results(
+ self,
+ fused_results: list[FusedResult],
+ ) -> list[FusedResult]:
+ deduped_results, _ = self._deduplicate_fused_results_with_trace(fused_results)
+ return deduped_results
+
+ def _deduplicate_fused_results_with_trace(
+ self,
+ fused_results: list[FusedResult],
+ ) -> tuple[list[FusedResult], list[dict]]:
+ selected: list[FusedResult] = []
+ removed: list[dict] = []
+ signatures: list[tuple[FusedResult, str, frozenset[str]]] = []
+
+ for result in fused_results:
+ normalized = self._normalize_content_for_dedup(result.content)
+ if not normalized:
+ selected.append(result)
+ continue
+
+ shingles = self._build_content_shingles(normalized)
+ duplicate_of = self._find_duplicate_signature(
+ normalized,
+ shingles,
+ signatures,
+ )
+ if duplicate_of:
+ selected_result, selected_normalized, selected_shingles = duplicate_of
+ removed.append(
+ {
+ "result": result,
+ "duplicate_of": selected_result,
+ "similarity": self._dedup_similarity(
+ normalized,
+ shingles,
+ selected_normalized,
+ selected_shingles,
+ ),
+ },
+ )
+ continue
+
+ selected.append(result)
+ signatures.append((result, normalized, shingles))
+
+ return selected, removed
+
+ @staticmethod
+ def _normalize_content_for_dedup(content: str) -> str:
+ return "".join(str(content or "").lower().split())
+
+ @staticmethod
+ def _build_content_shingles(
+ normalized_content: str,
+ size: int = DEDUP_SHINGLE_SIZE,
+ ) -> frozenset[str]:
+ if not normalized_content:
+ return frozenset()
+ if len(normalized_content) <= size:
+ return frozenset({normalized_content})
+ return frozenset(
+ normalized_content[index : index + size]
+ for index in range(len(normalized_content) - size + 1)
+ )
+
+ @staticmethod
+ def _is_duplicate_signature(
+ normalized: str,
+ shingles: frozenset[str],
+ existing: tuple[FusedResult, str, frozenset[str]],
+ ) -> bool:
+ _, existing_normalized, existing_shingles = existing
+ return (
+ RetrievalManager._dedup_similarity(
+ normalized,
+ shingles,
+ existing_normalized,
+ existing_shingles,
+ )
+ >= DEDUP_JACCARD_THRESHOLD
+ )
+
+ @staticmethod
+ def _dedup_similarity(
+ normalized: str,
+ shingles: frozenset[str],
+ existing_normalized: str,
+ existing_shingles: frozenset[str],
+ ) -> float:
+ if normalized == existing_normalized:
+ return 1.0
+ if not shingles or not existing_shingles:
+ return 0.0
+ union = len(shingles | existing_shingles)
+ if union == 0:
+ return 0.0
+ return len(shingles & existing_shingles) / union
+
+ def _find_duplicate_signature(
+ self,
+ normalized: str,
+ shingles: frozenset[str],
+ signatures: list[tuple[FusedResult, str, frozenset[str]]],
+ ) -> tuple[FusedResult, str, frozenset[str]] | None:
+ for signature in signatures:
+ if self._is_duplicate_signature(normalized, shingles, signature):
+ return signature
+ return None
+
+ def _build_retrieval_results(
+ self,
+ *,
+ fused_results,
+ metadata_map: dict,
+ ) -> list[RetrievalResult]:
retrieval_results = []
for fr in fused_results:
metadata_dict = metadata_map.get(fr.doc_id)
@@ -163,13 +483,22 @@ async def retrieve(
content=fr.content,
score=fr.score,
metadata={
+ **(fr.metadata or {}),
"chunk_index": fr.chunk_index,
"char_count": len(fr.content),
+ "dense_rank": fr.dense_rank,
+ "sparse_rank": fr.sparse_rank,
+ "dense_score": fr.dense_score,
+ "sparse_score": fr.sparse_score,
+ "rrf_score": fr.rrf_score
+ if fr.rrf_score is not None
+ else fr.score,
},
),
)
+ return retrieval_results
- # 5. Rerank
+ def _get_first_rerank_provider(self, kb_ids: list[str], kb_options: dict):
first_rerank = None
for kb_id in kb_ids:
vec_db = kb_options[kb_id]["vec_db"]
@@ -188,18 +517,186 @@ async def retrieve(
):
first_rerank = rerank_provider
break
- if first_rerank and retrieval_results:
- try:
- retrieval_results = await self._rerank(
- query=query,
- results=retrieval_results,
- top_k=top_m_final,
- rerank_provider=first_rerank,
- )
- except Exception as e:
- logger.warning(f"Rerank 执行失败,已跳过重排序并使用融合结果: {e}")
+ return first_rerank
- return retrieval_results[:top_m_final]
+ @staticmethod
+ def _content_preview(content: str, limit: int = 240) -> str:
+ if len(content) <= limit:
+ return content
+ return f"{content[:limit]}..."
+
+ def _serialize_dense_trace(
+ self,
+ dense_results: list[Result],
+ doc_lookup: dict[str, dict],
+ ) -> list[dict]:
+ trace = []
+ for rank, result in enumerate(dense_results, 1):
+ chunk_id = result.data.get("doc_id")
+ metadata = self._safe_metadata(result.data.get("metadata"))
+ doc_id = metadata.get("kb_doc_id")
+ source = doc_lookup.get(doc_id, {})
+ trace.append(
+ {
+ "rank": rank,
+ "chunk_id": chunk_id,
+ "doc_id": doc_id,
+ "doc_name": source.get("doc_name"),
+ "kb_id": metadata.get("kb_id"),
+ "kb_name": source.get("kb_name"),
+ "chunk_index": metadata.get("chunk_index", 0),
+ "score": result.similarity,
+ "dense_score": result.similarity,
+ "title_path": metadata.get("title_path"),
+ "page_number": metadata.get("page_number"),
+ "section_index": metadata.get("section_index"),
+ "content_preview": self._content_preview(
+ result.data.get("text", ""),
+ ),
+ },
+ )
+ return trace
+
+ def _serialize_sparse_trace(
+ self,
+ sparse_results,
+ doc_lookup: dict[str, dict],
+ ) -> list[dict]:
+ trace = []
+ for rank, result in enumerate(sparse_results, 1):
+ source = doc_lookup.get(result.doc_id, {})
+ trace.append(
+ {
+ "rank": rank,
+ "chunk_id": result.chunk_id,
+ "doc_id": result.doc_id,
+ "doc_name": source.get("doc_name"),
+ "kb_id": result.kb_id,
+ "kb_name": source.get("kb_name"),
+ "chunk_index": result.chunk_index,
+ "score": result.score,
+ "sparse_score": result.score,
+ "title_path": (result.metadata or {}).get("title_path"),
+ "page_number": (result.metadata or {}).get("page_number"),
+ "section_index": (result.metadata or {}).get("section_index"),
+ "content_preview": self._content_preview(result.content),
+ },
+ )
+ return trace
+
+ def _serialize_fusion_trace(
+ self,
+ fused_results,
+ doc_lookup: dict[str, dict],
+ ) -> list[dict]:
+ trace = []
+ for rank, result in enumerate(fused_results, 1):
+ source = doc_lookup.get(result.doc_id, {})
+ trace.append(
+ {
+ "rank": rank,
+ "chunk_id": result.chunk_id,
+ "doc_id": result.doc_id,
+ "doc_name": source.get("doc_name"),
+ "kb_id": result.kb_id,
+ "kb_name": source.get("kb_name"),
+ "chunk_index": result.chunk_index,
+ "score": result.score,
+ "dense_rank": result.dense_rank,
+ "sparse_rank": result.sparse_rank,
+ "dense_score": result.dense_score,
+ "sparse_score": result.sparse_score,
+ "rrf_score": result.rrf_score
+ if result.rrf_score is not None
+ else result.score,
+ "title_path": (result.metadata or {}).get("title_path"),
+ "page_number": (result.metadata or {}).get("page_number"),
+ "section_index": (result.metadata or {}).get("section_index"),
+ "content_preview": self._content_preview(result.content),
+ },
+ )
+ return trace
+
+ def _serialize_dedup_removed_trace(
+ self,
+ removed_results: list[dict],
+ doc_lookup: dict[str, dict],
+ ) -> list[dict]:
+ trace = []
+ for rank, removed in enumerate(removed_results, 1):
+ result = removed["result"]
+ duplicate_of = removed["duplicate_of"]
+ source = doc_lookup.get(result.doc_id, {})
+ trace.append(
+ {
+ "rank": rank,
+ "chunk_id": result.chunk_id,
+ "doc_id": result.doc_id,
+ "doc_name": source.get("doc_name"),
+ "kb_id": result.kb_id,
+ "kb_name": source.get("kb_name"),
+ "chunk_index": result.chunk_index,
+ "score": result.score,
+ "dense_rank": result.dense_rank,
+ "sparse_rank": result.sparse_rank,
+ "dense_score": result.dense_score,
+ "sparse_score": result.sparse_score,
+ "rrf_score": result.rrf_score
+ if result.rrf_score is not None
+ else result.score,
+ "duplicate_of_chunk_id": duplicate_of.chunk_id,
+ "duplicate_of_doc_id": duplicate_of.doc_id,
+ "dedup_similarity": removed["similarity"],
+ "title_path": (result.metadata or {}).get("title_path"),
+ "page_number": (result.metadata or {}).get("page_number"),
+ "section_index": (result.metadata or {}).get("section_index"),
+ "content_preview": self._content_preview(result.content),
+ },
+ )
+ return trace
+
+ def _serialize_retrieval_trace(
+ self,
+ results: list[RetrievalResult],
+ stage: str,
+ ) -> list[dict]:
+ trace = []
+ for rank, result in enumerate(results, 1):
+ trace.append(
+ {
+ "rank": rank,
+ "chunk_id": result.chunk_id,
+ "doc_id": result.doc_id,
+ "doc_name": result.doc_name,
+ "kb_id": result.kb_id,
+ "kb_name": result.kb_name,
+ "chunk_index": result.metadata.get("chunk_index", 0),
+ "score": result.score,
+ "dense_rank": result.metadata.get("dense_rank"),
+ "sparse_rank": result.metadata.get("sparse_rank"),
+ "dense_score": result.metadata.get("dense_score"),
+ "sparse_score": result.metadata.get("sparse_score"),
+ "rrf_score": result.metadata.get("rrf_score"),
+ "rerank_score": result.metadata.get("rerank_score"),
+ "title_path": result.metadata.get("title_path"),
+ "page_number": result.metadata.get("page_number"),
+ "section_index": result.metadata.get("section_index"),
+ "stage": stage,
+ "content_preview": self._content_preview(result.content),
+ },
+ )
+ return trace
+
+ @staticmethod
+ def _safe_metadata(raw_metadata) -> dict:
+ if not raw_metadata:
+ return {}
+ if isinstance(raw_metadata, dict):
+ return raw_metadata
+ try:
+ return json.loads(raw_metadata)
+ except Exception:
+ return {}
async def _dense_retrieve(
self,
@@ -209,7 +706,7 @@ async def _dense_retrieve(
):
"""稠密检索 (向量相似度)
- 为每个知识库使用独立的向量数据库进行检索,然后合并结果。
+ 为每个知识库使用独立的向量数据库进行并行检索,然后合并结果。
Args:
query: 查询文本
@@ -220,10 +717,11 @@ async def _dense_retrieve(
List[Result]: 检索结果列表
"""
- all_results: list[Result] = []
- for kb_id in kb_ids:
+ import asyncio
+
+ async def _retrieve_one(kb_id: str) -> list[Result]:
if kb_id not in kb_options:
- continue
+ return []
try:
vec_db: FaissVecDB = kb_options[kb_id]["vec_db"]
dense_k = int(kb_options[kb_id]["top_k_dense"])
@@ -234,17 +732,31 @@ async def _dense_retrieve(
rerank=False, # 稠密检索阶段不进行 rerank
metadata_filters={"kb_id": kb_id},
)
-
- all_results.extend(vec_results)
+ return vec_results
except Exception as e:
- logger.error(f"知识库 {kb_id} 稠密检索失败: {e}", exc_info=True)
+ logger.error(
+ f"知识库 {kb_id} 稠密检索失败: {e}",
+ exc_info=True,
+ )
if len(kb_ids) == 1:
- raise RuntimeError(f"知识库 {kb_id} 稠密检索失败: {e}") from e
+ raise RuntimeError(
+ f"知识库 {kb_id} 稠密检索失败: {e}",
+ ) from e
# multi-KB: skip the faulty KB and continue
+ return []
+
+ tasks = [_retrieve_one(kb_id) for kb_id in kb_ids]
+ results_per_kb = await asyncio.gather(*tasks, return_exceptions=True)
+
+ all_results: list[Result] = []
+ for result in results_per_kb:
+ if isinstance(result, Exception):
+ logger.error(f"稠密检索异常: {result}", exc_info=True)
+ continue
+ all_results.extend(result)
- # 按相似度排序并返回 top_k
+ # 按相似度排序并返回
all_results.sort(key=lambda x: x.similarity, reverse=True)
- # return all_results[: len(all_results) // len(kb_ids)]
return all_results
async def _rerank(
@@ -283,6 +795,7 @@ async def _rerank(
idx = rerank_result.index
if idx < len(results):
result = results[idx]
+ result.metadata["rerank_score"] = rerank_result.relevance_score
result.score = rerank_result.relevance_score
reranked_list.append(result)
diff --git a/astrbot/core/knowledge_base/retrieval/rank_fusion.py b/astrbot/core/knowledge_base/retrieval/rank_fusion.py
index 40afd97484..2dbb1a5bef 100644
--- a/astrbot/core/knowledge_base/retrieval/rank_fusion.py
+++ b/astrbot/core/knowledge_base/retrieval/rank_fusion.py
@@ -6,6 +6,7 @@
import json
from dataclasses import dataclass
+from astrbot.core import logger
from astrbot.core.db.vec_db.base import Result
from astrbot.core.knowledge_base.kb_db_sqlite import KBSQLiteDatabase
from astrbot.core.knowledge_base.retrieval.sparse_retriever import SparseResult
@@ -21,6 +22,12 @@ class FusedResult:
kb_id: str
content: str
score: float
+ metadata: dict | None = None
+ dense_rank: int | None = None
+ sparse_rank: int | None = None
+ dense_score: float | None = None
+ sparse_score: float | None = None
+ rrf_score: float | None = None
class RankFusion:
@@ -62,28 +69,27 @@ async def fuse(
List[FusedResult]: 融合后的结果列表
"""
- # 1. 构建排名映射
+ # 1. Build rank maps keyed by vector-storage chunk IDs.
dense_ranks = {
r.data["doc_id"]: (idx + 1) for idx, r in enumerate(dense_results)
- } # 这里的 doc_id 实际上是 chunk_id
+ }
sparse_ranks = {r.chunk_id: (idx + 1) for idx, r in enumerate(sparse_results)}
- # 2. 收集所有唯一的 ID
- # 需要统一为 chunk_id
+ # 2. Collect all unique chunk IDs.
all_chunk_ids = set()
- vec_doc_id_to_dense: dict[str, Result] = {} # vec_doc_id -> Result
- chunk_id_to_sparse: dict[str, SparseResult] = {} # chunk_id -> SparseResult
+ chunk_id_to_dense: dict[str, Result] = {}
+ chunk_id_to_sparse: dict[str, SparseResult] = {}
# 处理稀疏检索结果
for r in sparse_results:
all_chunk_ids.add(r.chunk_id)
chunk_id_to_sparse[r.chunk_id] = r
- # 处理稠密检索结果 (需要转换 vec_doc_id 到 chunk_id)
+ # Dense results use Document.doc_id, which stores the chunk UUID.
for r in dense_results:
- vec_doc_id = r.data["doc_id"]
- all_chunk_ids.add(vec_doc_id)
- vec_doc_id_to_dense[vec_doc_id] = r
+ chunk_id = r.data["doc_id"]
+ all_chunk_ids.add(chunk_id)
+ chunk_id_to_dense[chunk_id] = r
# 3. 计算 RRF 分数
rrf_scores: dict[str, float] = {}
@@ -108,6 +114,15 @@ async def fuse(
reverse=True,
)[:top_k]
+ if logger.isEnabledFor(10): # DEBUG
+ details = []
+ for cid in sorted_ids[:5]:
+ d_rank = dense_ranks.get(cid, "-")
+ s_rank = sparse_ranks.get(cid, "-")
+ rrf = rrf_scores[cid]
+ details.append(f"{cid[:8]}(d={d_rank},s={s_rank},rrf={rrf:.4f})")
+ logger.debug(f"RRF top-5: {' | '.join(details)}")
+
# 5. 构建融合结果
fused_results = []
for identifier in sorted_ids:
@@ -122,11 +137,21 @@ async def fuse(
kb_id=sr.kb_id,
content=sr.content,
score=rrf_scores[identifier],
+ metadata=sr.metadata,
+ dense_rank=dense_ranks.get(identifier),
+ sparse_rank=sparse_ranks.get(identifier),
+ dense_score=(
+ chunk_id_to_dense[identifier].similarity
+ if identifier in chunk_id_to_dense
+ else None
+ ),
+ sparse_score=sr.score,
+ rrf_score=rrf_scores[identifier],
),
)
- elif identifier in vec_doc_id_to_dense:
+ elif identifier in chunk_id_to_dense:
# 从向量检索获取信息,需要从数据库获取块的详细信息
- vec_result = vec_doc_id_to_dense[identifier]
+ vec_result = chunk_id_to_dense[identifier]
chunk_md = json.loads(vec_result.data["metadata"])
fused_results.append(
FusedResult(
@@ -136,6 +161,12 @@ async def fuse(
kb_id=chunk_md["kb_id"],
content=vec_result.data["text"],
score=rrf_scores[identifier],
+ metadata=chunk_md,
+ dense_rank=dense_ranks.get(identifier),
+ sparse_rank=sparse_ranks.get(identifier),
+ dense_score=vec_result.similarity,
+ sparse_score=None,
+ rrf_score=rrf_scores[identifier],
),
)
diff --git a/astrbot/core/knowledge_base/retrieval/sparse_retriever.py b/astrbot/core/knowledge_base/retrieval/sparse_retriever.py
index f06eb50909..8790d0224c 100644
--- a/astrbot/core/knowledge_base/retrieval/sparse_retriever.py
+++ b/astrbot/core/knowledge_base/retrieval/sparse_retriever.py
@@ -10,6 +10,7 @@
from rank_bm25 import BM25Okapi
+from astrbot.core import logger
from astrbot.core.knowledge_base.kb_db_sqlite import KBSQLiteDatabase
from astrbot.core.knowledge_base.retrieval.tokenizer import (
load_stopwords,
@@ -22,7 +23,10 @@
@dataclass
class SparseResult:
- """稀疏检索结果"""
+ """稀疏检索结果
+
+ score 语义: 越低越相关 (0 = 最佳匹配), 统一按升序排列后送入 RRF 融合。
+ """
chunk_index: int
chunk_id: str
@@ -30,25 +34,15 @@ class SparseResult:
kb_id: str
content: str
score: float
+ metadata: dict | None = None
class SparseRetriever:
- """BM25 稀疏检索器
-
- 职责:
- - 基于关键词的文档检索
- - 使用 BM25 算法计算相关度
- """
+ """BM25 稀疏检索器"""
def __init__(self, kb_db: KBSQLiteDatabase) -> None:
- """初始化稀疏检索器
-
- Args:
- kb_db: 知识库数据库实例
-
- """
self.kb_db = kb_db
- self._index_cache = {} # 缓存 BM25 索引
+ self._index_cache = {}
self.hit_stopwords = load_stopwords(
os.path.join(os.path.dirname(__file__), "hit_stopwords.txt"),
@@ -62,18 +56,13 @@ async def retrieve(
) -> list[SparseResult]:
"""执行稀疏检索
- Args:
- query: 查询文本
- kb_ids: 知识库 ID 列表
- kb_options: 每个知识库的检索选项
-
- Returns:
- List[SparseResult]: 检索结果列表
-
+ 优先使用 FTS5 全文索引; 不可用时回退到内存 BM25。
+ 结果按 score 升序排列 (lower-is-better), 直接喂给 RRF。
"""
fts_results = []
fallback_kb_ids = []
query_tokens = tokenize_text(query, self.hit_stopwords)
+
for kb_id in kb_ids:
vec_db: FaissVecDB | None = kb_options.get(kb_id, {}).get("vec_db")
if not vec_db:
@@ -89,6 +78,7 @@ async def retrieve(
for doc in result:
chunk_md = json.loads(doc["metadata"])
+ # FTS5 bm25(): 0=最佳, 极短文档可能为负值 → clamp 到 0
fts_results.append(
SparseResult(
chunk_id=doc["doc_id"],
@@ -96,7 +86,8 @@ async def retrieve(
doc_id=chunk_md["kb_doc_id"],
kb_id=kb_id,
content=doc["text"],
- score=-float(doc["score"]),
+ score=max(0.0, float(doc["score"])),
+ metadata=chunk_md,
),
)
@@ -107,70 +98,106 @@ async def retrieve(
kb_ids=fallback_kb_ids,
kb_options=kb_options,
)
+
results = fts_results + fallback_results
- results.sort(key=lambda x: x.score, reverse=True)
+ results.sort(key=lambda x: x.score)
+
+ if logger.isEnabledFor(10): # DEBUG
+ fts_top = [f"{r.chunk_id[:8]}={r.score:.4f}" for r in fts_results[:5]]
+ bm_top = [f"{r.chunk_id[:8]}={r.score:.4f}" for r in fallback_results[:5]]
+ merged_top = [f"{r.chunk_id[:8]}={r.score:.4f}" for r in results[:5]]
+ logger.debug(
+ f"Sparse top-5 | FTS5({len(fts_results)}): [{', '.join(fts_top)}] | "
+ f"BM25({len(fallback_results)}): [{', '.join(bm_top)}] | "
+ f"Merged({len(results)}): [{', '.join(merged_top)}]",
+ )
+
return results
+ # BM25 回退路径单次最多加载的文档数,防止 OOM
+ MAX_BM25_DOCS = 10_000
+
async def _retrieve_with_bm25(
self,
query: str,
kb_ids: list[str],
kb_options: dict,
) -> list[SparseResult]:
+ """FTS5 不可用时的 BM25Okapi 回退路径。
+
+ BM25Okapi 原始分值 higher-is-better → 取反统一为 lower-is-better。
+ 单 KB 最多加载 MAX_BM25_DOCS 条 chunk,超限时截断并打 warning。
+ """
top_k_sparse = 0
- chunks = []
+ all_kb_chunks: list[dict] = []
+
for kb_id in kb_ids:
vec_db: FaissVecDB | None = kb_options.get(kb_id, {}).get("vec_db")
if not vec_db:
continue
+ kb_top_k = kb_options.get(kb_id, {}).get("top_k_sparse", 50)
+ top_k_sparse = max(top_k_sparse, kb_top_k)
+
result = await vec_db.document_storage.get_documents(
- metadata_filters={},
- limit=None,
- offset=None,
+ metadata_filters={"kb_id": kb_id},
+ limit=self.MAX_BM25_DOCS,
+ offset=0,
)
+ if len(result) >= self.MAX_BM25_DOCS:
+ logger.warning(
+ f"知识库 {kb_id} 的 BM25 回退检索已触及 {self.MAX_BM25_DOCS} "
+ f"条 chunk 上限,结果可能不完整。建议检查 FTS5 索引状态。",
+ )
chunk_mds = [json.loads(doc["metadata"]) for doc in result]
- result = [
+ kb_chunks = [
{
"chunk_id": doc["doc_id"],
"chunk_index": chunk_md["chunk_index"],
"doc_id": chunk_md["kb_doc_id"],
"kb_id": kb_id,
"text": doc["text"],
+ "kb_top_k": kb_top_k,
+ "metadata": chunk_md,
}
for doc, chunk_md in zip(result, chunk_mds)
]
- chunks.extend(result)
- top_k_sparse += kb_options.get(kb_id, {}).get("top_k_sparse", 50)
+ all_kb_chunks.append(kb_chunks)
- if not chunks:
+ if not any(all_kb_chunks):
return []
- # 2. 准备文档和索引
- corpus = [chunk["text"] for chunk in chunks]
- tokenized_corpus = [tokenize_text(doc, self.hit_stopwords) for doc in corpus]
-
- # 3. 构建 BM25 索引
- bm25 = BM25Okapi(tokenized_corpus)
-
- # 4. 执行检索
- tokenized_query = tokenize_text(query, self.hit_stopwords)
- scores = bm25.get_scores(tokenized_query)
-
- # 5. 排序并返回 Top-K
- results = []
- for idx, score in enumerate(scores):
- chunk = chunks[idx]
- results.append(
- SparseResult(
- chunk_id=chunk["chunk_id"],
- chunk_index=chunk["chunk_index"],
- doc_id=chunk["doc_id"],
- kb_id=chunk["kb_id"],
- content=chunk["text"],
- score=float(score),
- ),
- )
+ # 每个知识库独立计算 BM25 分数并截断,再合并。
+ merged_results: list[SparseResult] = []
+ for kb_chunks in all_kb_chunks:
+ if not kb_chunks:
+ continue
+ kb_top_k = kb_chunks[0]["kb_top_k"]
+
+ corpus = [chunk["text"] for chunk in kb_chunks]
+ tokenized_corpus = [
+ tokenize_text(doc, self.hit_stopwords) for doc in corpus
+ ]
+ bm25 = BM25Okapi(tokenized_corpus)
+
+ tokenized_query = tokenize_text(query, self.hit_stopwords)
+ scores = bm25.get_scores(tokenized_query)
+
+ kb_results: list[SparseResult] = []
+ for idx, score in enumerate(scores):
+ chunk = kb_chunks[idx]
+ kb_results.append(
+ SparseResult(
+ chunk_id=chunk["chunk_id"],
+ chunk_index=chunk["chunk_index"],
+ doc_id=chunk["doc_id"],
+ kb_id=chunk["kb_id"],
+ content=chunk["text"],
+ score=-float(score),
+ metadata=chunk["metadata"],
+ ),
+ )
+
+ merged_results.extend(sorted(kb_results, key=lambda x: x.score)[:kb_top_k])
- results.sort(key=lambda x: x.score, reverse=True)
- # return results[: len(results) // len(kb_ids)]
- return results[:top_k_sparse]
+ merged_results.sort(key=lambda x: x.score)
+ return merged_results[:top_k_sparse]
diff --git a/astrbot/core/tools/knowledge_base_tools.py b/astrbot/core/tools/knowledge_base_tools.py
index e082fd4253..da00c18f47 100644
--- a/astrbot/core/tools/knowledge_base_tools.py
+++ b/astrbot/core/tools/knowledge_base_tools.py
@@ -53,7 +53,7 @@ async def retrieve_knowledge_base(
f"[知识库] 会话 {umo} 配置的以下知识库无效: {invalid_kb_ids}",
)
if not kb_names:
- return None
+ return "会话配置的知识库均不存在或未加载,请检查知识库设置。"
logger.debug(f"[知识库] 使用会话级配置,知识库数量: {len(kb_names)}")
else:
kb_names = config.get("kb_names", [])
diff --git a/astrbot/dashboard/routes/knowledge_base.py b/astrbot/dashboard/routes/knowledge_base.py
index 1b6f7a435d..ca97f296ea 100644
--- a/astrbot/dashboard/routes/knowledge_base.py
+++ b/astrbot/dashboard/routes/knowledge_base.py
@@ -11,6 +11,29 @@
from astrbot.core import logger
from astrbot.core.core_lifecycle import AstrBotCoreLifecycle
+from astrbot.core.knowledge_base.capabilities import (
+ ALLOWED_UPLOAD_EXTENSIONS,
+ DEFAULT_CHUNK_OVERLAP,
+ DEFAULT_CHUNK_PAGE_SIZE,
+ DEFAULT_CHUNK_SIZE,
+ DEFAULT_DOCUMENT_PAGE_SIZE,
+ DEFAULT_INDEX_TYPE,
+ DEFAULT_KB_PAGE_SIZE,
+ DEFAULT_TOP_K_DENSE,
+ DEFAULT_TOP_K_SPARSE,
+ DEFAULT_TOP_M_FINAL,
+ DEFAULT_UPLOAD_BATCH_SIZE,
+ DEFAULT_UPLOAD_MAX_RETRIES,
+ DEFAULT_UPLOAD_TASKS_LIMIT,
+ DOCUMENT_FILTER_SOURCE_TYPES,
+ DOCUMENT_FILTER_STATUSES,
+ MAX_BATCH_DELETE_DOCUMENTS,
+ MAX_BATCH_REBUILD_DOCUMENTS,
+ MAX_RETRIEVE_TOP_K,
+ MAX_UPLOAD_FILE_SIZE,
+ MAX_UPLOAD_FILES,
+ get_knowledge_base_capabilities,
+)
from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
@@ -41,12 +64,16 @@ def __init__(
# 注册路由
self.routes = {
# 知识库管理
+ "/kb/capabilities": ("GET", self.get_capabilities),
"/kb/list": ("GET", self.list_kbs),
"/kb/create": ("POST", self.create_kb),
"/kb/get": ("GET", self.get_kb),
"/kb/update": ("POST", self.update_kb),
"/kb/delete": ("POST", self.delete_kb),
"/kb/stats": ("GET", self.get_kb_stats),
+ "/kb/consistency/check": ("GET", self.check_kb_consistency),
+ "/kb/consistency/repair": ("POST", self.repair_kb_consistency),
+ "/kb/rebuild": ("POST", self.rebuild_kb),
# 文档管理
"/kb/document/list": ("GET", self.list_documents),
"/kb/document/upload": ("POST", self.upload_document),
@@ -54,9 +81,15 @@ def __init__(
"/kb/document/upload/url": ("POST", self.upload_document_from_url),
"/kb/document/upload/progress": ("GET", self.get_upload_progress),
"/kb/document/get": ("GET", self.get_document),
+ "/kb/document/rebuild": ("POST", self.rebuild_document),
+ "/kb/document/batch-rebuild": ("POST", self.batch_rebuild_documents),
"/kb/document/delete": ("POST", self.delete_document),
+ "/kb/document/batch-delete": ("POST", self.batch_delete_documents),
+ "/kb/task/get": ("GET", self.get_task),
+ "/kb/task/list": ("GET", self.list_tasks),
# # 块管理
"/kb/chunk/list": ("GET", self.list_chunks),
+ "/kb/chunk/context": ("GET", self.get_chunk_context),
"/kb/chunk/delete": ("POST", self.delete_chunk),
# # 多媒体管理
# "/kb/media/list": ("GET", self.list_media),
@@ -69,6 +102,77 @@ def __init__(
def _get_kb_manager(self):
return self.core_lifecycle.kb_manager
+ def _get_kb_db(self):
+ if not hasattr(self, "core_lifecycle"):
+ return None
+ kb_manager = self._get_kb_manager()
+ return getattr(kb_manager, "kb_db", None)
+
+ @staticmethod
+ def _get_positive_query_int(name: str, default: int) -> int:
+ value = request.args.get(name, default, type=int)
+ return max(value if value is not None else default, 1)
+
+ async def get_capabilities(self):
+ """Return knowledge base capabilities, defaults, and limits."""
+ return Response().ok(get_knowledge_base_capabilities()).__dict__
+
+ async def _create_persistent_task(
+ self,
+ *,
+ task_id: str,
+ kb_id: str | None,
+ task_type: str,
+ status: str,
+ progress: dict | None = None,
+ ) -> None:
+ kb_db = self._get_kb_db()
+ if not kb_db or not kb_id:
+ return
+ try:
+ await kb_db.create_ingestion_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type=task_type,
+ status=status,
+ progress_stage=(progress or {}).get("stage"),
+ progress_current=(progress or {}).get("current", 0),
+ progress_total=(progress or {}).get("total", 100),
+ progress=progress,
+ )
+ except Exception as e:
+ logger.warning(f"创建知识库持久任务记录失败 {task_id}: {e}")
+
+ async def _update_persistent_task(self, task_id: str, **updates) -> None:
+ kb_db = self._get_kb_db()
+ if not kb_db:
+ return
+ try:
+ await kb_db.update_ingestion_task(task_id, **updates)
+ except Exception as e:
+ logger.warning(f"更新知识库持久任务记录失败 {task_id}: {e}")
+
+ async def _get_persistent_task(self, task_id: str) -> dict | None:
+ kb_db = self._get_kb_db()
+ if not kb_db:
+ return None
+ try:
+ return await kb_db.get_ingestion_task(task_id)
+ except Exception as e:
+ logger.warning(f"读取知识库持久任务记录失败 {task_id}: {e}")
+ return None
+
+ def _get_persistent_progress_updates(self, task_id: str) -> dict:
+ progress = self.upload_progress.get(task_id)
+ if not progress:
+ return {}
+ return {
+ "progress_stage": progress.get("stage"),
+ "progress_current": progress.get("current", 0),
+ "progress_total": progress.get("total", 100),
+ "progress": progress,
+ }
+
def _init_task(self, task_id: str, status: str = "pending") -> None:
self.upload_tasks[task_id] = {
"status": status,
@@ -87,6 +191,21 @@ def _set_task_result(
if task_id in self.upload_progress:
self.upload_progress[task_id]["status"] = status
+ def _cleanup_task(self, task_id: str) -> None:
+ """清理已完成/失败的任务,释放内存。幂等操作。"""
+ self.upload_tasks.pop(task_id, None)
+ self.upload_progress.pop(task_id, None)
+
+ async def _schedule_delayed_cleanup(
+ self, task_id: str, delay_seconds: int = 300
+ ) -> None:
+ """延迟清理任务,作为客户端不轮询时的兜底机制。"""
+ try:
+ await asyncio.sleep(delay_seconds)
+ except asyncio.CancelledError:
+ return
+ self._cleanup_task(task_id)
+
def _update_progress(
self,
task_id: str,
@@ -114,6 +233,16 @@ def _update_progress(
if total is not None:
p["total"] = total
+ async def _persist_progress(self, task_id: str) -> None:
+ progress = self.upload_progress.get(task_id)
+ if not progress:
+ return
+ await self._update_persistent_task(
+ task_id,
+ status=progress.get("status"),
+ **self._get_persistent_progress_updates(task_id),
+ )
+
def _make_progress_callback(self, task_id: str, file_idx: int, file_name: str):
async def _callback(stage: str, current: int, total: int) -> None:
self._update_progress(
@@ -125,16 +254,140 @@ async def _callback(stage: str, current: int, total: int) -> None:
current=current,
total=total,
)
+ await self._persist_progress(task_id)
return _callback
@staticmethod
def _format_failed_doc_error(file_name: str, error: Exception) -> str:
message = str(error).strip() or "上传失败:发生未知错误。"
- if message.startswith(file_name):
+ if message.startswith(f"{file_name}:"):
return message
return f"{file_name}: {message}"
+    @staticmethod
+    def _resolve_batch_task_status(success_count: int, failed_count: int) -> str:
+        """Map batch counters to a final task status.
+
+        Returns ``"completed"`` when nothing failed, ``"partial_failed"``
+        when there are failures alongside at least one success, and
+        ``"failed"`` otherwise.
+        """
+        if failed_count != 0:
+            return "partial_failed" if success_count > 0 else "failed"
+        return "completed"
+
+    @staticmethod
+    def _build_batch_failure_error(
+        failed_docs: list[dict],
+        success_count: int = 0,
+        action: str = "上传",
+    ) -> str | None:
+        """Summarise the failures of a batch task as one error message.
+
+        Args:
+            failed_docs: One dict per failed document; its ``"error"`` value
+                is used as-is when it is the only failure.
+            success_count: Number of documents that succeeded; selects the
+                "partial" or "all" wording for multi-failure summaries.
+            action: Verb inserted into the generated messages.
+
+        Returns:
+            ``None`` when nothing failed, otherwise the message.
+        """
+        if not failed_docs:
+            return None
+        if len(failed_docs) == 1:
+            # The fallback must name the real action; previously it always
+            # said "上传" even for import and rebuild tasks.
+            return failed_docs[0].get("error") or f"{action}失败:发生未知错误。"
+        scope = "部分" if success_count > 0 else "所有"
+        return f"{scope}文档{action}失败,共 {len(failed_docs)} 个失败。"
+
+    @staticmethod
+    def _format_size_limit(size_bytes: int) -> str:
+        """Render a byte count as megabytes with an ``MB`` suffix.
+
+        Whole-megabyte values are printed without decimals; everything else
+        is printed with two decimal places.
+        """
+        megabytes = size_bytes / (1024 * 1024)
+        rendered = str(int(megabytes)) if megabytes.is_integer() else f"{megabytes:.2f}"
+        return f"{rendered}MB"
+
+    @staticmethod
+    def _coerce_optional_int(value: Any, field_name: str) -> int | None:
+        """Convert an optional request value to ``int``.
+
+        Args:
+            value: Raw value from a JSON body or form field.
+            field_name: Name used in the error message.
+
+        Returns:
+            ``None`` for ``None`` or an empty string, otherwise the integer.
+
+        Raises:
+            ValueError: If the value is a bool, a non-integral float, or
+                cannot be converted by ``int()``.
+        """
+        if value in (None, ""):
+            return None
+        message = f"{field_name} 必须是整数"
+        # bool is a subclass of int, so int(True) would silently become 1.
+        if isinstance(value, bool):
+            raise ValueError(message)
+        # int(3.7) truncates silently; only integral floats such as 512.0
+        # are accepted (nan and inf are not integral either).
+        if isinstance(value, float) and not value.is_integer():
+            raise ValueError(message)
+        try:
+            return int(value)
+        except (TypeError, ValueError, OverflowError) as e:
+            raise ValueError(message) from e
+
+    @staticmethod
+    def _coerce_optional_bool(value: Any, field_name: str) -> bool:
+        """Convert an optional request value to ``bool``.
+
+        Args:
+            value: Raw value from a JSON body or form field.
+            field_name: Name used in the error message.
+
+        Returns:
+            The bool itself; ``False`` for ``None`` or an empty string; the
+            matching bool for the integers 0/1 and for the case-insensitive
+            strings true/1/yes/on and false/0/no/off.
+
+        Raises:
+            ValueError: For any other value.
+        """
+        if isinstance(value, bool):
+            return value
+        if value in (None, ""):
+            return False
+        # The strings "1" and "0" are accepted below, so accept the
+        # equivalent JSON numbers too rather than rejecting them.
+        if isinstance(value, int) and value in (0, 1):
+            return value == 1
+        if isinstance(value, str):
+            lowered = value.strip().lower()
+            if lowered in {"true", "1", "yes", "on"}:
+                return True
+            if lowered in {"false", "0", "no", "off"}:
+                return False
+        raise ValueError(f"{field_name} 必须是布尔值")
+
+    @staticmethod
+    def _validate_chunk_options(
+        *,
+        chunk_size: int | None,
+        chunk_overlap: int | None,
+    ) -> None:
+        """Validate chunking parameters; ``None`` values are not checked.
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive, ``chunk_overlap``
+                is negative, or the overlap is not smaller than the size.
+        """
+        has_size = chunk_size is not None
+        has_overlap = chunk_overlap is not None
+        if has_size and chunk_size <= 0:
+            raise ValueError("chunk_size 必须大于 0")
+        if has_overlap and chunk_overlap < 0:
+            raise ValueError("chunk_overlap 不能为负数")
+        if has_size and has_overlap and chunk_overlap >= chunk_size:
+            raise ValueError("chunk_overlap 必须小于 chunk_size")
+
+    @staticmethod
+    def _validate_positive_int(value: int | None, field_name: str) -> None:
+        """Raise ``ValueError`` when ``value`` is given and not greater than 0."""
+        if value is None:
+            return
+        if value <= 0:
+            raise ValueError(f"{field_name} 必须大于 0")
+
+    @classmethod
+    def _validate_kb_options(
+        cls,
+        *,
+        chunk_size: int | None,
+        chunk_overlap: int | None,
+        top_k_dense: int | None,
+        top_k_sparse: int | None,
+        top_m_final: int | None,
+        index_type: str | None,
+    ) -> None:
+        """Validate knowledge-base settings; ``None`` values are not checked.
+
+        Raises:
+            ValueError: If the chunk options are inconsistent, any top-k or
+                top-m value is not positive, or ``index_type`` is given and
+                is neither ``"flat"`` nor ``"hnsw"``.
+        """
+        cls._validate_chunk_options(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+        for name, candidate in (
+            ("top_k_dense", top_k_dense),
+            ("top_k_sparse", top_k_sparse),
+            ("top_m_final", top_m_final),
+        ):
+            cls._validate_positive_int(candidate, name)
+        # Compare against a tuple, not a set: an unhashable value (e.g. a
+        # JSON list or object) would otherwise raise TypeError instead of
+        # the ValueError callers expect.
+        if index_type is not None and index_type not in ("flat", "hnsw"):
+            raise ValueError("index_type 必须是 flat 或 hnsw")
+
+    @classmethod
+    def _validate_upload_options(
+        cls,
+        *,
+        chunk_size: int,
+        chunk_overlap: int,
+        batch_size: int,
+        tasks_limit: int,
+        max_retries: int,
+    ) -> None:
+        """Validate the processing options of an upload or rebuild request.
+
+        Raises:
+            ValueError: If the chunk options are inconsistent, ``batch_size``
+                or ``tasks_limit`` is not positive, or ``max_retries`` is
+                negative.
+        """
+        cls._validate_chunk_options(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+        )
+        for name, candidate in (
+            ("batch_size", batch_size),
+            ("tasks_limit", tasks_limit),
+        ):
+            cls._validate_positive_int(candidate, name)
+        if max_retries < 0:
+            raise ValueError("max_retries 不能为负数")
+
+    @staticmethod
+    def _validate_upload_file(file_name: str, file_size: int) -> None:
+        """Reject files with a disallowed extension or an excessive size.
+
+        The extension is the lower-cased text after the last dot, or an
+        empty string when the name contains no dot.
+
+        Raises:
+            ValueError: If the extension is not in
+                ``ALLOWED_UPLOAD_EXTENSIONS`` or ``file_size`` exceeds
+                ``MAX_UPLOAD_FILE_SIZE``.
+        """
+        _, dot, suffix = file_name.rpartition(".")
+        extension = suffix.lower() if dot else ""
+        if extension not in ALLOWED_UPLOAD_EXTENSIONS:
+            raise ValueError(f"不支持的文件类型: {file_name}")
+        if file_size <= MAX_UPLOAD_FILE_SIZE:
+            return
+        limit = KnowledgeBaseRoute._format_size_limit(MAX_UPLOAD_FILE_SIZE)
+        raise ValueError(f"文件超过 {limit} 限制: {file_name}")
+
async def _background_upload_task(
self,
task_id: str,
@@ -158,6 +411,7 @@ async def _background_upload_task(
"current": 0,
"total": 100,
}
+ await self._persist_progress(task_id)
uploaded_docs = []
failed_docs = []
@@ -174,6 +428,7 @@ async def _background_upload_task(
current=0,
total=100,
)
+ await self._persist_progress(task_id)
# 创建进度回调函数
progress_callback = self._make_progress_callback(
@@ -214,12 +469,42 @@ async def _background_upload_task(
"failed_count": len(failed_docs),
}
- self._set_task_result(task_id, "completed", result=result)
+ task_status = self._resolve_batch_task_status(
+ len(uploaded_docs),
+ len(failed_docs),
+ )
+ task_error = self._build_batch_failure_error(
+ failed_docs,
+ success_count=len(uploaded_docs),
+ action="上传",
+ )
+ self._set_task_result(
+ task_id,
+ task_status,
+ result=result,
+ error=task_error,
+ )
+ await self._update_persistent_task(
+ task_id,
+ status=task_status,
+ result=result,
+ error=task_error,
+ **self._get_persistent_progress_updates(task_id),
+ )
except Exception as e:
logger.error(f"后台上传任务 {task_id} 失败: {e}")
logger.error(traceback.format_exc())
self._set_task_result(task_id, "failed", error=str(e))
+ await self._update_persistent_task(
+ task_id,
+ status="failed",
+ error=str(e),
+ **self._get_persistent_progress_updates(task_id),
+ )
+ finally:
+ # 兜底清理:防止客户端不轮询 get_upload_progress 导致内存泄漏
+ asyncio.create_task(self._schedule_delayed_cleanup(task_id))
async def _background_import_task(
self,
@@ -242,6 +527,7 @@ async def _background_import_task(
"current": 0,
"total": 100,
}
+ await self._persist_progress(task_id)
uploaded_docs = []
failed_docs = []
@@ -261,6 +547,7 @@ async def _background_import_task(
current=0,
total=100,
)
+ await self._persist_progress(task_id)
# 创建进度回调函数
progress_callback = self._make_progress_callback(
@@ -282,6 +569,8 @@ async def _background_import_task(
max_retries=max_retries,
progress_callback=progress_callback,
pre_chunked_text=chunks,
+ source_type="import",
+ source_uri=file_name,
)
uploaded_docs.append(doc.model_dump())
@@ -304,32 +593,327 @@ async def _background_import_task(
"failed_count": len(failed_docs),
}
- self._set_task_result(task_id, "completed", result=result)
+ task_status = self._resolve_batch_task_status(
+ len(uploaded_docs),
+ len(failed_docs),
+ )
+ task_error = self._build_batch_failure_error(
+ failed_docs,
+ success_count=len(uploaded_docs),
+ action="导入",
+ )
+ self._set_task_result(
+ task_id,
+ task_status,
+ result=result,
+ error=task_error,
+ )
+ await self._update_persistent_task(
+ task_id,
+ status=task_status,
+ result=result,
+ error=task_error,
+ **self._get_persistent_progress_updates(task_id),
+ )
except Exception as e:
logger.error(f"后台导入任务 {task_id} 失败: {e}")
logger.error(traceback.format_exc())
self._set_task_result(task_id, "failed", error=str(e))
+ await self._update_persistent_task(
+ task_id,
+ status="failed",
+ error=str(e),
+ **self._get_persistent_progress_updates(task_id),
+ )
+ finally:
+ asyncio.create_task(self._schedule_delayed_cleanup(task_id))
+
+    async def _background_rebuild_document_task(
+        self,
+        task_id: str,
+        kb_helper,
+        doc_id: str,
+        chunk_size: int | None,
+        chunk_overlap: int | None,
+        batch_size: int,
+        tasks_limit: int,
+        max_retries: int,
+    ) -> None:
+        """Rebuild one document in the background and record the outcome.
+
+        Marks the task as processing, calls ``kb_helper.rebuild_document``
+        with a progress callback, then stores the result in memory and in
+        the persistent task record. Any exception marks the task as failed.
+        A delayed cleanup of the in-memory state is always scheduled.
+        """
+        try:
+            self._init_task(task_id, status="processing")
+            self.upload_progress[task_id] = dict(
+                status="processing",
+                file_index=0,
+                file_total=1,
+                file_name=doc_id,
+                stage="rebuilding",
+                current=0,
+                total=100,
+            )
+            await self._persist_progress(task_id)
+
+            on_progress = self._make_progress_callback(task_id, 0, doc_id)
+            rebuilt_doc = await kb_helper.rebuild_document(
+                doc_id,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                batch_size=batch_size,
+                tasks_limit=tasks_limit,
+                max_retries=max_retries,
+                progress_callback=on_progress,
+            )
+
+            result = dict(
+                task_id=task_id,
+                rebuilt=[rebuilt_doc.model_dump()],
+                failed=[],
+                total=1,
+                success_count=1,
+                failed_count=0,
+            )
+            self._update_progress(
+                task_id,
+                status="completed",
+                file_index=0,
+                file_name=doc_id,
+                stage="completed",
+                current=100,
+                total=100,
+            )
+            self._set_task_result(task_id, "completed", result=result)
+            # Read the progress fields only after the final state is stored.
+            progress_fields = self._get_persistent_progress_updates(task_id)
+            await self._update_persistent_task(
+                task_id,
+                status="completed",
+                result=result,
+                error=None,
+                **progress_fields,
+            )
+
+        except Exception as exc:
+            logger.error(f"后台重建文档任务 {task_id} 失败: {exc}")
+            logger.error(traceback.format_exc())
+            failure = str(exc)
+            self._set_task_result(task_id, "failed", error=failure)
+            progress_fields = self._get_persistent_progress_updates(task_id)
+            await self._update_persistent_task(
+                task_id,
+                status="failed",
+                error=failure,
+                **progress_fields,
+            )
+        finally:
+            asyncio.create_task(self._schedule_delayed_cleanup(task_id))
+
+    async def _background_rebuild_kb_task(
+        self,
+        task_id: str,
+        kb_helper,
+        chunk_size: int | None,
+        chunk_overlap: int | None,
+        batch_size: int,
+        tasks_limit: int,
+        max_retries: int,
+    ) -> None:
+        """Rebuild every document of a knowledge base in the background.
+
+        Marks the task as processing, calls
+        ``kb_helper.rebuild_all_documents`` with a progress callback, derives
+        the final status and error message from the returned counters, and
+        stores the outcome in memory and in the persistent task record. Any
+        exception marks the task as failed. A delayed cleanup of the
+        in-memory state is always scheduled.
+        """
+        kb = getattr(kb_helper, "kb", None)
+        kb_name = getattr(kb, "kb_name", "knowledge base")
+        try:
+            self._init_task(task_id, status="processing")
+            self.upload_progress[task_id] = dict(
+                status="processing",
+                file_index=0,
+                file_total=1,
+                file_name=kb_name,
+                stage="rebuilding",
+                current=0,
+                total=100,
+            )
+            await self._persist_progress(task_id)
+
+            on_progress = self._make_progress_callback(task_id, 0, kb_name)
+            summary = await kb_helper.rebuild_all_documents(
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                batch_size=batch_size,
+                tasks_limit=tasks_limit,
+                max_retries=max_retries,
+                progress_callback=on_progress,
+            )
+            result = {"task_id": task_id, **summary}
+
+            succeeded = int(result.get("success_count") or 0)
+            failed = int(result.get("failed_count") or 0)
+            task_status = self._resolve_batch_task_status(succeeded, failed)
+            task_error = self._build_batch_failure_error(
+                result.get("failed") or [],
+                success_count=succeeded,
+                action="重建",
+            )
+            # Never report a zero total, so current/total stays well defined.
+            completed_total = max(int(result.get("total") or 0), 1)
+            self._update_progress(
+                task_id,
+                status=task_status,
+                file_index=0,
+                file_name=kb_name,
+                stage="completed",
+                current=completed_total,
+                total=completed_total,
+            )
+            self._set_task_result(
+                task_id,
+                task_status,
+                result=result,
+                error=task_error,
+            )
+            progress_fields = self._get_persistent_progress_updates(task_id)
+            await self._update_persistent_task(
+                task_id,
+                status=task_status,
+                result=result,
+                error=task_error,
+                **progress_fields,
+            )
+
+        except Exception as exc:
+            logger.error(f"后台重建知识库任务 {task_id} 失败: {exc}")
+            logger.error(traceback.format_exc())
+            failure = str(exc)
+            self._set_task_result(task_id, "failed", error=failure)
+            progress_fields = self._get_persistent_progress_updates(task_id)
+            await self._update_persistent_task(
+                task_id,
+                status="failed",
+                error=failure,
+                **progress_fields,
+            )
+        finally:
+            asyncio.create_task(self._schedule_delayed_cleanup(task_id))
+
+    async def _background_rebuild_documents_task(
+        self,
+        task_id: str,
+        kb_helper,
+        doc_ids: list[str],
+        chunk_size: int | None,
+        chunk_overlap: int | None,
+        batch_size: int,
+        tasks_limit: int,
+        max_retries: int,
+    ) -> None:
+        """Rebuild a selected set of documents in the background.
+
+        Marks the task as processing, calls ``kb_helper.rebuild_documents``
+        with a progress callback, derives the final status and error message
+        from the returned counters, and stores the outcome in memory and in
+        the persistent task record. Any exception marks the task as failed.
+        A delayed cleanup of the in-memory state is always scheduled.
+        """
+        selected = len(doc_ids)
+        # Use at least 1 so the initial progress never has a zero total.
+        total = max(selected, 1)
+        task_name = f"{selected} selected documents"
+        try:
+            self._init_task(task_id, status="processing")
+            self.upload_progress[task_id] = dict(
+                status="processing",
+                file_index=0,
+                file_total=total,
+                file_name=task_name,
+                stage="rebuilding",
+                current=0,
+                total=total,
+            )
+            await self._persist_progress(task_id)
+
+            on_progress = self._make_progress_callback(task_id, 0, task_name)
+            summary = await kb_helper.rebuild_documents(
+                doc_ids,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                batch_size=batch_size,
+                tasks_limit=tasks_limit,
+                max_retries=max_retries,
+                progress_callback=on_progress,
+            )
+            result = {"task_id": task_id, **summary}
+
+            succeeded = int(result.get("success_count") or 0)
+            failed = int(result.get("failed_count") or 0)
+            task_status = self._resolve_batch_task_status(succeeded, failed)
+            task_error = self._build_batch_failure_error(
+                result.get("failed") or [],
+                success_count=succeeded,
+                action="重建",
+            )
+            completed_total = max(int(result.get("total") or 0), 1)
+            self._update_progress(
+                task_id,
+                status=task_status,
+                file_index=0,
+                file_name=task_name,
+                stage="completed",
+                current=completed_total,
+                total=completed_total,
+            )
+            self._set_task_result(
+                task_id,
+                task_status,
+                result=result,
+                error=task_error,
+            )
+            progress_fields = self._get_persistent_progress_updates(task_id)
+            await self._update_persistent_task(
+                task_id,
+                status=task_status,
+                result=result,
+                error=task_error,
+                **progress_fields,
+            )
+
+        except Exception as exc:
+            logger.error(f"后台批量重建文档任务 {task_id} 失败: {exc}")
+            logger.error(traceback.format_exc())
+            failure = str(exc)
+            self._set_task_result(task_id, "failed", error=failure)
+            progress_fields = self._get_persistent_progress_updates(task_id)
+            await self._update_persistent_task(
+                task_id,
+                status="failed",
+                error=failure,
+                **progress_fields,
+            )
+        finally:
+            asyncio.create_task(self._schedule_delayed_cleanup(task_id))
async def list_kbs(self):
"""获取知识库列表
Query 参数:
- page: 页码 (默认 1)
- - page_size: 每页数量 (默认 20)
+ - page_size: 每页数量
- refresh_stats: 是否刷新统计信息 (默认 false,首次加载时可设为 true)
"""
try:
kb_manager = self._get_kb_manager()
- page = request.args.get("page", 1, type=int)
- page_size = request.args.get("page_size", 20, type=int)
+ page = self._get_positive_query_int("page", 1)
+ page_size = self._get_positive_query_int(
+ "page_size",
+ DEFAULT_KB_PAGE_SIZE,
+ )
+ refresh_stats = request.args.get("refresh_stats") == "true"
+ kb_db = self._get_kb_db()
kbs = await kb_manager.list_kbs()
+ total = len(kbs)
+ start = (page - 1) * page_size
+ paged_kbs = kbs[start : start + page_size]
# 转换为字典列表
kb_list = []
- for kb in kbs:
+ for kb in paged_kbs:
kb_dict = kb.model_dump()
+ if refresh_stats and kb_db and hasattr(kb_db, "get_kb_stats"):
+ stats = await kb_db.get_kb_stats(kb.kb_id)
+ if stats:
+ kb_dict.update(stats)
# include init_error from KBHelper if present
kb_helper = await kb_manager.get_kb(kb.kb_id)
if kb_helper and kb_helper.init_error:
@@ -338,7 +922,14 @@ async def list_kbs(self):
return (
Response()
- .ok({"items": kb_list, "page": page, "page_size": page_size})
+ .ok(
+ {
+ "items": kb_list,
+ "page": page,
+ "page_size": page_size,
+ "total": total,
+ },
+ )
.__dict__
)
except ValueError as e:
@@ -374,11 +965,40 @@ async def create_kb(self):
emoji = data.get("emoji")
embedding_provider_id = data.get("embedding_provider_id")
rerank_provider_id = data.get("rerank_provider_id")
- chunk_size = data.get("chunk_size")
- chunk_overlap = data.get("chunk_overlap")
- top_k_dense = data.get("top_k_dense")
- top_k_sparse = data.get("top_k_sparse")
- top_m_final = data.get("top_m_final")
+ chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size")
+ chunk_overlap = self._coerce_optional_int(
+ data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ top_k_dense = self._coerce_optional_int(
+ data.get("top_k_dense"),
+ "top_k_dense",
+ )
+ top_k_sparse = self._coerce_optional_int(
+ data.get("top_k_sparse"),
+ "top_k_sparse",
+ )
+ top_m_final = self._coerce_optional_int(
+ data.get("top_m_final"),
+ "top_m_final",
+ )
+ index_type = data.get("index_type")
+ self._validate_kb_options(
+ chunk_size=chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE,
+ chunk_overlap=chunk_overlap
+ if chunk_overlap is not None
+ else DEFAULT_CHUNK_OVERLAP,
+ top_k_dense=top_k_dense
+ if top_k_dense is not None
+ else DEFAULT_TOP_K_DENSE,
+ top_k_sparse=top_k_sparse
+ if top_k_sparse is not None
+ else DEFAULT_TOP_K_SPARSE,
+ top_m_final=top_m_final
+ if top_m_final is not None
+ else DEFAULT_TOP_M_FINAL,
+ index_type=index_type if index_type is not None else DEFAULT_INDEX_TYPE,
+ )
# pre-check embedding dim
if not embedding_provider_id:
@@ -433,6 +1053,7 @@ async def create_kb(self):
top_k_dense=top_k_dense,
top_k_sparse=top_k_sparse,
top_m_final=top_m_final,
+ index_type=index_type,
)
kb = kb_helper.kb
@@ -495,34 +1116,72 @@ async def update_kb(self):
if not kb_id:
return Response().error("缺少参数 kb_id").__dict__
+ update_fields = [
+ "kb_name",
+ "description",
+ "emoji",
+ "embedding_provider_id",
+ "rerank_provider_id",
+ "chunk_size",
+ "chunk_overlap",
+ "top_k_dense",
+ "top_k_sparse",
+ "top_m_final",
+ "index_type",
+ ]
+ if not any(field in data for field in update_fields):
+ return Response().error("至少需要提供一个更新字段").__dict__
+
kb_name = data.get("kb_name")
description = data.get("description")
emoji = data.get("emoji")
embedding_provider_id = data.get("embedding_provider_id")
- rerank_provider_id = data.get("rerank_provider_id")
- chunk_size = data.get("chunk_size")
- chunk_overlap = data.get("chunk_overlap")
- top_k_dense = data.get("top_k_dense")
- top_k_sparse = data.get("top_k_sparse")
- top_m_final = data.get("top_m_final")
-
- # 检查是否至少提供了一个更新字段
- if all(
- v is None
- for v in [
- kb_name,
- description,
- emoji,
- embedding_provider_id,
- rerank_provider_id,
- chunk_size,
- chunk_overlap,
- top_k_dense,
- top_k_sparse,
- top_m_final,
- ]
- ):
- return Response().error("至少需要提供一个更新字段").__dict__
+ rerank_provider_provided = "rerank_provider_id" in data
+ rerank_provider_id = (
+ data.get("rerank_provider_id") if rerank_provider_provided else None
+ )
+ chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size")
+ chunk_overlap = self._coerce_optional_int(
+ data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ top_k_dense = self._coerce_optional_int(
+ data.get("top_k_dense"),
+ "top_k_dense",
+ )
+ top_k_sparse = self._coerce_optional_int(
+ data.get("top_k_sparse"),
+ "top_k_sparse",
+ )
+ top_m_final = self._coerce_optional_int(
+ data.get("top_m_final"),
+ "top_m_final",
+ )
+ index_type = data.get("index_type")
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+ current_kb = kb_helper.kb
+ self._validate_kb_options(
+ chunk_size=chunk_size
+ if chunk_size is not None
+ else current_kb.chunk_size,
+ chunk_overlap=chunk_overlap
+ if chunk_overlap is not None
+ else current_kb.chunk_overlap,
+ top_k_dense=top_k_dense
+ if top_k_dense is not None
+ else current_kb.top_k_dense,
+ top_k_sparse=top_k_sparse
+ if top_k_sparse is not None
+ else current_kb.top_k_sparse,
+ top_m_final=top_m_final
+ if top_m_final is not None
+ else current_kb.top_m_final,
+ index_type=index_type
+ if index_type is not None
+ else current_kb.index_type,
+ )
kb_helper = await kb_manager.update_kb(
kb_id=kb_id,
@@ -530,12 +1189,17 @@ async def update_kb(self):
description=description,
emoji=emoji,
embedding_provider_id=embedding_provider_id,
- rerank_provider_id=rerank_provider_id,
+ **(
+ {"rerank_provider_id": rerank_provider_id}
+ if rerank_provider_provided
+ else {}
+ ),
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
top_k_dense=top_k_dense,
top_k_sparse=top_k_sparse,
top_m_final=top_m_final,
+ index_type=index_type,
)
if not kb_helper:
@@ -594,12 +1258,28 @@ async def get_kb_stats(self):
if not kb_helper:
return Response().error("知识库不存在").__dict__
kb = kb_helper.kb
+ kb_db = self._get_kb_db()
+ if kb_db and hasattr(kb_db, "get_kb_stats"):
+ stats = await kb_db.get_kb_stats(kb_id)
+ if stats is not None:
+ return Response().ok(stats).__dict__
stats = {
"kb_id": kb.kb_id,
"kb_name": kb.kb_name,
"doc_count": kb.doc_count,
"chunk_count": kb.chunk_count,
+ "document_count": kb.doc_count,
+ "ready_document_count": kb.doc_count,
+ "failed_document_count": 0,
+ "pending_document_count": 0,
+ "processing_document_count": 0,
+ "indexed_chunk_count": kb.chunk_count,
+ "document_chunk_count": kb.chunk_count,
+ "media_count": 0,
+ "source_file_count": 0,
+ "storage_bytes": 0,
+ "status_counts": {"ready": kb.doc_count},
"created_at": kb.created_at.isoformat(),
"updated_at": kb.updated_at.isoformat(),
}
@@ -613,38 +1293,121 @@ async def get_kb_stats(self):
logger.error(traceback.format_exc())
return Response().error(f"获取知识库统计失败: {e!s}").__dict__
- # ===== 文档管理 API =====
-
- async def list_documents(self):
- """获取文档列表
-
- Query 参数:
- - kb_id: 知识库 ID (必填)
- - page: 页码 (默认 1)
- - page_size: 每页数量 (默认 20)
- """
+    async def check_kb_consistency(self):
+        """Check consistency across metadata, source files, and indexed chunks.
+
+        Reads the required ``kb_id`` query parameter, looks up the knowledge
+        base helper, and returns the report produced by
+        ``kb_helper.check_consistency()`` in an ``ok`` response. A missing
+        ``kb_id``, an unknown knowledge base, or any raised exception
+        produces an ``error`` response instead.
+        """
         try:
             kb_manager = self._get_kb_manager()
             kb_id = request.args.get("kb_id")
             if not kb_id:
                 return Response().error("缺少参数 kb_id").__dict__
+
             kb_helper = await kb_manager.get_kb(kb_id)
             if not kb_helper:
                 return Response().error("知识库不存在").__dict__
-            page = request.args.get("page", 1, type=int)
-            page_size = request.args.get("page_size", 100, type=int)
+            report = await kb_helper.check_consistency()
+            return Response().ok(report).__dict__
-            offset = (page - 1) * page_size
-            limit = page_size
+        except ValueError as e:
+            return Response().error(str(e)).__dict__
+        except Exception as e:
+            logger.error(f"检查知识库一致性失败: {e}")
+            logger.error(traceback.format_exc())
+            return Response().error(f"检查知识库一致性失败: {e!s}").__dict__
- doc_list = await kb_helper.list_documents(offset=offset, limit=limit)
+    async def repair_kb_consistency(self):
+        """Repair low-risk consistency issues for a knowledge base.
+
+        Expects a JSON object with a required ``kb_id`` and an optional
+        ``repair_types`` list. Returns the report from
+        ``kb_helper.repair_consistency`` in an ``ok`` response, or an
+        ``error`` response when a parameter is missing or malformed, the
+        knowledge base does not exist, or an exception is raised.
+        """
+        try:
+            kb_manager = self._get_kb_manager()
+            payload = await request.json
+            # A missing or non-object JSON body would make ``.get`` raise
+            # AttributeError; treat it as an empty object so the caller gets
+            # the regular missing-parameter error instead.
+            data = payload if isinstance(payload, dict) else {}
+
+            kb_id = data.get("kb_id")
+            if not kb_id:
+                return Response().error("缺少参数 kb_id").__dict__
+
+            repair_types = data.get("repair_types")
+            if repair_types is not None and not isinstance(repair_types, list):
+                return Response().error("repair_types 格式错误").__dict__
+
+            kb_helper = await kb_manager.get_kb(kb_id)
+            if not kb_helper:
+                return Response().error("知识库不存在").__dict__
+
+            report = await kb_helper.repair_consistency(repair_types=repair_types)
+            return Response().ok(report).__dict__
+
+        except ValueError as e:
+            return Response().error(str(e)).__dict__
+        except Exception as e:
+            logger.error(f"修复知识库一致性失败: {e}")
+            logger.error(traceback.format_exc())
+            return Response().error(f"修复知识库一致性失败: {e!s}").__dict__
+
+ # ===== 文档管理 API =====
+
+ async def list_documents(self):
+ """获取文档列表
+
+ Query 参数:
+ - kb_id: 知识库 ID (必填)
+ - page: 页码 (默认 1)
+ - page_size: 每页数量
+ """
+ try:
+ kb_manager = self._get_kb_manager()
+ kb_id = request.args.get("kb_id")
+ if not kb_id:
+ return Response().error("缺少参数 kb_id").__dict__
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+
+ page = self._get_positive_query_int("page", 1)
+ page_size = self._get_positive_query_int(
+ "page_size",
+ DEFAULT_DOCUMENT_PAGE_SIZE,
+ )
+ search = (request.args.get("search") or "").strip() or None
+ status = (request.args.get("status") or "").strip() or None
+ source_type = (request.args.get("source_type") or "").strip() or None
+ if status and status not in DOCUMENT_FILTER_STATUSES:
+ return Response().error("status 参数无效").__dict__
+ if source_type and source_type not in DOCUMENT_FILTER_SOURCE_TYPES:
+ return Response().error("source_type 参数无效").__dict__
+
+ offset = (page - 1) * page_size
+ limit = page_size
+
+ doc_list = await kb_helper.list_documents(
+ offset=offset,
+ limit=limit,
+ search=search,
+ status=status,
+ source_type=source_type,
+ )
+ total = await kb_helper.count_documents(
+ search=search,
+ status=status,
+ source_type=source_type,
+ )
+ document_count = total
+ if search is not None or status is not None or source_type is not None:
+ document_count = await kb_helper.count_documents()
doc_list = [doc.model_dump() for doc in doc_list]
return (
Response()
- .ok({"items": doc_list, "page": page, "page_size": page_size})
+ .ok(
+ {
+ "items": doc_list,
+ "page": page,
+ "page_size": page_size,
+ "total": total,
+ "filtered_total": total,
+ "document_count": document_count,
+ },
+ )
.__dict__
)
@@ -683,9 +1446,9 @@ async def upload_document(self):
kb_id = None
chunk_size = None
chunk_overlap = None
- batch_size = 32
- tasks_limit = 3
- max_retries = 3
+ batch_size = None
+ tasks_limit = None
+ max_retries = None
files_to_upload = [] # 存储待上传的文件信息列表
if content_type and "multipart/form-data" not in content_type:
@@ -696,11 +1459,46 @@ async def upload_document(self):
files = await request.files
kb_id = form_data.get("kb_id")
- chunk_size = int(form_data.get("chunk_size", 512))
- chunk_overlap = int(form_data.get("chunk_overlap", 50))
- batch_size = int(form_data.get("batch_size", 32))
- tasks_limit = int(form_data.get("tasks_limit", 3))
- max_retries = int(form_data.get("max_retries", 3))
+ chunk_size = self._coerce_optional_int(
+ form_data.get("chunk_size"),
+ "chunk_size",
+ )
+ chunk_overlap = self._coerce_optional_int(
+ form_data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ batch_size = self._coerce_optional_int(
+ form_data.get("batch_size"),
+ "batch_size",
+ )
+ tasks_limit = self._coerce_optional_int(
+ form_data.get("tasks_limit"),
+ "tasks_limit",
+ )
+ max_retries = self._coerce_optional_int(
+ form_data.get("max_retries"),
+ "max_retries",
+ )
+ chunk_size = chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE
+ chunk_overlap = (
+ chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+ )
+ batch_size = (
+ batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE
+ )
+ tasks_limit = (
+ tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT
+ )
+ max_retries = (
+ max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES
+ )
+ self._validate_upload_options(
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ )
if not kb_id:
return Response().error("缺少参数 kb_id").__dict__
@@ -716,8 +1514,10 @@ async def upload_document(self):
return Response().error("缺少文件").__dict__
# 限制文件数量
- if len(file_list) > 10:
- return Response().error("最多只能上传10个文件").__dict__
+ if len(file_list) > MAX_UPLOAD_FILES:
+ return (
+ Response().error(f"最多只能上传{MAX_UPLOAD_FILES}个文件").__dict__
+ )
# 处理每个文件
for file in file_list:
@@ -739,6 +1539,7 @@ async def upload_document(self):
file_type = (
file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""
)
+ self._validate_upload_file(file_name, len(file_content))
files_to_upload.append(
{
@@ -762,6 +1563,20 @@ async def upload_document(self):
# 初始化任务状态
self._init_task(task_id, status="pending")
+ await self._create_persistent_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type="upload",
+ status="pending",
+ progress={
+ "status": "pending",
+ "file_index": 0,
+ "file_total": len(files_to_upload),
+ "stage": "waiting",
+ "current": 0,
+ "total": 100,
+ },
+ )
# 启动后台任务
asyncio.create_task(
@@ -815,9 +1630,20 @@ def _validate_import_request(self, data: dict):
):
raise ValueError("chunks 必须是非空字符串列表")
- batch_size = data.get("batch_size", 32)
- tasks_limit = data.get("tasks_limit", 3)
- max_retries = data.get("max_retries", 3)
+ batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size")
+ tasks_limit = self._coerce_optional_int(data.get("tasks_limit"), "tasks_limit")
+ max_retries = self._coerce_optional_int(data.get("max_retries"), "max_retries")
+ batch_size = batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE
+ tasks_limit = (
+ tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT
+ )
+ max_retries = (
+ max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES
+ )
+ self._validate_positive_int(batch_size, "batch_size")
+ self._validate_positive_int(tasks_limit, "tasks_limit")
+ if max_retries < 0:
+ raise ValueError("max_retries 不能为负数")
return kb_id, documents, batch_size, tasks_limit, max_retries
async def import_documents(self):
@@ -851,6 +1677,20 @@ async def import_documents(self):
# 初始化任务状态
self._init_task(task_id, status="pending")
+ await self._create_persistent_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type="import",
+ status="pending",
+ progress={
+ "status": "pending",
+ "file_index": 0,
+ "file_total": len(documents),
+ "stage": "waiting",
+ "current": 0,
+ "total": 100,
+ },
+ )
# 启动后台任务
asyncio.create_task(
@@ -893,6 +1733,7 @@ async def get_upload_progress(self):
- pending: 任务待处理
- processing: 任务处理中
- completed: 任务完成
+ - partial_failed: 任务部分失败
- failed: 任务失败
"""
try:
@@ -902,7 +1743,27 @@ async def get_upload_progress(self):
# 检查任务是否存在
if task_id not in self.upload_tasks:
- return Response().error("找不到该任务").__dict__
+ persistent_task = await self._get_persistent_task(task_id)
+ if persistent_task is None:
+ return Response().error("找不到该任务").__dict__
+ response_data = {
+ "task_id": task_id,
+ "status": persistent_task["status"],
+ "progress_stage": persistent_task.get("progress_stage"),
+ "progress_current": persistent_task.get("progress_current", 0),
+ "progress_total": persistent_task.get("progress_total", 100),
+ }
+ if persistent_task.get("progress") is not None:
+ response_data["progress"] = persistent_task["progress"]
+ if persistent_task["status"] in (
+ "completed",
+ "partial_failed",
+ "failed",
+ ):
+ response_data["result"] = persistent_task.get("result")
+ if persistent_task["status"] in ("partial_failed", "failed"):
+ response_data["error"] = persistent_task.get("error")
+ return Response().ok(response_data).__dict__
task_info = self.upload_tasks[task_id]
status = task_info["status"]
@@ -918,17 +1779,17 @@ async def get_upload_progress(self):
response_data["progress"] = self.upload_progress[task_id]
# 如果任务完成,返回结果
- if status == "completed":
+ if status in ("completed", "partial_failed", "failed"):
response_data["result"] = task_info["result"]
- # 清理已完成的任务
- # del self.upload_tasks[task_id]
- # if task_id in self.upload_progress:
- # del self.upload_progress[task_id]
- # 如果任务失败,返回错误信息
- if status == "failed":
+ # 如果任务存在失败项,返回错误信息
+ if status in ("partial_failed", "failed"):
response_data["error"] = task_info["error"]
+ # 清理已结束的任务,释放内存
+ if status in ("completed", "partial_failed", "failed"):
+ self._cleanup_task(task_id)
+
return Response().ok(response_data).__dict__
except Exception as e:
@@ -936,6 +1797,69 @@ async def get_upload_progress(self):
logger.error(traceback.format_exc())
return Response().error(f"获取上传进度失败: {e!s}").__dict__
+    async def get_task(self):
+        """Return the persistent knowledge-base task for a ``task_id`` query.
+
+        Responds with an error when ``task_id`` is missing, when no record
+        is found, or when an exception is raised.
+        """
+        try:
+            task_id = request.args.get("task_id")
+            if not task_id:
+                return Response().error("缺少参数 task_id").__dict__
+
+            record = await self._get_persistent_task(task_id)
+            if record:
+                return Response().ok(record).__dict__
+            return Response().error("任务不存在").__dict__
+
+        except Exception as exc:
+            logger.error(f"获取知识库任务失败: {exc}")
+            logger.error(traceback.format_exc())
+            return Response().error(f"获取知识库任务失败: {exc!s}").__dict__
+
+    async def list_tasks(self):
+        """List persistent knowledge-base tasks, one page at a time.
+
+        Query parameters: ``page`` and ``page_size`` for paging, plus the
+        optional ``kb_id``, ``status`` and ``task_type`` filters (blank
+        values are treated as absent). Responds with the page of items and
+        the total number of matching tasks.
+        """
+        try:
+            kb_db = self._get_kb_db()
+            if not kb_db:
+                return Response().error("知识库数据库未初始化").__dict__
+
+            page = self._get_positive_query_int("page", 1)
+            page_size = self._get_positive_query_int(
+                "page_size",
+                DEFAULT_DOCUMENT_PAGE_SIZE,
+            )
+            # The same filters drive both the page query and the count.
+            filters = {
+                name: (request.args.get(name) or "").strip() or None
+                for name in ("kb_id", "status", "task_type")
+            }
+
+            tasks = await kb_db.list_ingestion_tasks(
+                **filters,
+                offset=(page - 1) * page_size,
+                limit=page_size,
+            )
+            total = await kb_db.count_ingestion_tasks(**filters)
+            return (
+                Response()
+                .ok(
+                    {
+                        "items": tasks,
+                        "total": total,
+                        "page": page,
+                        "page_size": page_size,
+                    },
+                )
+                .__dict__
+            )
+
+        except ValueError as e:
+            # Match the sibling handlers: validation errors (presumably
+            # raised by _get_positive_query_int for bad paging values) are
+            # returned as-is rather than logged as internal failures.
+            return Response().error(str(e)).__dict__
+        except Exception as e:
+            logger.error(f"获取知识库任务列表失败: {e}")
+            logger.error(traceback.format_exc())
+            return Response().error(f"获取知识库任务列表失败: {e!s}").__dict__
+
async def get_document(self):
"""获取文档详情
@@ -999,6 +1923,425 @@ async def delete_document(self):
logger.error(traceback.format_exc())
return Response().error(f"删除文档失败: {e!s}").__dict__
+ async def rebuild_document(self):
+ """Rebuild a single document of a knowledge base.
+
+ Body:
+ - kb_id: knowledge base ID (required)
+ - doc_id: document ID (required)
+ - chunk_size / chunk_overlap / batch_size / tasks_limit / max_retries:
+ optional integers, coerced with ``self._coerce_optional_int``
+ - background: optional boolean; when truthy the rebuild is scheduled as
+ a background task and the response only carries the new ``task_id``
+
+ Without ``background`` the rebuild is awaited and the rebuilt document
+ (``doc.model_dump()``) is returned.
+ """
+ try:
+ kb_manager = self._get_kb_manager()
+ data = await request.json
+
+ kb_id = data.get("kb_id")
+ if not kb_id:
+ return Response().error("缺少参数 kb_id").__dict__
+ doc_id = data.get("doc_id")
+ if not doc_id:
+ return Response().error("缺少参数 doc_id").__dict__
+
+ chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size")
+ chunk_overlap = self._coerce_optional_int(
+ data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size")
+ tasks_limit = self._coerce_optional_int(
+ data.get("tasks_limit"),
+ "tasks_limit",
+ )
+ max_retries = self._coerce_optional_int(
+ data.get("max_retries"),
+ "max_retries",
+ )
+ # Substitute defaults for omitted options so the combination can be validated.
+ # Note that chunk_size and chunk_overlap are forwarded below as received
+ # (possibly None), while batch_size, tasks_limit and max_retries are
+ # forwarded with their defaults applied.
+ effective_chunk_size = (
+ chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE
+ )
+ effective_chunk_overlap = (
+ chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+ )
+ effective_batch_size = (
+ batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE
+ )
+ effective_tasks_limit = (
+ tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT
+ )
+ effective_max_retries = (
+ max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES
+ )
+ self._validate_upload_options(
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ )
+ # NOTE(review): an omitted "background" presumably coerces to None, which
+ # selects the synchronous path below — confirm against _coerce_optional_bool.
+ background = self._coerce_optional_bool(
+ data.get("background"),
+ "background",
+ )
+
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+
+ if background:
+ # Register the task (self._init_task plus a persistent record) before
+ # scheduling the coroutine, then answer immediately with the task id.
+ task_id = str(uuid.uuid4())
+ self._init_task(task_id, status="pending")
+ await self._create_persistent_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type="document_rebuild",
+ status="pending",
+ progress={
+ "status": "pending",
+ "file_index": 0,
+ "file_total": 1,
+ "file_name": doc_id,
+ "stage": "waiting",
+ "current": 0,
+ "total": 100,
+ },
+ )
+ # NOTE(review): the Task returned by asyncio.create_task is not stored;
+ # the asyncio docs advise keeping a reference so the task cannot be
+ # garbage-collected mid-run — confirm a reference is held elsewhere.
+ asyncio.create_task(
+ self._background_rebuild_document_task(
+ task_id=task_id,
+ kb_helper=kb_helper,
+ doc_id=doc_id,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ ),
+ )
+ return (
+ Response()
+ .ok(
+ {
+ "task_id": task_id,
+ "doc_id": doc_id,
+ "message": (
+ "document rebuild task created, "
+ "processing in background"
+ ),
+ },
+ )
+ .__dict__
+ )
+
+ # Synchronous path: wait for the rebuild and return the resulting document.
+ doc = await kb_helper.rebuild_document(
+ doc_id,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ )
+ return Response().ok(doc.model_dump(), "重建文档成功").__dict__
+
+ except ValueError as e:
+ # ValueError is reported to the client as-is, without logging (presumably
+ # raised by the coercion/validation helpers — confirm).
+ return Response().error(str(e)).__dict__
+ except Exception as e:
+ logger.error(f"重建文档失败: {e}")
+ logger.error(traceback.format_exc())
+ return Response().error(f"重建文档失败: {e!s}").__dict__
+
+ async def rebuild_kb(self):
+ """Rebuild every document of a knowledge base.
+
+ Body:
+ - kb_id: knowledge base ID (required)
+ - chunk_size / chunk_overlap / batch_size / tasks_limit / max_retries:
+ optional integers, coerced with ``self._coerce_optional_int``
+ - background: optional boolean; when truthy the rebuild is scheduled as
+ a background task and the response only carries the new ``task_id``
+
+ Without ``background`` the call awaits
+ ``kb_helper.rebuild_all_documents`` and returns its result.
+ """
+ try:
+ kb_manager = self._get_kb_manager()
+ data = await request.json
+
+ kb_id = data.get("kb_id")
+ if not kb_id:
+ return Response().error("缺少参数 kb_id").__dict__
+
+ chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size")
+ chunk_overlap = self._coerce_optional_int(
+ data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size")
+ tasks_limit = self._coerce_optional_int(
+ data.get("tasks_limit"),
+ "tasks_limit",
+ )
+ max_retries = self._coerce_optional_int(
+ data.get("max_retries"),
+ "max_retries",
+ )
+ # Substitute defaults for omitted options so the combination can be validated.
+ # chunk_size and chunk_overlap are forwarded below as received (possibly
+ # None); the other three options are forwarded with defaults applied.
+ effective_chunk_size = (
+ chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE
+ )
+ effective_chunk_overlap = (
+ chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+ )
+ effective_batch_size = (
+ batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE
+ )
+ effective_tasks_limit = (
+ tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT
+ )
+ effective_max_retries = (
+ max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES
+ )
+ self._validate_upload_options(
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ )
+ # NOTE(review): an omitted "background" presumably coerces to None, which
+ # selects the synchronous path below — confirm against _coerce_optional_bool.
+ background = self._coerce_optional_bool(
+ data.get("background"),
+ "background",
+ )
+
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+
+ if background:
+ # Display name for the progress record; falls back to the literal
+ # "knowledge base" when kb_helper.kb or its kb_name is unavailable.
+ kb_name = getattr(
+ getattr(kb_helper, "kb", None),
+ "kb_name",
+ "knowledge base",
+ )
+ task_id = str(uuid.uuid4())
+ self._init_task(task_id, status="pending")
+ await self._create_persistent_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type="kb_rebuild",
+ status="pending",
+ progress={
+ "status": "pending",
+ "file_index": 0,
+ "file_total": 1,
+ "file_name": kb_name,
+ "stage": "waiting",
+ "current": 0,
+ "total": 100,
+ },
+ )
+ # NOTE(review): the Task returned by asyncio.create_task is not stored;
+ # the asyncio docs advise keeping a reference so the task cannot be
+ # garbage-collected mid-run — confirm a reference is held elsewhere.
+ asyncio.create_task(
+ self._background_rebuild_kb_task(
+ task_id=task_id,
+ kb_helper=kb_helper,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ ),
+ )
+ return (
+ Response()
+ .ok(
+ {
+ "task_id": task_id,
+ "kb_id": kb_id,
+ "message": (
+ "knowledge base rebuild task created, "
+ "processing in background"
+ ),
+ },
+ )
+ .__dict__
+ )
+
+ # Synchronous path: wait for the whole rebuild and return its result.
+ result = await kb_helper.rebuild_all_documents(
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ )
+ return Response().ok(result, "重建知识库完成").__dict__
+
+ except ValueError as e:
+ # ValueError is reported to the client as-is, without logging (presumably
+ # raised by the coercion/validation helpers — confirm).
+ return Response().error(str(e)).__dict__
+ except Exception as e:
+ logger.error(f"重建知识库失败: {e}")
+ logger.error(traceback.format_exc())
+ return Response().error(f"重建知识库失败: {e!s}").__dict__
+
+ async def batch_rebuild_documents(self):
+ """Start a background task to rebuild selected documents.
+
+ Body:
+ - kb_id: knowledge base ID (required)
+ - doc_ids: document ID list (required; at most
+ MAX_BATCH_REBUILD_DOCUMENTS entries after normalization)
+ - chunk_size / chunk_overlap / batch_size / tasks_limit / max_retries:
+ optional integers, coerced with ``self._coerce_optional_int``
+
+ This endpoint has no synchronous mode: it always schedules the rebuild
+ and returns the new ``task_id`` together with the normalized ID list.
+ """
+ try:
+ kb_manager = self._get_kb_manager()
+ data = await request.json
+
+ kb_id = data.get("kb_id")
+ if not kb_id:
+ return Response().error("缺少参数 kb_id").__dict__
+ doc_ids = data.get("doc_ids")
+ if not doc_ids or not isinstance(doc_ids, list):
+ return Response().error("缺少参数 doc_ids 或格式错误").__dict__
+ # Drop non-string and blank entries, strip surrounding whitespace, and
+ # de-duplicate; dict.fromkeys keeps the first occurrence of each ID in order.
+ normalized_doc_ids = list(
+ dict.fromkeys(
+ doc_id.strip()
+ for doc_id in doc_ids
+ if isinstance(doc_id, str) and doc_id.strip()
+ )
+ )
+ if not normalized_doc_ids:
+ return Response().error("缺少参数 doc_ids 或格式错误").__dict__
+ # The limit applies to the normalized list, so duplicates do not count.
+ if len(normalized_doc_ids) > MAX_BATCH_REBUILD_DOCUMENTS:
+ return (
+ Response()
+ .error(f"最多只能批量重建 {MAX_BATCH_REBUILD_DOCUMENTS} 个文档")
+ .__dict__
+ )
+
+ chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size")
+ chunk_overlap = self._coerce_optional_int(
+ data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size")
+ tasks_limit = self._coerce_optional_int(
+ data.get("tasks_limit"),
+ "tasks_limit",
+ )
+ max_retries = self._coerce_optional_int(
+ data.get("max_retries"),
+ "max_retries",
+ )
+ # Substitute defaults for omitted options so the combination can be validated.
+ # chunk_size and chunk_overlap are forwarded below as received (possibly
+ # None); the other three options are forwarded with defaults applied.
+ effective_chunk_size = (
+ chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE
+ )
+ effective_chunk_overlap = (
+ chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+ )
+ effective_batch_size = (
+ batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE
+ )
+ effective_tasks_limit = (
+ tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT
+ )
+ effective_max_retries = (
+ max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES
+ )
+ self._validate_upload_options(
+ chunk_size=effective_chunk_size,
+ chunk_overlap=effective_chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ )
+
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+
+ task_id = str(uuid.uuid4())
+ self._init_task(task_id, status="pending")
+ # Progress is tracked per document here: "total" is the number of
+ # selected documents rather than a fixed 100.
+ await self._create_persistent_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type="document_batch_rebuild",
+ status="pending",
+ progress={
+ "status": "pending",
+ "file_index": 0,
+ "file_total": len(normalized_doc_ids),
+ "file_name": f"{len(normalized_doc_ids)} selected documents",
+ "stage": "waiting",
+ "current": 0,
+ "total": len(normalized_doc_ids),
+ },
+ )
+ # NOTE(review): the Task returned by asyncio.create_task is not stored;
+ # the asyncio docs advise keeping a reference so the task cannot be
+ # garbage-collected mid-run — confirm a reference is held elsewhere.
+ asyncio.create_task(
+ self._background_rebuild_documents_task(
+ task_id=task_id,
+ kb_helper=kb_helper,
+ doc_ids=normalized_doc_ids,
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=effective_batch_size,
+ tasks_limit=effective_tasks_limit,
+ max_retries=effective_max_retries,
+ ),
+ )
+ return (
+ Response()
+ .ok(
+ {
+ "task_id": task_id,
+ "doc_ids": normalized_doc_ids,
+ "message": (
+ "document batch rebuild task created, "
+ "processing in background"
+ ),
+ },
+ )
+ .__dict__
+ )
+
+ except ValueError as e:
+ # ValueError is reported to the client as-is, without logging (presumably
+ # raised by the coercion/validation helpers — confirm).
+ return Response().error(str(e)).__dict__
+ except Exception as e:
+ logger.error(f"批量重建文档失败: {e}")
+ logger.error(traceback.format_exc())
+ return Response().error(f"批量重建文档失败: {e!s}").__dict__
+
+ async def batch_delete_documents(self):
+ """Delete several documents from a knowledge base in one request.
+
+ Body:
+ - kb_id: knowledge base ID (required)
+ - doc_ids: document ID list (required; at most
+ MAX_BATCH_DELETE_DOCUMENTS entries after normalization)
+
+ ``doc_ids`` is normalized the same way ``batch_rebuild_documents`` does
+ it: non-string and blank entries are dropped, surrounding whitespace is
+ stripped, and duplicates are removed while keeping first-seen order.
+
+ The response data contains ``results`` (the mapping returned by
+ ``kb_helper.delete_documents``), ``total``, ``success_count`` and
+ ``failed_count``.
+ """
+ try:
+ kb_manager = self._get_kb_manager()
+ data = await request.json
+
+ kb_id = data.get("kb_id")
+ if not kb_id:
+ return Response().error("缺少参数 kb_id").__dict__
+ doc_ids = data.get("doc_ids")
+ if not doc_ids or not isinstance(doc_ids, list):
+ return Response().error("缺少参数 doc_ids 或格式错误").__dict__
+ # De-duplicate before counting: successes are counted from the result
+ # mapping, so a repeated ID in the raw list would otherwise be reported
+ # as a failure and would also count toward the batch limit.
+ normalized_doc_ids = list(
+ dict.fromkeys(
+ doc_id.strip()
+ for doc_id in doc_ids
+ if isinstance(doc_id, str) and doc_id.strip()
+ )
+ )
+ if not normalized_doc_ids:
+ return Response().error("缺少参数 doc_ids 或格式错误").__dict__
+ if len(normalized_doc_ids) > MAX_BATCH_DELETE_DOCUMENTS:
+ return (
+ Response()
+ .error(f"最多只能批量删除 {MAX_BATCH_DELETE_DOCUMENTS} 个文档")
+ .__dict__
+ )
+
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+
+ results = await kb_helper.delete_documents(normalized_doc_ids)
+
+ # A truthy value in the result mapping marks a successful deletion;
+ # everything else that was requested is counted as failed.
+ success_count = sum(1 for v in results.values() if v)
+ failed_count = len(normalized_doc_ids) - success_count
+
+ return (
+ Response()
+ .ok(
+ {
+ "results": results,
+ "total": len(normalized_doc_ids),
+ "success_count": success_count,
+ "failed_count": failed_count,
+ },
+ "批量删除完成",
+ )
+ .__dict__
+ )
+
+ except ValueError as e:
+ # ValueError is reported to the client as-is, without logging.
+ return Response().error(str(e)).__dict__
+ except Exception as e:
+ logger.error(f"批量删除文档失败: {e}")
+ logger.error(traceback.format_exc())
+ return Response().error(f"批量删除文档失败: {e!s}").__dict__
+
async def delete_chunk(self):
"""删除文本块
@@ -1040,14 +2383,18 @@ async def list_chunks(self):
Query 参数:
- kb_id: 知识库 ID (必填)
- page: 页码 (默认 1)
- - page_size: 每页数量 (默认 20)
+ - page_size: 每页数量
"""
try:
kb_manager = self._get_kb_manager()
kb_id = request.args.get("kb_id")
doc_id = request.args.get("doc_id")
- page = request.args.get("page", 1, type=int)
- page_size = request.args.get("page_size", 100, type=int)
+ page = self._get_positive_query_int("page", 1)
+ page_size = self._get_positive_query_int(
+ "page_size",
+ DEFAULT_CHUNK_PAGE_SIZE,
+ )
+ search = (request.args.get("search") or "").strip() or None
if not kb_id:
return Response().error("缺少参数 kb_id").__dict__
if not doc_id:
@@ -1057,11 +2404,15 @@ async def list_chunks(self):
limit = page_size
if not kb_helper:
return Response().error("知识库不存在").__dict__
- chunk_list = await kb_helper.get_chunks_by_doc_id(
+ chunk_list, total = await kb_helper.search_chunks_by_doc_id(
doc_id=doc_id,
+ search=search,
offset=offset,
limit=limit,
)
+ document_chunk_count = total
+ if search is not None:
+ document_chunk_count = await kb_helper.get_chunk_count_by_doc_id(doc_id)
return (
Response()
.ok(
@@ -1069,7 +2420,9 @@ async def list_chunks(self):
"items": chunk_list,
"page": page,
"page_size": page_size,
- "total": await kb_helper.get_chunk_count_by_doc_id(doc_id),
+ "total": total,
+ "filtered_total": total,
+ "document_chunk_count": document_chunk_count,
},
)
.__dict__
@@ -1081,6 +2434,41 @@ async def list_chunks(self):
logger.error(traceback.format_exc())
return Response().error(f"获取块列表失败: {e!s}").__dict__
+ async def get_chunk_context(self):
+ """Return a chunk together with its adjacent context chunks.
+
+ Query parameters:
+ - kb_id: knowledge base ID (required)
+ - doc_id: document ID (required)
+ - chunk_id: chunk ID (required)
+
+ The lookup itself is delegated to ``kb_helper.get_chunk_context``; its
+ return value is passed through unchanged as the response data.
+ """
+ try:
+ kb_manager = self._get_kb_manager()
+ kb_id = request.args.get("kb_id")
+ doc_id = request.args.get("doc_id")
+ chunk_id = request.args.get("chunk_id")
+ if not kb_id:
+ return Response().error("缺少参数 kb_id").__dict__
+ if not doc_id:
+ return Response().error("缺少参数 doc_id").__dict__
+ if not chunk_id:
+ return Response().error("缺少参数 chunk_id").__dict__
+
+ kb_helper = await kb_manager.get_kb(kb_id)
+ if not kb_helper:
+ return Response().error("知识库不存在").__dict__
+ context = await kb_helper.get_chunk_context(
+ chunk_id=chunk_id,
+ doc_id=doc_id,
+ )
+ return Response().ok(data=context).__dict__
+ except ValueError as e:
+ # ValueError is reported to the client as-is, without logging (presumably
+ # raised by kb_helper.get_chunk_context for an unknown chunk — confirm).
+ return Response().error(str(e)).__dict__
+ except Exception as e:
+ logger.error(f"获取文本块上下文失败: {e}")
+ logger.error(traceback.format_exc())
+ return Response().error(f"获取文本块上下文失败: {e!s}").__dict__
+
# ===== 检索 API =====
async def retrieve(self):
@@ -1097,20 +2485,35 @@ async def retrieve(self):
data = await request.json
query = data.get("query")
+ kb_ids = data.get("kb_ids")
kb_names = data.get("kb_names")
- debug = data.get("debug", False)
+ debug = self._coerce_optional_bool(data.get("debug", False), "debug")
+ trace = self._coerce_optional_bool(data.get("trace", False), "trace")
if not query:
return Response().error("缺少参数 query").__dict__
- if not kb_names or not isinstance(kb_names, list):
- return Response().error("缺少参数 kb_names 或格式错误").__dict__
-
- top_k = data.get("top_k", 5)
+ if kb_ids is not None and not isinstance(kb_ids, list):
+ return Response().error("参数 kb_ids 格式错误").__dict__
+ if kb_names is not None and not isinstance(kb_names, list):
+ return Response().error("参数 kb_names 格式错误").__dict__
+ if not kb_ids and not kb_names:
+ return Response().error("缺少参数 kb_ids 或 kb_names").__dict__
+
+ top_k = self._coerce_optional_int(
+ data.get("top_k", DEFAULT_TOP_M_FINAL),
+ "top_k",
+ )
+ top_k = top_k if top_k is not None else DEFAULT_TOP_M_FINAL
+ self._validate_positive_int(top_k, "top_k")
+ if top_k > MAX_RETRIEVE_TOP_K:
+ return Response().error(f"top_k 不能大于 {MAX_RETRIEVE_TOP_K}").__dict__
results = await kb_manager.retrieve(
query=query,
kb_names=kb_names,
+ kb_ids=kb_ids,
top_m_final=top_k,
+ include_trace=trace or debug,
)
result_list = []
if results:
@@ -1121,13 +2524,21 @@ async def retrieve(self):
"total": len(result_list),
"query": query,
}
+ if results and "trace" in results:
+ response_data["trace"] = results["trace"]
# Debug 模式:生成 t-SNE 可视化
if debug:
try:
+ visualization_kb_names = kb_names
+ if not visualization_kb_names and kb_ids:
+ visualization_kb_names = []
+ for kb_id in kb_ids:
+ if kb_helper := await kb_manager.get_kb(kb_id):
+ visualization_kb_names.append(kb_helper.kb.kb_name)
img_base64 = await generate_tsne_visualization(
query,
- kb_names,
+ visualization_kb_names or [],
kb_manager,
)
if img_base64:
@@ -1173,11 +2584,40 @@ async def upload_document_from_url(self):
if not url:
return Response().error("缺少参数 url").__dict__
- chunk_size = data.get("chunk_size", 512)
- chunk_overlap = data.get("chunk_overlap", 50)
- batch_size = data.get("batch_size", 32)
- tasks_limit = data.get("tasks_limit", 3)
- max_retries = data.get("max_retries", 3)
+ chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size")
+ chunk_overlap = self._coerce_optional_int(
+ data.get("chunk_overlap"),
+ "chunk_overlap",
+ )
+ batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size")
+ tasks_limit = self._coerce_optional_int(
+ data.get("tasks_limit"),
+ "tasks_limit",
+ )
+ max_retries = self._coerce_optional_int(
+ data.get("max_retries"),
+ "max_retries",
+ )
+ chunk_size = chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE
+ chunk_overlap = (
+ chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+ )
+ batch_size = (
+ batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE
+ )
+ tasks_limit = (
+ tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT
+ )
+ max_retries = (
+ max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES
+ )
+ self._validate_upload_options(
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ batch_size=batch_size,
+ tasks_limit=tasks_limit,
+ max_retries=max_retries,
+ )
enable_cleaning = data.get("enable_cleaning", False)
cleaning_provider_id = data.get("cleaning_provider_id")
@@ -1191,6 +2631,21 @@ async def upload_document_from_url(self):
# 初始化任务状态
self._init_task(task_id, status="pending")
+ await self._create_persistent_task(
+ task_id=task_id,
+ kb_id=kb_id,
+ task_type="url",
+ status="pending",
+ progress={
+ "status": "pending",
+ "file_index": 0,
+ "file_total": 1,
+ "file_name": f"URL: {url}",
+ "stage": "waiting",
+ "current": 0,
+ "total": 100,
+ },
+ )
# 启动后台任务
asyncio.create_task(
@@ -1253,6 +2708,7 @@ async def _background_upload_from_url_task(
"current": 0,
"total": 100,
}
+ await self._persist_progress(task_id)
# 创建进度回调函数
progress_callback = self._make_progress_callback(task_id, 0, f"URL: {url}")
@@ -1281,8 +2737,23 @@ async def _background_upload_from_url_task(
}
self._set_task_result(task_id, "completed", result=result)
+ await self._update_persistent_task(
+ task_id,
+ status="completed",
+ result=result,
+ error=None,
+ **self._get_persistent_progress_updates(task_id),
+ )
except Exception as e:
logger.error(f"后台上传URL任务 {task_id} 失败: {e}")
logger.error(traceback.format_exc())
self._set_task_result(task_id, "failed", error=str(e))
+ await self._update_persistent_task(
+ task_id,
+ status="failed",
+ error=str(e),
+ **self._get_persistent_progress_updates(task_id),
+ )
+ finally:
+ asyncio.create_task(self._schedule_delayed_cleanup(task_id))
diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json
index 78a00669e3..d8df20cc68 100644
--- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json
+++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json
@@ -21,36 +21,183 @@
"stats": "Statistics",
"docCount": "Documents",
"chunkCount": "Chunks",
+ "readyDocCount": "Ready Documents",
+ "failedDocCount": "Failed Documents",
+ "sourceFiles": "Source Files",
+ "storageUsed": "Storage Used",
"embeddingModel": "Embedding Model",
"rerankModel": "Rerank Model",
"notSet": "Not Set"
},
+ "consistency": {
+ "title": "Index Consistency",
+ "run": "Run Check",
+ "repair": "Repair Fixable Issues",
+ "notRun": "No consistency check has been run yet. Run a check to compare document metadata, source files, and indexed chunks.",
+ "notRunHint": "A full check reads index metadata and lists fixable issues.",
+ "notRunChunkMismatch": "Current snapshot has {metadata} document chunks but {indexed} indexed chunks. Run a check.",
+ "notRunFailedDocs": "{count} documents are failed. Review the document list or run a consistency check.",
+ "healthy": "No consistency issues found",
+ "unhealthy": "{count} consistency issues found",
+ "checkedAt": "Checked at: {time}",
+ "sqliteDocuments": "Metadata Documents",
+ "indexedChunks": "Indexed Chunks",
+ "documentChunks": "Document Chunks",
+ "sourceFiles": "Source Files",
+ "expectedChunks": "{count} expected chunks",
+ "actualChunks": "{count} actual chunks",
+ "checkSuccessHealthy": "Consistency check completed with no issues",
+ "checkSuccessUnhealthy": "Consistency check completed with {count} issues",
+ "checkFailed": "Consistency check failed",
+ "repairSuccess": "Consistency repair completed: {repaired} repaired, {skipped} skipped",
+ "repairPartialSuccess": "Consistency repair partially completed: {repaired} repaired, {skipped} skipped, {failed} failed",
+ "repairFailed": "Consistency repair failed",
+ "issues": {
+ "missingVectors": "Documents Missing Indexed Chunks",
+ "orphanVectors": "Orphan Indexed Chunks",
+ "missingSourceFiles": "Missing Source Files",
+ "chunkCountMismatches": "Chunk Count Mismatches",
+ "invalidVectorMetadata": "Invalid Index Metadata",
+ "unsafeSourcePaths": "Unsafe Source Paths"
+ },
+ "reasons": {
+ "empty_file_path": "Source file path is empty",
+ "outside_kb_files_dir": "Source file path is outside the knowledge base directory",
+ "not_found": "Source file does not exist"
+ }
+ },
+ "maintenance": {
+ "rebuild": "Rebuild Index",
+ "rebuildStarted": "Knowledge base rebuild started",
+ "rebuildSuccess": "Knowledge base rebuild completed",
+ "rebuildFailed": "Failed to rebuild knowledge base",
+ "rebuildFailedWithReason": "Failed to rebuild knowledge base: {reason}",
+ "rebuildPartialSuccess": "Knowledge base rebuild partially completed: {success} succeeded, {failed} failed",
+ "unknownError": "Unknown error",
+ "stages": {
+ "waiting": "Waiting...",
+ "rebuilding": "Rebuilding knowledge base...",
+ "parsing": "Parsing document...",
+ "chunking": "Chunking text...",
+ "embedding": "Generating embeddings...",
+ "completed": "Completed"
+ }
+ },
+ "tasks": {
+ "title": "Recent Tasks",
+ "refresh": "Refresh tasks",
+ "empty": "No task records yet",
+ "loadFailed": "Failed to load recent tasks",
+ "recentFailures": "Recent failures",
+ "noErrorMessage": "No error message",
+ "resultSummary": "{total} total, {success} succeeded, {failed} failed",
+ "progressDetail": "Progress {progress}",
+ "types": {
+ "upload": "Document Upload",
+ "import": "Document Import",
+ "url": "URL Import",
+ "document_rebuild": "Document Rebuild",
+ "document_batch_rebuild": "Batch Document Rebuild",
+ "kb_rebuild": "Knowledge Base Rebuild"
+ },
+ "statuses": {
+ "pending": "Pending",
+ "processing": "Processing",
+ "completed": "Completed",
+ "partial_failed": "Partially failed",
+ "failed": "Failed"
+ }
+ },
"documents": {
"title": "Documents",
"upload": "Upload Document",
"empty": "No documents",
+ "searchPlaceholder": "Search documents...",
+ "statusFilter": "Status",
+ "sourceFilter": "Source",
+ "allStatuses": "All Statuses",
+ "allSources": "All Sources",
+ "filteredCount": "Showing {filtered} / {total} documents",
"name": "Name",
"type": "Type",
+ "status": "Status",
"size": "Size",
"chunks": "Chunks",
"createdAt": "Uploaded At",
"actions": "Actions",
"view": "View",
+ "copyFailure": "Copy Failure Diagnostics",
+ "rebuild": "Retry Rebuild",
"delete": "Delete",
+ "rebuildTitle": "Rebuild Document Index",
+ "rebuildConfirm": "Rebuild the index for document '{name}'?",
+ "rebuildWarning": "Rebuild will parse and write the index again. The previous index may still be used until the task finishes.",
+ "batchRebuild": "Rebuild Selected ({count})",
+ "batchRebuildTitle": "Rebuild Selected Documents",
+ "batchRebuildConfirm": "Rebuild the index for the {count} selected documents?",
+ "batchRebuildMore": "{count} more",
+ "batchRebuildWarning": "Batch rebuild will parse and write indexes for the selected documents again. Previous indexes may still be used until the task finishes.",
+ "batchDelete": "Delete Selected ({count})",
+ "batchDeleteTitle": "Delete Selected Documents",
+ "batchDeleteConfirm": "Delete the {count} selected documents?",
+ "batchDeleteMore": "{count} more",
+ "cancel": "Cancel",
"deleteConfirm": "Are you sure you want to delete document '{name}'?",
"deleteWarning": "This will delete the document and all its chunks. This action cannot be undone.",
"uploading": "Uploading...",
"uploadSuccess": "Document uploaded successfully",
"uploadFailed": "Failed to upload document",
+ "loadFailed": "Failed to load documents",
"deleteSuccess": "Document deleted successfully",
- "deleteFailed": "Failed to delete document"
+ "deleteFailed": "Failed to delete document",
+ "batchDeleteSuccess": "{count} documents deleted",
+ "batchDeletePartialSuccess": "Batch delete partially completed: {success} succeeded, {failed} failed",
+ "batchDeleteFailed": "Failed to batch delete documents",
+ "batchDeleteLimitExceeded": "You can delete up to {limit} documents at once",
+ "batchRebuildStarted": "Started rebuilding {count} documents",
+ "batchRebuildFailed": "Failed to batch rebuild documents",
+ "batchRebuildLimitExceeded": "You can rebuild up to {limit} documents at once",
+ "failureDocument": "Document",
+ "failureDocumentId": "Document ID",
+ "failureStage": "Failure Stage",
+ "failureMessage": "Error Message",
+ "unknownFailureStage": "Unknown Stage",
+ "noFailureMessage": "No error message",
+ "copyFailureSuccess": "Failure diagnostics copied",
+ "copyFailureFailed": "Failed to copy failure diagnostics",
+ "rebuildStarted": "Document rebuild started",
+ "rebuildSuccess": "Document rebuilt successfully",
+ "rebuildFailed": "Failed to rebuild document",
+ "rebuildFailedWithReason": "Failed to rebuild document: {reason}",
+ "rebuildPartialSuccess": "Document rebuild partially completed: {success} succeeded, {failed} failed",
+ "statuses": {
+ "pending": "Pending",
+ "parsing": "Parsing",
+ "chunking": "Chunking",
+ "embedding": "Indexing",
+ "ready": "Ready",
+ "failed": "Failed"
+ },
+ "sourceTypes": {
+ "file": "File",
+ "url": "URL",
+ "import": "Import"
+ }
},
"upload": {
"title": "Upload Document",
"selectFile": "Select File",
"dropzone": "Drop files here or click to select",
- "supportedFormats": "Supported formats: .txt, .md, .markdown, .rst, .adoc, .pdf, .docx, .epub, .xls, .xlsx",
- "maxSize": "Max file size: 128MB",
+ "supportedFormats": "Supported formats: {formats}",
+ "maxSize": "Max file size: {size}",
+ "maxFiles": "Upload up to {count} files",
+ "maxFilesWarning": "You can select up to {count} files",
+ "selectedFiles": "{count} files selected",
+ "clear": "Clear",
+ "someFilesRejected": "Some files were not added",
+ "unsupportedFile": "{name}: unsupported file type",
+ "fileTooLarge": "{name}: file exceeds {size}",
+ "invalidSettings": "Please check the upload settings",
"chunkSettings": "Chunk Settings",
"batchSettings": "Batch Settings",
"cleaningSettings": "Cleaning Settings",
@@ -58,15 +205,15 @@
"cleaningProvider": "Cleaning Service Provider",
"cleaningProviderHint": "Select an LLM provider to clean and summarize the extracted web page content",
"chunkSize": "Chunk Size",
- "chunkSizeHint": "Number of characters per chunk (default: 512)",
+ "chunkSizeHint": "Number of characters per chunk (default: {value})",
"chunkOverlap": "Chunk Overlap",
- "chunkOverlapHint": "Overlapping characters between chunks (default: 50)",
+ "chunkOverlapHint": "Overlapping characters between chunks (default: {value})",
"batchSize": "Batch Size",
- "batchSizeHint": "Number of chunks to process in each batch (default: 32)",
+ "batchSizeHint": "Number of chunks to process in each batch (default: {value})",
"tasksLimit": "Concurrent Tasks Limit",
- "tasksLimitHint": "Maximum number of concurrent upload tasks (default: 3)",
+ "tasksLimitHint": "Maximum number of concurrent upload tasks (default: {value})",
"maxRetries": "Max Retries",
- "maxRetriesHint": "Number of times to retry a failed upload task (default: 3)",
+ "maxRetriesHint": "Number of times to retry a failed upload task (default: {value})",
"cancel": "Cancel",
"submit": "Upload",
"fileRequired": "Please select a file to upload",
@@ -75,6 +222,27 @@
"urlPlaceholder": "Enter the URL of the web page to extract content from",
"urlRequired": "Please enter a URL",
"urlHint": "The main content will be automatically extracted from the target URL as a document. Currently supports {supported} pages. Before use, please ensure that the target web page allows crawler access.",
+ "unsupportedUrlImport": "URL import is not enabled by the backend",
+ "tavilyCheckFailed": "Failed to check web search configuration",
+ "tavilyRequired": "Tavily Key is required for this feature",
+ "configure": "Configure",
+ "tavilyConfigured": "Tavily API Key configured",
+ "backgroundUploading": "Uploading {count} files in the background...",
+ "backgroundUrlUploading": "Extracting URL content in the background...",
+ "successCount": "Successfully uploaded {count} documents",
+ "partialSuccess": "Upload finished: {success} succeeded, {failed} failed",
+ "failedWithReason": "Upload failed: {reason}",
+ "unknownError": "Unknown error",
+ "stages": {
+ "waiting": "Waiting...",
+ "extracting": "Extracting content...",
+ "cleaning": "Cleaning content...",
+ "parsing": "Parsing document...",
+ "chunking": "Chunking text...",
+ "embedding": "Generating embeddings...",
+ "rebuilding": "Rebuilding document...",
+ "completed": "Completed"
+ },
"beta": "Beta"
},
"retrieval": {
@@ -88,6 +256,14 @@
"noResults": "No results found",
"tryDifferentQuery": "Try a different query",
"settings": "Retrieval Settings",
+ "debugMode": "Debug Mode",
+ "debugModeTsne": "Debug Mode (t-SNE)",
+ "traceMode": "Retrieval Trace",
+ "cancel": "Cancel",
+ "caseNotesPlaceholder": "Example: sparse retrieval ranked too low",
+ "caseTags": "Tags",
+ "caseTagsPlaceholder": "Example: manual, retrieval-ui, bad-case",
+ "tsneVisualization": "t-SNE Visualization",
"topK": "Number of Results",
"topKHint": "Maximum number of results to return",
"enableRerank": "Enable Rerank",
@@ -97,9 +273,40 @@
"chunk": "Chunk #{index}",
"content": "Content",
"charCount": "{count} characters",
+ "traceTitle": "Retrieval Trace",
+ "traceStageCount": "{count} stages",
+ "traceHits": "{count} hits",
+ "traceDenseRank": "Dense rank #{rank}",
+ "traceSparseRank": "Sparse rank #{rank}",
+ "traceDenseScore": "Dense score",
+ "traceSparseScore": "Sparse score",
+ "traceRrfScore": "RRF score",
+ "traceRerankScore": "Rerank score",
+ "traceDuplicateOf": "Duplicate of {chunk}",
+ "traceDedupSimilarity": "Duplicate similarity {value}",
+ "sourcePage": "Page {page}",
+ "sourceSection": "Section {index}",
+ "sourceParentChunk": "Parent chunk {id}",
+ "tracePreviewEmpty": "No content preview",
+ "traceEmpty": "No candidates in this stage",
+ "unknownDocument": "Unknown document",
+ "traceStages": {
+ "dense": "Dense Recall",
+ "sparse": "Sparse Recall",
+ "fusion": "RRF Fusion",
+ "dedup": "Near-Duplicate Removal",
+ "dedup_removed": "Removed Duplicates",
+ "rerank": "Rerank",
+ "final": "Final Context"
+ },
"searchSuccess": "Search completed, found {count} results",
"searchFailed": "Search failed",
- "queryRequired": "Please enter a query"
+ "queryRequired": "Please enter a query",
+ "latestRunResults": "Latest Results",
+ "metricRecall": "Recall",
+ "metricNdcg": "nDCG",
+ "metricPrecision": "Precision",
+ "metricFirstHit": "First Hit"
},
"settings": {
"title": "Knowledge Base Settings",
@@ -113,9 +320,43 @@
"enableRerank": "Enable Rerank",
"embeddingProvider": "Embedding Provider",
"rerankProvider": "Rerank Provider",
+ "embeddingProviderHint": "The embedding model is bound to the current vector index. Create a new knowledge base to change it.",
+ "indexType": "Index Type",
+ "indexTypeHint": "Flat is exact; HNSW is better for larger knowledge bases.",
+ "indexTypes": {
+ "flat": "Flat exact index",
+ "hnsw": "HNSW approximate index"
+ },
"save": "Save Settings",
"saveSuccess": "Settings saved successfully",
"saveFailed": "Failed to save settings",
+ "providersLoadFailed": "Failed to load model providers",
"tips": "Tip: Modifying retrieval settings will affect subsequent knowledge base queries."
+ },
+ "validation": {
+ "integer": "Enter an integer",
+ "positiveInteger": "Enter an integer greater than 0",
+ "nonNegativeInteger": "Enter an integer no less than 0",
+ "overlapLessThanSize": "Chunk overlap must be less than chunk size",
+ "topKRange": "Number of results must be an integer from 1 to {max}"
+ },
+ "actions": {
+ "retry": "Retry"
+ },
+ "messages": {
+ "loadFailed": "Failed to load knowledge base details"
+ },
+ "tavily": {
+ "title": "Configure Tavily API Key",
+ "description": "A Tavily API Key is required to use web-based knowledge base features. You can get one from",
+ "officialSite": "Tavily",
+ "apiKeyLabel": "Tavily API Key",
+ "apiKeyPlaceholder": "tvly-...",
+ "cancel": "Cancel",
+ "save": "Save",
+ "keyRequired": "API Key is required",
+ "loadConfigFailed": "Failed to load current configuration",
+ "saveFailed": "Failed to save. Please check the key.",
+ "unknownSaveFailed": "Failed to save due to an unknown error"
}
}
diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json
index d3a3b65c9a..dbdca2bf67 100644
--- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json
+++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json
@@ -9,12 +9,48 @@
"chunkCount": "Chunk Count",
"createdAt": "Uploaded At"
},
+ "processing": {
+ "title": "Processing Information",
+ "status": "Status",
+ "sourceType": "Source Type",
+ "sourceUri": "Source URI",
+ "contentHash": "Content Hash",
+ "parser": "Parser",
+ "chunker": "Chunker",
+ "version": "Version",
+ "parentDocId": "Parent Document ID",
+ "indexedAt": "Indexed At",
+ "unknownStage": "Unknown Stage",
+ "noErrorMessage": "No error message",
+ "statuses": {
+ "pending": "Pending",
+ "parsing": "Parsing",
+ "chunking": "Chunking",
+ "embedding": "Indexing",
+ "ready": "Ready",
+ "failed": "Failed"
+ },
+ "sourceTypes": {
+ "file": "File",
+ "url": "URL",
+ "import": "Import",
+ "api": "API"
+ }
+ },
"chunks": {
"title": "Chunks",
+ "total": "{count} chunks",
+ "filteredTotal": "{filtered} / {total} matching chunks",
"empty": "No chunks",
"index": "Index",
"content": "Content",
+ "titlePath": "Title Path",
"charCount": "Characters",
+ "charCountValue": "{count} characters",
+ "tokenEstimate": "Estimated Tokens",
+ "tokenEstimateValue": "About {count} tokens",
+ "offset": "Offset",
+ "contentHash": "Content Hash",
"actions": "Actions",
"view": "View",
"edit": "Edit",
@@ -23,6 +59,7 @@
"search": "Search Chunks",
"searchPlaceholder": "Enter keywords to search chunks...",
"showing": "Showing",
+ "showingRange": "Showing {start} - {end} / {total} chunks",
"deleteConfirm": "Are you sure you want to delete this chunk?",
"deleteSuccess": "Chunk deleted successfully",
"deleteFailed": "Failed to delete chunk"
@@ -49,7 +86,39 @@
"index": "Index",
"content": "Content",
"charCount": "Characters",
+ "tokenEstimate": "Estimated Tokens",
+ "titlePath": "Title Path",
+ "section": "Section",
+ "pageNumber": "Page",
+ "offset": "Offset",
+ "contentHash": "Content Hash",
+ "adjacentChunks": "Adjacent Chunks",
+ "previousChunk": "Previous: {id}",
+ "nextChunk": "Next: {id}",
+ "parentChunk": "Parent Chunk",
"vecDocId": "Vector ID",
+ "context": "Adjacent Context",
+ "previous": "Previous",
+ "current": "Current",
+ "next": "Next",
+ "contextMissing": "No adjacent chunk",
"close": "Close"
+ },
+ "actions": {
+ "retry": "Retry",
+ "retryRebuild": "Retry Rebuild",
+ "retryRebuildConfirm": "Rebuild the index for this document?"
+ },
+ "messages": {
+ "loadDocumentFailed": "Failed to load document details",
+ "loadChunksFailed": "Failed to load chunks",
+ "loadChunkContextFailed": "Failed to load adjacent context",
+ "rebuildStarted": "Document rebuild started",
+ "rebuildCompleted": "Document rebuild completed",
+ "rebuildFailed": "Failed to rebuild document",
+ "rebuildFailedWithReason": "Failed to rebuild document: {reason}",
+ "focusChunkLoaded": "Opened the retrieved chunk",
+ "focusChunkFailed": "Failed to open the retrieved chunk",
+ "focusChunkNotFound": "Retrieved chunk not found"
}
}
diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json
index 67bb4d5717..960edf067c 100644
--- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json
+++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json
@@ -11,7 +11,9 @@
"documents": "Documents",
"chunks": "Chunks",
"sessionConfig": "Session Config",
- "initError": "Initialization Failed"
+ "initError": "Initialization Failed",
+ "noDescription": "No description",
+ "switchToLegacy": "Switch to legacy knowledge base"
},
"card": {
"edit": "Edit",
@@ -31,9 +33,12 @@
"rerankModelLabel": "Rerank Model (Optional)",
"providerInfo": "Provider: {id} | Dimensions: {dimensions}",
"rerankProviderInfo": "Provider: {id}",
+    "nameHint": "If you rename this knowledge base later, update any configuration that still references it by name.",
+ "embeddingModelHint": "The embedding model cannot be changed after creation. Create a new knowledge base to use another model.",
"cancel": "Cancel",
"submit": "Create",
- "nameRequired": "Please enter knowledge base name"
+    "nameRequired": "Please enter a knowledge base name",
+ "embeddingRequired": "Please select an embedding model"
},
"edit": {
"title": "Edit Knowledge Base",
@@ -63,6 +68,7 @@
"updateFailed": "Failed to update",
"deleteSuccess": "Knowledge base deleted successfully",
"deleteFailed": "Failed to delete",
- "loadError": "Failed to load knowledge base list"
+ "loadError": "Failed to load knowledge base list",
+ "providersLoadError": "Failed to load model providers"
}
}
diff --git a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json
index 5145d5c285..d42d2a3034 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json
@@ -1,121 +1,362 @@
-{
- "title": "Детали базы знаний",
- "backToList": "К списку",
- "breadcrumb": {
- "list": "Базы знаний"
+{
+ "title": "Детали базы знаний",
+ "backToList": "К списку",
+ "breadcrumb": {
+ "list": "Базы знаний"
+ },
+ "tabs": {
+ "overview": "Обзор",
+ "documents": "Документы",
+ "retrieval": "Поиск",
+ "sessions": "Сессии",
+ "settings": "Настройки"
+ },
+ "overview": {
+ "title": "Информация",
+ "name": "Название",
+ "description": "Описание",
+ "emoji": "Иконка",
+ "createdAt": "Создана",
+ "updatedAt": "Обновлена",
+ "stats": "Статистика",
+ "docCount": "Количество документов",
+ "chunkCount": "Количество фрагментов",
+ "readyDocCount": "Готовые документы",
+    "failedDocCount": "Документы с ошибками",
+ "sourceFiles": "Исходные файлы",
+ "storageUsed": "Занято места",
+ "embeddingModel": "Embedding модель",
+ "rerankModel": "Rerank модель",
+ "notSet": "не выбрано"
+ },
+ "consistency": {
+ "title": "Согласованность индекса",
+ "run": "Проверить",
+ "repair": "Исправить доступное",
+ "notRun": "Проверка еще не запускалась. Запустите ее, чтобы сравнить метаданные документов, исходные файлы и индексированные фрагменты.",
+ "notRunHint": "Полная проверка читает метаданные индекса и показывает проблемы, которые можно исправить.",
+ "notRunChunkMismatch": "В текущем снимке {metadata} фрагментов документов, а в индексе {indexed} фрагментов. Запустите проверку.",
+ "notRunFailedDocs": "Документов с ошибками: {count}. Проверьте список документов или запустите проверку согласованности.",
+ "healthy": "Проблем согласованности не найдено",
+ "unhealthy": "Найдено проблем: {count}",
+ "checkedAt": "Проверено: {time}",
+ "sqliteDocuments": "Документы в метаданных",
+ "indexedChunks": "Фрагменты в индексе",
+ "documentChunks": "Фрагменты документов",
+ "sourceFiles": "Исходные файлы",
+ "expectedChunks": "Ожидалось фрагментов: {count}",
+ "actualChunks": "Фактически фрагментов: {count}",
+ "checkSuccessHealthy": "Проверка завершена, проблем не найдено",
+ "checkSuccessUnhealthy": "Проверка завершена, найдено проблем: {count}",
+ "checkFailed": "Не удалось выполнить проверку",
+ "repairSuccess": "Исправление завершено: исправлено {repaired}, пропущено {skipped}",
+ "repairPartialSuccess": "Исправление частично завершено: исправлено {repaired}, пропущено {skipped}, ошибок {failed}",
+ "repairFailed": "Не удалось исправить согласованность",
+ "issues": {
+ "missingVectors": "У документов нет фрагментов в индексе",
+ "orphanVectors": "Фрагменты без документа",
+ "missingSourceFiles": "Нет исходных файлов",
+ "chunkCountMismatches": "Не совпадает число фрагментов",
+ "invalidVectorMetadata": "Ошибки метаданных индекса",
+ "unsafeSourcePaths": "Некорректные пути исходных файлов"
},
- "tabs": {
- "overview": "Обзор",
- "documents": "Документы",
- "retrieval": "Поиск",
- "sessions": "Сессии",
- "settings": "Настройки"
+ "reasons": {
+ "empty_file_path": "Путь к исходному файлу пуст",
+ "outside_kb_files_dir": "Путь к исходному файлу вне каталога базы знаний",
+ "not_found": "Исходный файл не найден"
+ }
+ },
+ "maintenance": {
+ "rebuild": "Переиндексировать",
+ "rebuildStarted": "Переиндексация базы знаний запущена",
+ "rebuildSuccess": "Переиндексация базы знаний завершена",
+ "rebuildFailed": "Не удалось переиндексировать базу знаний",
+ "rebuildFailedWithReason": "Не удалось переиндексировать базу знаний: {reason}",
+ "rebuildPartialSuccess": "Переиндексация частично завершена: успешно {success}, ошибок {failed}",
+ "unknownError": "Неизвестная ошибка",
+ "stages": {
+ "waiting": "Ожидание...",
+ "rebuilding": "Переиндексация базы знаний...",
+ "parsing": "Разбор документа...",
+ "chunking": "Разбиение текста...",
+ "embedding": "Генерация векторов...",
+ "completed": "Завершено"
+ }
+ },
+ "tasks": {
+ "title": "Последние задачи",
+ "refresh": "Обновить задачи",
+ "empty": "Задач пока нет",
+ "loadFailed": "Не удалось загрузить последние задачи",
+ "recentFailures": "Последние ошибки",
+ "noErrorMessage": "Нет сообщения об ошибке",
+ "resultSummary": "Всего {total}, успешно {success}, ошибок {failed}",
+ "progressDetail": "Прогресс {progress}",
+ "types": {
+ "upload": "Загрузка документа",
+ "import": "Импорт документа",
+ "url": "Импорт URL",
+ "document_rebuild": "Переиндексация документа",
+ "document_batch_rebuild": "Пакетная переиндексация документов",
+ "kb_rebuild": "Переиндексация базы знаний"
},
- "overview": {
- "title": "Информация",
- "name": "Название",
- "description": "Описание",
- "emoji": "Иконка",
- "createdAt": "Создана",
- "updatedAt": "Обновлена",
- "stats": "Статистика",
- "docCount": "Количество документов",
- "chunkCount": "Количество фрагментов",
- "embeddingModel": "Embedding модель",
- "rerankModel": "Rerank модель",
- "notSet": "не выбрано"
+ "statuses": {
+ "pending": "Ожидание",
+ "processing": "В обработке",
+ "completed": "Завершено",
+ "partial_failed": "Частичная ошибка",
+ "failed": "Ошибка"
+ }
+ },
+ "documents": {
+ "title": "Список документов",
+ "upload": "Загрузить",
+ "empty": "Документов нет",
+ "searchPlaceholder": "Поиск документов...",
+ "statusFilter": "Статус",
+ "sourceFilter": "Источник",
+ "allStatuses": "Все статусы",
+ "allSources": "Все источники",
+ "filteredCount": "Показано {filtered} / {total} документов",
+ "name": "Имя файла",
+ "type": "Тип",
+ "status": "Статус",
+ "size": "Размер",
+ "chunks": "Фрагменты",
+ "createdAt": "Дата загрузки",
+ "actions": "Действия",
+ "view": "Смотреть",
+ "copyFailure": "Копировать диагностику",
+ "rebuild": "Повторить индексацию",
+ "delete": "Удалить",
+ "rebuildTitle": "Переиндексировать документ",
+ "rebuildConfirm": "Переиндексировать документ «{name}»?",
+ "rebuildWarning": "Переиндексация повторно разберет документ и запишет индекс. До завершения задачи может использоваться прежний индекс.",
+ "batchRebuild": "Переиндексировать выбранные ({count})",
+ "batchRebuildTitle": "Переиндексировать выбранные документы",
+ "batchRebuildConfirm": "Переиндексировать выбранные документы: {count}?",
+ "batchRebuildMore": "Еще {count}",
+ "batchRebuildWarning": "Пакетная переиндексация повторно разберет выбранные документы и запишет индексы. До завершения задачи могут использоваться прежние индексы.",
+ "batchDelete": "Удалить выбранные ({count})",
+ "batchDeleteTitle": "Удалить выбранные документы",
+ "batchDeleteConfirm": "Удалить выбранные документы: {count}?",
+ "batchDeleteMore": "Еще {count}",
+ "cancel": "Отмена",
+ "deleteConfirm": "Вы уверены, что хотите удалить «{name}»?",
+ "deleteWarning": "Это удалит файл и все его фрагменты из индекса.",
+ "uploading": "Загрузка...",
+ "uploadSuccess": "Файл успешно загружен",
+ "uploadFailed": "Ошибка загрузки",
+ "loadFailed": "Не удалось загрузить документы",
+ "deleteSuccess": "Файл удален",
+ "deleteFailed": "Ошибка удаления",
+ "batchDeleteSuccess": "Удалено документов: {count}",
+ "batchDeletePartialSuccess": "Пакетное удаление частично завершено: успешно {success}, ошибок {failed}",
+ "batchDeleteFailed": "Не удалось удалить документы пакетом",
+ "batchDeleteLimitExceeded": "За один раз можно удалить не более {limit} документов",
+ "batchRebuildStarted": "Запущена переиндексация документов: {count}",
+ "batchRebuildFailed": "Не удалось переиндексировать документы пакетом",
+ "batchRebuildLimitExceeded": "За один раз можно переиндексировать не более {limit} документов",
+ "failureDocument": "Документ",
+ "failureDocumentId": "ID документа",
+ "failureStage": "Этап ошибки",
+    "failureMessage": "Сообщение об ошибке",
+ "unknownFailureStage": "Неизвестный этап",
+ "noFailureMessage": "Нет сообщения об ошибке",
+ "copyFailureSuccess": "Диагностика ошибки скопирована",
+ "copyFailureFailed": "Не удалось скопировать диагностику ошибки",
+ "rebuildStarted": "Переиндексация документа запущена",
+ "rebuildSuccess": "Документ переиндексирован",
+ "rebuildFailed": "Не удалось переиндексировать документ",
+ "rebuildFailedWithReason": "Не удалось переиндексировать документ: {reason}",
+ "rebuildPartialSuccess": "Переиндексация частично завершена: успешно {success}, ошибок {failed}",
+ "statuses": {
+ "pending": "Ожидание",
+ "parsing": "Разбор",
+ "chunking": "Фрагментация",
+ "embedding": "Индексация",
+ "ready": "Готово",
+ "failed": "Ошибка"
},
- "documents": {
- "title": "Список документов",
- "upload": "Загрузить",
- "empty": "Документов нет",
- "name": "Имя файла",
- "type": "Тип",
- "size": "Размер",
- "chunks": "Фрагменты",
- "createdAt": "Дата загрузки",
- "actions": "Действия",
- "view": "Смотреть",
- "delete": "Удалить",
- "deleteConfirm": "Вы уверены, что хотите удалить «{name}»?",
- "deleteWarning": "Это удалит файл и все его фрагменты из индекса.",
- "uploading": "Загрузка...",
- "uploadSuccess": "Файл успешно загружен",
- "uploadFailed": "Ошибка загрузки",
- "deleteSuccess": "Файл удален",
- "deleteFailed": "Ошибка удаления"
+ "sourceTypes": {
+ "file": "Файл",
+ "url": "URL",
+ "import": "Импорт"
+ }
+ },
+ "upload": {
+ "title": "Добавление контента",
+ "selectFile": "Файл",
+ "dropzone": "Нажмите или перетащите файл сюда",
+ "supportedFormats": "Форматы: {formats}",
+ "maxSize": "Максимум: {size}",
+ "maxFiles": "Можно загрузить до {count} файлов",
+ "maxFilesWarning": "Можно выбрать не более {count} файлов",
+ "selectedFiles": "Выбрано файлов: {count}",
+ "clear": "Очистить",
+ "someFilesRejected": "Некоторые файлы не добавлены",
+ "unsupportedFile": "{name}: неподдерживаемый тип файла",
+ "fileTooLarge": "{name}: файл больше {size}",
+ "invalidSettings": "Проверьте параметры загрузки",
+ "chunkSettings": "Фрагментация",
+ "batchSettings": "Пакетная обработка",
+ "cleaningSettings": "Очистка данных",
+ "enableCleaning": "Включить очистку контента",
+ "cleaningProvider": "Сервис для очистки",
+ "cleaningProviderHint": "LLM провайдер для суммаризации и извлечения смыслов из веб-страниц",
+ "chunkSize": "Размер чанка",
+ "chunkSizeHint": "Символов в блоке (по умолчанию: {value})",
+ "chunkOverlap": "Перекрытие",
+ "chunkOverlapHint": "Перекрытие между блоками (по умолчанию: {value})",
+ "batchSize": "Размер пакета",
+ "batchSizeHint": "Блоков за один запрос (по умолчанию: {value})",
+ "tasksLimit": "Лимит задач",
+ "tasksLimitHint": "Макс. параллельных потоков (по умолчанию: {value})",
+ "maxRetries": "Попытки",
+ "maxRetriesHint": "Повторов при сбое (по умолчанию: {value})",
+ "cancel": "Отмена",
+ "submit": "Загрузить",
+ "fileRequired": "Пожалуйста, выберите файл",
+ "fileUpload": "Загрузка файла",
+ "fromUrl": "Из URL",
+ "urlPlaceholder": "Ссылка на веб-страницу",
+ "urlRequired": "Введите URL",
+ "urlHint": "Контент будет автоматически извлечен со страницы. Убедитесь, что сайт разрешает доступ роботам.",
+ "unsupportedUrlImport": "Импорт из URL не включен на сервере",
+ "tavilyCheckFailed": "Не удалось проверить настройки веб-поиска",
+ "tavilyRequired": "Для этой функции нужен Tavily Key",
+ "configure": "Настроить",
+ "tavilyConfigured": "Tavily API Key сохранен",
+ "backgroundUploading": "Фоновая загрузка файлов: {count}...",
+ "backgroundUrlUploading": "Фоновое извлечение контента из URL...",
+ "successCount": "Успешно загружено документов: {count}",
+ "partialSuccess": "Загрузка завершена: успешно {success}, ошибок {failed}",
+ "failedWithReason": "Ошибка загрузки: {reason}",
+ "unknownError": "Неизвестная ошибка",
+ "stages": {
+ "waiting": "Ожидание...",
+ "extracting": "Извлечение контента...",
+ "cleaning": "Очистка контента...",
+ "parsing": "Разбор документа...",
+ "chunking": "Разбиение текста...",
+ "embedding": "Генерация векторов...",
+ "rebuilding": "Переиндексация документа...",
+ "completed": "Завершено"
},
- "upload": {
- "title": "Добавление контента",
- "selectFile": "Файл",
- "dropzone": "Нажмите или перетащите файл сюда",
- "supportedFormats": "Форматы: .txt, .md, .markdown, .rst, .adoc, .pdf, .docx, .epub, .xls, .xlsx",
- "maxSize": "Максимум: 128MB",
- "chunkSettings": "Фрагментация",
- "batchSettings": "Пакетная обработка",
- "cleaningSettings": "Очистка данных",
- "enableCleaning": "Включить очистку контента",
- "cleaningProvider": "Сервис для очистки",
- "cleaningProviderHint": "LLM провайдер для суммаризации и извлечения смыслов из веб-страниц",
- "chunkSize": "Размер чанка",
- "chunkSizeHint": "Символов в блоке (по умолчанию: 512)",
- "chunkOverlap": "Перекрытие",
- "chunkOverlapHint": "Перекрытие между блоками (по умолчанию: 50)",
- "batchSize": "Размер пакета",
- "batchSizeHint": "Блоков за один запрос (по умолчанию: 32)",
- "tasksLimit": "Лимит задач",
- "tasksLimitHint": "Макс. параллельных потоков (по умолчанию: 3)",
- "maxRetries": "Попытки",
- "maxRetriesHint": "Повторов при сбое (по умолчанию: 3)",
- "cancel": "Отмена",
- "submit": "Загрузить",
- "fileRequired": "Пожалуйста, выберите файл",
- "fileUpload": "Загрузка файла",
- "fromUrl": "Из URL",
- "urlPlaceholder": "Ссылка на веб-страницу",
- "urlRequired": "Введите URL",
- "urlHint": "Контент будет автоматически извлечен со страницы. Убедитесь, что сайт разрешает доступ роботам.",
- "beta": "Бета-версия"
+ "beta": "Бета-версия"
+ },
+ "retrieval": {
+ "title": "Поиск и проверка",
+ "subtitle": "Проверьте качество поиска (Dense & Sparse) по вашей базе знаний",
+ "query": "Тестовый запрос",
+ "queryPlaceholder": "Что вы хотите найти?",
+ "search": "Найти",
+ "searching": "Ищем...",
+ "results": "Результаты поиска",
+ "noResults": "Релевантный контент не найден",
+ "tryDifferentQuery": "Попробуйте изменить формулировку запроса",
+ "settings": "Параметры поиска",
+ "debugMode": "Режим отладки",
+ "debugModeTsne": "Режим отладки (t-SNE)",
+ "traceMode": "Трассировка поиска",
+ "cancel": "Отмена",
+ "caseNotesPlaceholder": "Например: Sparse поиск дал низкий ранг",
+ "caseTags": "Теги",
+ "caseTagsPlaceholder": "Например: manual, retrieval-ui, bad-case",
+ "tsneVisualization": "t-SNE визуализация",
+ "topK": "Количество результатов",
+ "topKHint": "Сколько фрагментов возвращать",
+ "enableRerank": "Включить Rerank",
+ "enableRerankHint": "Применить переранжирование для повышения точности",
+ "score": "Вес (Score)",
+ "document": "Документ",
+ "chunk": "Фрагмент #{index}",
+ "content": "Текст",
+ "charCount": "{count} симв.",
+ "traceTitle": "Трассировка поиска",
+ "traceStageCount": "Этапов: {count}",
+ "traceHits": "Найдено: {count}",
+ "traceDenseRank": "Dense ранг #{rank}",
+ "traceSparseRank": "Sparse ранг #{rank}",
+ "traceDenseScore": "Оценка dense",
+ "traceSparseScore": "Оценка sparse",
+ "traceRrfScore": "Оценка RRF",
+ "traceRerankScore": "Оценка rerank",
+ "traceDuplicateOf": "Дубликат {chunk}",
+ "traceDedupSimilarity": "Сходство дубля {value}",
+ "sourcePage": "Стр. {page}",
+ "sourceSection": "Раздел {index}",
+ "sourceParentChunk": "Родительский фрагмент {id}",
+ "tracePreviewEmpty": "Нет предпросмотра",
+ "traceEmpty": "На этом этапе нет кандидатов",
+ "unknownDocument": "Неизвестный документ",
+ "traceStages": {
+ "dense": "Dense поиск",
+ "sparse": "Sparse поиск",
+ "fusion": "RRF объединение",
+ "dedup": "Удаление дублей",
+ "dedup_removed": "Удаленные дубли",
+ "rerank": "Rerank",
+ "final": "Итоговый контекст"
},
- "retrieval": {
- "title": "Поиск и проверка",
- "subtitle": "Проверьте качество поиска (Dense & Sparse) по вашей базе знаний",
- "query": "Тестовый запрос",
- "queryPlaceholder": "Что вы хотите найти?",
- "search": "Найти",
- "searching": "Ищем...",
- "results": "Результаты поиска",
- "noResults": "Релевантный контент не найден",
- "tryDifferentQuery": "Попробуйте изменить формулировку запроса",
- "settings": "Параметры поиска",
- "topK": "Количество результатов",
- "topKHint": "Сколько фрагментов возвращать",
- "enableRerank": "Включить Rerank",
- "enableRerankHint": "Применить переранжирование для повышения точности",
- "score": "Вес (Score)",
- "document": "Документ",
- "chunk": "Фрагмент #{index}",
- "content": "Текст",
- "charCount": "{count} симв.",
- "searchSuccess": "Поиск завершен, найдено: {count}",
- "searchFailed": "Ошибка выполнения поиска",
- "queryRequired": "Введите поисковый запрос"
+ "searchSuccess": "Поиск завершен, найдено: {count}",
+ "searchFailed": "Ошибка выполнения поиска",
+ "queryRequired": "Введите поисковый запрос",
+ "latestRunResults": "Последние результаты",
+ "metricRecall": "Recall",
+ "metricNdcg": "nDCG",
+ "metricPrecision": "Precision",
+ "metricFirstHit": "Первое попадание"
+ },
+ "settings": {
+ "title": "Общие настройки базы",
+ "basic": "Основные",
+ "retrieval": "Поиск",
+ "chunkSize": "Размер чанка",
+ "chunkOverlap": "Перекрытие",
+ "topKDense": "Вернуть (Dense)",
+ "topKSparse": "Вернуть (Sparse)",
+ "topMFinal": "Итоговый результат",
+ "enableRerank": "Включить Rerank",
+ "embeddingProvider": "Провайдер Embedding",
+ "rerankProvider": "Провайдер Rerank",
+ "embeddingProviderHint": "Embedding модель связана с текущим векторным индексом. Для смены создайте новую базу знаний.",
+ "indexType": "Тип индекса",
+ "indexTypeHint": "Flat точнее, HNSW лучше для больших баз знаний.",
+ "indexTypes": {
+ "flat": "Flat точный индекс",
+ "hnsw": "HNSW приближенный индекс"
},
- "settings": {
- "title": "Общие настройки базы",
- "basic": "Основные",
- "retrieval": "Поиск",
- "chunkSize": "Размер чанка",
- "chunkOverlap": "Перекрытие",
- "topKDense": "Вернуть (Dense)",
- "topKSparse": "Вернуть (Sparse)",
- "topMFinal": "Итоговый результат",
- "enableRerank": "Включить Rerank",
- "embeddingProvider": "Провайдер Embedding",
- "rerankProvider": "Провайдер Rerank",
- "save": "Сохранить",
- "saveSuccess": "Настройки сохранены",
- "saveFailed": "Ошибка сохранения",
- "tips": "Внимание! Изменение этих параметров повлияет на будущую выдачу базы знаний."
- }
+ "save": "Сохранить",
+ "saveSuccess": "Настройки сохранены",
+ "saveFailed": "Ошибка сохранения",
+ "providersLoadFailed": "Не удалось загрузить провайдеры моделей",
+ "tips": "Внимание! Изменение этих параметров повлияет на будущую выдачу базы знаний."
+ },
+ "validation": {
+ "integer": "Введите целое число",
+ "positiveInteger": "Введите целое число больше 0",
+ "nonNegativeInteger": "Введите целое число не меньше 0",
+ "overlapLessThanSize": "Перекрытие должно быть меньше размера чанка",
+ "topKRange": "Количество результатов должно быть целым числом от 1 до {max}"
+ },
+ "actions": {
+ "retry": "Повторить"
+ },
+ "messages": {
+ "loadFailed": "Не удалось загрузить детали базы знаний"
+ },
+ "tavily": {
+ "title": "Настройка Tavily API Key",
+ "description": "Для веб-функций базы знаний нужен Tavily API Key. Получить его можно на",
+ "officialSite": "сайте Tavily",
+ "apiKeyLabel": "Tavily API Key",
+ "apiKeyPlaceholder": "tvly-...",
+ "cancel": "Отмена",
+ "save": "Сохранить",
+ "keyRequired": "API Key обязателен",
+ "loadConfigFailed": "Не удалось загрузить текущую конфигурацию",
+ "saveFailed": "Не удалось сохранить. Проверьте ключ.",
+ "unknownSaveFailed": "Не удалось сохранить из-за неизвестной ошибки"
+ }
}
diff --git a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json
index 7fcb30ee9f..4f391e4e93 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json
@@ -1,55 +1,124 @@
{
- "title": "Просмотр документа",
- "backToKB": "К базе знаний",
- "info": {
- "title": "Информация о документе",
- "name": "Имя файла",
- "type": "Формат",
- "size": "Размер",
- "chunkCount": "Количество фрагментов",
- "createdAt": "Загружен"
+ "title": "Просмотр документа",
+ "backToKB": "К базе знаний",
+ "info": {
+ "title": "Информация о документе",
+ "name": "Имя файла",
+ "type": "Формат",
+ "size": "Размер",
+ "chunkCount": "Количество фрагментов",
+ "createdAt": "Загружен"
+ },
+ "processing": {
+    "title": "Информация об обработке",
+ "status": "Статус",
+ "sourceType": "Тип источника",
+ "sourceUri": "Источник",
+ "contentHash": "Хэш контента",
+ "parser": "Парсер",
+ "chunker": "Разбиение",
+ "version": "Версия",
+ "parentDocId": "ID родительского документа",
+ "indexedAt": "Индексирован",
+ "unknownStage": "Неизвестный этап",
+ "noErrorMessage": "Нет сообщения об ошибке",
+ "statuses": {
+ "pending": "Ожидание",
+ "parsing": "Разбор",
+ "chunking": "Фрагментация",
+ "embedding": "Индексация",
+ "ready": "Готово",
+ "failed": "Ошибка"
},
- "chunks": {
- "title": "Фрагменты текста",
- "empty": "Фрагменты не найдены",
- "index": "Индекс",
- "content": "Текст",
- "charCount": "Символов",
- "actions": "Действия",
- "view": "Детали",
- "edit": "Изменить",
- "delete": "Удалить",
- "preview": "Обзор",
- "search": "Поиск по документу",
- "searchPlaceholder": "Найти во фрагментах...",
- "showing": "Показано",
- "deleteConfirm": "Удалить этот фрагмент?",
- "deleteSuccess": "Фрагмент удален",
- "deleteFailed": "Ошибка удаления"
- },
- "edit": {
- "title": "Редактирование фрагмента",
- "content": "Текст",
- "cancel": "Отмена",
- "save": "Сохранить",
- "saveSuccess": "Фрагмент обновлен",
- "saveFailed": "Ошибка сохранения"
- },
- "delete": {
- "title": "Удаление",
- "confirmText": "Вы уверены?",
- "warning": "Удаление фрагмента может ухудшить качество ответов AI по этой теме.",
- "cancel": "Отмена",
- "confirm": "Удалить",
- "deleteSuccess": "Удаление выполнено",
- "deleteFailed": "Ошибка удаления"
- },
- "view": {
- "title": "Детальный просмотр",
- "index": "Индекс",
- "content": "Текст",
- "charCount": "Символов",
- "vecDocId": "ID вектора",
- "close": "Закрыть"
+ "sourceTypes": {
+ "file": "Файл",
+ "url": "URL",
+ "import": "Импорт",
+ "api": "API"
}
-}
\ No newline at end of file
+ },
+ "chunks": {
+ "title": "Фрагменты текста",
+ "total": "Фрагментов: {count}",
+ "filteredTotal": "Найдено {filtered} / {total} фрагм.",
+ "empty": "Фрагменты не найдены",
+ "index": "Индекс",
+ "content": "Текст",
+ "titlePath": "Путь заголовков",
+ "charCount": "Символов",
+ "charCountValue": "{count} симв.",
+ "tokenEstimate": "Оценка токенов",
+ "tokenEstimateValue": "Около {count} ток.",
+ "offset": "Позиция",
+ "contentHash": "Хэш контента",
+ "actions": "Действия",
+ "view": "Детали",
+ "edit": "Изменить",
+ "delete": "Удалить",
+ "preview": "Обзор",
+ "search": "Поиск по документу",
+ "searchPlaceholder": "Найти во фрагментах...",
+ "showing": "Показано",
+ "showingRange": "Показано {start} - {end} / {total} фрагм.",
+ "deleteConfirm": "Удалить этот фрагмент?",
+ "deleteSuccess": "Фрагмент удален",
+ "deleteFailed": "Ошибка удаления"
+ },
+ "edit": {
+ "title": "Редактирование фрагмента",
+ "content": "Текст",
+ "cancel": "Отмена",
+ "save": "Сохранить",
+ "saveSuccess": "Фрагмент обновлен",
+ "saveFailed": "Ошибка сохранения"
+ },
+ "delete": {
+ "title": "Удаление",
+ "confirmText": "Вы уверены?",
+ "warning": "Удаление фрагмента может ухудшить качество ответов AI по этой теме.",
+ "cancel": "Отмена",
+ "confirm": "Удалить",
+ "deleteSuccess": "Удаление выполнено",
+ "deleteFailed": "Ошибка удаления"
+ },
+ "view": {
+ "title": "Детальный просмотр",
+ "index": "Индекс",
+ "content": "Текст",
+ "charCount": "Символов",
+ "tokenEstimate": "Оценка токенов",
+ "titlePath": "Путь заголовков",
+ "section": "Раздел",
+ "pageNumber": "Страница",
+ "offset": "Позиция",
+ "contentHash": "Хэш контента",
+ "adjacentChunks": "Соседние фрагменты",
+ "previousChunk": "Предыдущий: {id}",
+ "nextChunk": "Следующий: {id}",
+ "parentChunk": "Родительский фрагмент",
+ "vecDocId": "ID вектора",
+ "context": "Соседний контекст",
+ "previous": "Предыдущий",
+ "current": "Текущий",
+ "next": "Следующий",
+ "contextMissing": "Соседний фрагмент отсутствует",
+ "close": "Закрыть"
+ },
+ "actions": {
+ "retry": "Повторить",
+ "retryRebuild": "Повторить индексацию",
+ "retryRebuildConfirm": "Переиндексировать этот документ?"
+ },
+ "messages": {
+ "loadDocumentFailed": "Не удалось загрузить документ",
+ "loadChunksFailed": "Не удалось загрузить фрагменты",
+ "loadChunkContextFailed": "Не удалось загрузить соседний контекст",
+ "rebuildStarted": "Переиндексация документа запущена",
+ "rebuildCompleted": "Переиндексация документа завершена",
+ "rebuildFailed": "Не удалось переиндексировать документ",
+ "rebuildFailedWithReason": "Не удалось переиндексировать документ: {reason}",
+ "focusChunkLoaded": "Открыт найденный фрагмент",
+ "focusChunkFailed": "Не удалось открыть найденный фрагмент",
+    "focusChunkNotFound": "Фрагмент из результатов поиска не найден"
+ }
+}
diff --git a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json
index 4eb99d5f06..ca7f5e26ed 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json
@@ -1,68 +1,74 @@
{
- "title": "Управление базами знаний",
- "subtitle": "Централизованное управление всеми знаниями AstrBot",
- "list": {
- "title": "Базы знаний",
- "subtitle": "Все доступные коллекции знаний",
- "create": "Создать базу",
- "refresh": "Обновить",
- "empty": "Баз знаний пока нет",
- "loading": "Загрузка...",
- "documents": "док.",
- "chunks": "фрагм.",
- "sessionConfig": "Профиль",
- "initError": "Ошибка инициализации"
- },
- "card": {
- "edit": "Изменить",
- "delete": "Удалить",
- "open": "Открыть",
- "docCount": "Документов: {count}",
- "chunkCount": "Фрагментов: {count}"
- },
- "create": {
- "title": "Создание базы знаний",
- "nameLabel": "Название",
- "namePlaceholder": "Придумайте имя для базы",
- "descriptionLabel": "Описание",
- "descriptionPlaceholder": "Для чего нужна эта база?",
- "emojiLabel": "Иконка",
- "embeddingModelLabel": "Embedding модель",
- "rerankModelLabel": "Rerank модель (опционально)",
- "providerInfo": "Провайдер: {id} | Размерность: {dimensions}",
- "rerankProviderInfo": "Провайдер: {id}",
- "cancel": "Отмена",
- "submit": "Создать",
- "nameRequired": "Введите название базы знаний"
- },
- "edit": {
- "title": "Редактирование",
- "submit": "Сохранить"
- },
- "delete": {
- "title": "Удаление",
- "confirmText": "Вы уверены, что хотите удалить базу знаний «{name}»?",
- "warning": "Это действие необратимо. Все документы, фрагменты и настройки будут навсегда удалены.",
- "cancel": "Отмена",
- "confirm": "Удалить"
- },
- "emoji": {
- "title": "Выберите иконку",
- "close": "Закрыть",
- "categories": {
- "books": "Книги и документы",
- "emotions": "Эмоции",
- "objects": "Вещи",
- "symbols": "Символы"
- }
- },
- "messages": {
- "createSuccess": "База знаний создана",
- "createFailed": "Ошибка создания",
- "updateSuccess": "Обновлено успешно",
- "updateFailed": "Ошибка обновления",
- "deleteSuccess": "Удалено успешно",
- "deleteFailed": "Ошибка удаления",
- "loadError": "Не удалось загрузить список"
+ "title": "Управление базами знаний",
+ "subtitle": "Централизованное управление всеми знаниями AstrBot",
+ "list": {
+ "title": "Базы знаний",
+ "subtitle": "Все доступные коллекции знаний",
+ "create": "Создать базу",
+ "refresh": "Обновить",
+ "empty": "Баз знаний пока нет",
+ "loading": "Загрузка...",
+ "documents": "док.",
+ "chunks": "фрагм.",
+ "sessionConfig": "Профиль",
+ "initError": "Ошибка инициализации",
+ "noDescription": "Нет описания",
+ "switchToLegacy": "Перейти к старой базе знаний"
+ },
+ "card": {
+ "edit": "Изменить",
+ "delete": "Удалить",
+ "open": "Открыть",
+ "docCount": "Документов: {count}",
+ "chunkCount": "Фрагментов: {count}"
+ },
+ "create": {
+ "title": "Создание базы знаний",
+ "nameLabel": "Название",
+ "namePlaceholder": "Придумайте имя для базы",
+ "descriptionLabel": "Описание",
+ "descriptionPlaceholder": "Для чего нужна эта база?",
+ "emojiLabel": "Иконка",
+ "embeddingModelLabel": "Embedding модель",
+ "rerankModelLabel": "Rerank модель (опционально)",
+ "providerInfo": "Провайдер: {id} | Размерность: {dimensions}",
+ "rerankProviderInfo": "Провайдер: {id}",
+ "nameHint": "Если позже переименуете базу, обновите конфигурации, где она указана по имени.",
+ "embeddingModelHint": "Embedding модель нельзя изменить после создания. Для другой модели создайте новую базу.",
+ "cancel": "Отмена",
+ "submit": "Создать",
+ "nameRequired": "Введите название базы знаний",
+ "embeddingRequired": "Выберите embedding модель"
+ },
+ "edit": {
+ "title": "Редактирование",
+ "submit": "Сохранить"
+ },
+ "delete": {
+ "title": "Удаление",
+ "confirmText": "Вы уверены, что хотите удалить базу знаний «{name}»?",
+ "warning": "Это действие необратимо. Все документы, фрагменты и настройки будут навсегда удалены.",
+ "cancel": "Отмена",
+ "confirm": "Удалить"
+ },
+ "emoji": {
+ "title": "Выберите иконку",
+ "close": "Закрыть",
+ "categories": {
+ "books": "Книги и документы",
+ "emotions": "Эмоции",
+ "objects": "Вещи",
+ "symbols": "Символы"
}
+ },
+ "messages": {
+ "createSuccess": "База знаний создана",
+ "createFailed": "Ошибка создания",
+ "updateSuccess": "Обновлено успешно",
+ "updateFailed": "Ошибка обновления",
+ "deleteSuccess": "Удалено успешно",
+ "deleteFailed": "Ошибка удаления",
+ "loadError": "Не удалось загрузить список",
+ "providersLoadError": "Не удалось загрузить провайдеры моделей"
+ }
}
diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json
index 54bc60b7a7..987e91fa18 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json
@@ -21,36 +21,183 @@
"stats": "统计信息",
"docCount": "文档数量",
"chunkCount": "分块数量",
+ "readyDocCount": "已索引文档",
+ "failedDocCount": "失败文档",
+ "sourceFiles": "源文件",
+ "storageUsed": "存储占用",
"embeddingModel": "嵌入模型",
"rerankModel": "重排序模型",
"notSet": "未设置"
},
+ "consistency": {
+ "title": "索引一致性",
+ "run": "运行检查",
+ "repair": "修复可修复项",
+ "notRun": "尚未运行一致性检查。点击运行检查可诊断文档元数据、源文件和索引文本块是否一致。",
+ "notRunHint": "完整检查会读取索引元数据,并列出可修复项。",
+ "notRunChunkMismatch": "当前快照显示文档记录有 {metadata} 个分块,索引中有 {indexed} 个分块,建议运行检查。",
+ "notRunFailedDocs": "当前有 {count} 个失败文档,建议查看文档列表或运行一致性检查。",
+ "healthy": "未发现一致性问题",
+ "unhealthy": "发现 {count} 个一致性问题",
+ "checkedAt": "检查时间: {time}",
+ "sqliteDocuments": "元数据文档",
+ "indexedChunks": "索引分块",
+ "documentChunks": "文档分块",
+ "sourceFiles": "源文件",
+ "expectedChunks": "预期 {count} 个分块",
+ "actualChunks": "实际 {count} 个分块",
+ "checkSuccessHealthy": "一致性检查完成,未发现问题",
+ "checkSuccessUnhealthy": "一致性检查完成,发现 {count} 个问题",
+ "checkFailed": "一致性检查失败",
+ "repairSuccess": "一致性修复完成: 修复 {repaired} 项, 跳过 {skipped} 项",
+ "repairPartialSuccess": "一致性修复部分完成: 修复 {repaired} 项, 跳过 {skipped} 项, 失败 {failed} 项",
+ "repairFailed": "一致性修复失败",
+ "issues": {
+ "missingVectors": "文档缺失索引分块",
+ "orphanVectors": "孤儿索引分块",
+ "missingSourceFiles": "源文件缺失",
+ "chunkCountMismatches": "分块数量不一致",
+ "invalidVectorMetadata": "索引元数据异常",
+ "unsafeSourcePaths": "源文件路径异常"
+ },
+ "reasons": {
+ "empty_file_path": "源文件路径为空",
+ "outside_kb_files_dir": "源文件路径不在知识库目录内",
+ "not_found": "源文件不存在"
+ }
+ },
+ "maintenance": {
+ "rebuild": "重建索引",
+ "rebuildStarted": "知识库重建任务已开始",
+ "rebuildSuccess": "知识库重建完成",
+ "rebuildFailed": "知识库重建失败",
+ "rebuildFailedWithReason": "知识库重建失败: {reason}",
+ "rebuildPartialSuccess": "知识库重建部分完成: 成功 {success} 个, 失败 {failed} 个",
+ "unknownError": "未知错误",
+ "stages": {
+ "waiting": "等待中...",
+ "rebuilding": "重建知识库...",
+ "parsing": "解析文档...",
+ "chunking": "文本分块...",
+ "embedding": "生成向量...",
+ "completed": "已完成"
+ }
+ },
+ "tasks": {
+ "title": "最近任务",
+ "refresh": "刷新任务",
+ "empty": "暂无任务记录",
+ "loadFailed": "加载最近任务失败",
+ "recentFailures": "最近失败",
+ "noErrorMessage": "暂无错误信息",
+ "resultSummary": "共 {total} 个,成功 {success} 个,失败 {failed} 个",
+ "progressDetail": "进度 {progress}",
+ "types": {
+ "upload": "上传文档",
+ "import": "导入文档",
+ "url": "URL 导入",
+ "document_rebuild": "文档重建",
+ "document_batch_rebuild": "批量文档重建",
+ "kb_rebuild": "知识库重建"
+ },
+ "statuses": {
+ "pending": "等待中",
+ "processing": "处理中",
+ "completed": "已完成",
+ "partial_failed": "部分失败",
+ "failed": "失败"
+ }
+ },
"documents": {
"title": "文档列表",
"upload": "上传文档",
"empty": "暂无文档",
+ "searchPlaceholder": "搜索文档...",
+ "statusFilter": "状态",
+ "sourceFilter": "来源",
+ "allStatuses": "全部状态",
+ "allSources": "全部来源",
+ "filteredCount": "显示 {filtered} / {total} 个文档",
"name": "文档名称",
"type": "类型",
+ "status": "状态",
"size": "大小",
"chunks": "分块数",
"createdAt": "上传时间",
"actions": "操作",
"view": "查看",
+ "copyFailure": "复制失败诊断",
+ "rebuild": "重试重建",
"delete": "删除",
+ "rebuildTitle": "重建文档索引",
+ "rebuildConfirm": "确定要重新构建文档「{name}」的索引吗?",
+ "rebuildWarning": "重建会重新解析并写入索引。任务完成前,旧索引仍可能被检索到。",
+ "batchRebuild": "批量重建 ({count})",
+ "batchRebuildTitle": "批量重建文档索引",
+ "batchRebuildConfirm": "确定要重新构建选中的 {count} 个文档索引吗?",
+ "batchRebuildMore": "还有 {count} 个",
+ "batchRebuildWarning": "批量重建会为选中文档重新解析并写入索引。任务完成前,旧索引仍可能被检索到。",
+ "batchDelete": "批量删除 ({count})",
+ "batchDeleteTitle": "批量删除文档",
+ "batchDeleteConfirm": "确定要删除选中的 {count} 个文档吗?",
+ "batchDeleteMore": "还有 {count} 个",
+ "cancel": "取消",
"deleteConfirm": "确定要删除文档「{name}」吗?",
"deleteWarning": "此操作将删除文档及其所有分块,不可恢复。",
"uploading": "正在上传...",
"uploadSuccess": "文档上传成功",
"uploadFailed": "文档上传失败",
+ "loadFailed": "加载文档列表失败",
"deleteSuccess": "文档删除成功",
- "deleteFailed": "文档删除失败"
+ "deleteFailed": "文档删除失败",
+ "batchDeleteSuccess": "已删除 {count} 个文档",
+ "batchDeletePartialSuccess": "批量删除部分完成: 成功 {success} 个, 失败 {failed} 个",
+ "batchDeleteFailed": "批量删除文档失败",
+ "batchDeleteLimitExceeded": "单次最多只能删除 {limit} 个文档",
+ "batchRebuildStarted": "已开始重建 {count} 个文档",
+ "batchRebuildFailed": "批量重建文档失败",
+ "batchRebuildLimitExceeded": "单次最多只能重建 {limit} 个文档",
+ "failureDocument": "文档",
+ "failureDocumentId": "文档 ID",
+ "failureStage": "失败阶段",
+ "failureMessage": "错误信息",
+ "unknownFailureStage": "未知阶段",
+ "noFailureMessage": "暂无错误信息",
+ "copyFailureSuccess": "已复制失败诊断信息",
+ "copyFailureFailed": "复制失败诊断信息失败",
+ "rebuildStarted": "文档重建任务已开始",
+ "rebuildSuccess": "文档重建成功",
+ "rebuildFailed": "文档重建失败",
+ "rebuildFailedWithReason": "文档重建失败: {reason}",
+ "rebuildPartialSuccess": "文档重建部分成功: 成功 {success} 个, 失败 {failed} 个",
+ "statuses": {
+ "pending": "等待中",
+ "parsing": "解析中",
+ "chunking": "分块中",
+ "embedding": "索引中",
+ "ready": "已索引",
+ "failed": "失败"
+ },
+ "sourceTypes": {
+ "file": "文件",
+ "url": "URL",
+ "import": "导入"
+ }
},
"upload": {
"title": "上传文档",
"selectFile": "选择文件",
"dropzone": "拖放文件到这里或点击选择",
- "supportedFormats": "支持的格式: .txt, .md, .markdown, .rst, .adoc, .pdf, .docx, .epub, .xls, .xlsx",
- "maxSize": "最大文件大小: 128MB",
+ "supportedFormats": "支持的格式: {formats}",
+ "maxSize": "最大文件大小: {size}",
+ "maxFiles": "最多可上传 {count} 个文件",
+ "maxFilesWarning": "最多只能选择 {count} 个文件",
+ "selectedFiles": "已选择 {count} 个文件",
+ "clear": "清空",
+ "someFilesRejected": "部分文件未加入上传队列",
+ "unsupportedFile": "{name}: 不支持的文件类型",
+ "fileTooLarge": "{name}: 文件超过 {size}",
+ "invalidSettings": "请检查上传参数",
"chunkSettings": "分块设置",
"batchSettings": "批处理设置",
"cleaningSettings": "清洗设置",
@@ -58,15 +205,15 @@
"cleaningProvider": "清洗服务提供商",
"cleaningProviderHint": "选择一个 LLM 服务商来对提取的网页内容进行清洗和总结",
"chunkSize": "分块大小",
- "chunkSizeHint": "每个文本块的字符数 (默认: 512)",
+ "chunkSizeHint": "每个文本块的字符数 (默认: {value})",
"chunkOverlap": "分块重叠",
- "chunkOverlapHint": "相邻文本块之间的重叠字符数 (默认: 50)",
+ "chunkOverlapHint": "相邻文本块之间的重叠字符数 (默认: {value})",
"batchSize": "批处理大小",
- "batchSizeHint": "每批处理的文本块数量 (默认: 32)",
+ "batchSizeHint": "每批处理的文本块数量 (默认: {value})",
"tasksLimit": "并发任务限制",
- "tasksLimitHint": "最大并发上传任务数 (默认: 3)",
+ "tasksLimitHint": "最大并发上传任务数 (默认: {value})",
"maxRetries": "最大重试次数",
- "maxRetriesHint": "上传失败任务的重试次数 (默认: 3)",
+ "maxRetriesHint": "上传失败任务的重试次数 (默认: {value})",
"cancel": "取消",
"submit": "上传",
"fileRequired": "请选择要上传的文件",
@@ -75,6 +222,27 @@
"urlPlaceholder": "请输入要提取内容的网页 URL",
"urlRequired": "请输入 URL",
"urlHint": "将自动从目标 URL 提取主要内容作为文档。目前支持 {supported} 页面,请确保目标网页允许爬虫访问。",
+ "unsupportedUrlImport": "当前后端未启用 URL 导入功能",
+ "tavilyCheckFailed": "检查网页搜索配置失败",
+ "tavilyRequired": "使用此功能需要配置 Tavily Key",
+ "configure": "配置",
+ "tavilyConfigured": "Tavily API Key 配置成功",
+ "backgroundUploading": "正在后台上传 {count} 个文件...",
+ "backgroundUrlUploading": "正在从 URL 后台提取内容...",
+ "successCount": "成功上传 {count} 个文档",
+ "partialSuccess": "上传完成: {success} 个成功, {failed} 个失败",
+ "failedWithReason": "上传失败: {reason}",
+ "unknownError": "未知错误",
+ "stages": {
+ "waiting": "等待中...",
+ "extracting": "提取内容...",
+ "cleaning": "清洗内容...",
+ "parsing": "解析文档...",
+ "chunking": "文本分块...",
+ "embedding": "生成向量...",
+ "rebuilding": "重建文档...",
+ "completed": "已完成"
+ },
"beta": "测试版"
},
"retrieval": {
@@ -88,6 +256,14 @@
"noResults": "没有找到相关内容",
"tryDifferentQuery": "尝试使用不同的查询词",
"settings": "检索设置",
+ "debugMode": "调试模式",
+ "debugModeTsne": "调试模式 (t-SNE)",
+ "traceMode": "检索链路追踪",
+ "cancel": "取消",
+ "caseNotesPlaceholder": "例如:稀疏检索排名偏低",
+ "caseTags": "标签",
+ "caseTagsPlaceholder": "例如:manual, retrieval-ui, bad-case",
+ "tsneVisualization": "t-SNE 可视化",
"topK": "返回结果数量",
"topKHint": "最多返回多少条检索结果",
"enableRerank": "启用重排序",
@@ -97,9 +273,40 @@
"chunk": "文本块 #{index}",
"content": "内容",
"charCount": "{count} 字符",
+ "traceTitle": "检索链路",
+ "traceStageCount": "{count} 个阶段",
+ "traceHits": "{count} 条",
+ "traceDenseRank": "稠密排名 #{rank}",
+ "traceSparseRank": "稀疏排名 #{rank}",
+ "traceDenseScore": "稠密分",
+ "traceSparseScore": "稀疏分",
+ "traceRrfScore": "RRF 分",
+ "traceRerankScore": "重排分",
+ "traceDuplicateOf": "重复于 {chunk}",
+ "traceDedupSimilarity": "重复相似度 {value}",
+ "sourcePage": "第 {page} 页",
+ "sourceSection": "章节 {index}",
+ "sourceParentChunk": "父文本块 {id}",
+ "tracePreviewEmpty": "暂无内容预览",
+ "traceEmpty": "该阶段没有候选结果",
+ "unknownDocument": "未知文档",
+ "traceStages": {
+ "dense": "稠密召回",
+ "sparse": "稀疏召回",
+ "fusion": "RRF 融合",
+ "dedup": "近重复去除",
+ "dedup_removed": "已移除重复项",
+ "rerank": "重排序",
+ "final": "最终上下文"
+ },
"searchSuccess": "检索完成,找到 {count} 条结果",
"searchFailed": "检索失败",
- "queryRequired": "请输入检索查询"
+ "queryRequired": "请输入检索查询",
+ "latestRunResults": "最近结果",
+ "metricRecall": "召回率",
+ "metricNdcg": "归一化折损累计增益 (nDCG)",
+ "metricPrecision": "精确率",
+ "metricFirstHit": "首个命中"
},
"settings": {
"title": "知识库设置",
@@ -113,9 +320,43 @@
"enableRerank": "启用重排序",
"embeddingProvider": "嵌入模型提供商",
"rerankProvider": "重排序模型提供商",
+ "embeddingProviderHint": "嵌入模型与现有向量索引绑定,如需更换请创建新的知识库。",
+ "indexType": "索引类型",
+ "indexTypeHint": "Flat 更精确,HNSW 更适合大规模知识库。",
+ "indexTypes": {
+ "flat": "Flat 精确索引",
+ "hnsw": "HNSW 近似索引"
+ },
"save": "保存设置",
"saveSuccess": "设置保存成功",
"saveFailed": "设置保存失败",
+ "providersLoadFailed": "加载模型提供商失败",
"tips": "提示: 修改检索设置后,将影响后续的知识库查询效果。"
+ },
+ "validation": {
+ "integer": "请输入整数",
+ "positiveInteger": "请输入大于 0 的整数",
+ "nonNegativeInteger": "请输入不小于 0 的整数",
+ "overlapLessThanSize": "分块重叠必须小于分块大小",
+ "topKRange": "返回结果数量必须是 1 到 {max} 的整数"
+ },
+ "actions": {
+ "retry": "重试"
+ },
+ "messages": {
+ "loadFailed": "加载知识库详情失败"
+ },
+ "tavily": {
+ "title": "配置 Tavily API Key",
+ "description": "为了使用基于网页的知识库功能,需要提供 Tavily API Key。您可以从",
+ "officialSite": "Tavily 官网",
+ "apiKeyLabel": "Tavily API Key",
+ "apiKeyPlaceholder": "tvly-...",
+ "cancel": "取消",
+ "save": "保存",
+ "keyRequired": "API Key 不能为空",
+ "loadConfigFailed": "获取当前配置失败",
+ "saveFailed": "保存失败,请检查 Key 是否正确",
+ "unknownSaveFailed": "保存失败,发生未知错误"
}
}
diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json
index c90c29cc29..6127213d92 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json
@@ -9,12 +9,48 @@
"chunkCount": "分块数量",
"createdAt": "上传时间"
},
+ "processing": {
+ "title": "处理信息",
+ "status": "状态",
+ "sourceType": "来源类型",
+ "sourceUri": "来源地址",
+ "contentHash": "内容哈希",
+ "parser": "解析器",
+ "chunker": "分块器",
+ "version": "版本",
+ "parentDocId": "父文档 ID",
+ "indexedAt": "索引时间",
+ "unknownStage": "未知阶段",
+ "noErrorMessage": "暂无错误信息",
+ "statuses": {
+ "pending": "等待中",
+ "parsing": "解析中",
+ "chunking": "分块中",
+ "embedding": "索引中",
+ "ready": "已索引",
+ "failed": "失败"
+ },
+ "sourceTypes": {
+ "file": "文件",
+ "url": "URL",
+ "import": "导入",
+ "api": "API"
+ }
+ },
"chunks": {
"title": "分块列表",
+ "total": "{count} 个分块",
+ "filteredTotal": "匹配 {filtered} / {total} 个分块",
"empty": "暂无分块",
"index": "序号",
"content": "内容",
+ "titlePath": "标题路径",
"charCount": "字符数",
+ "charCountValue": "{count} 字符",
+ "tokenEstimate": "估算 Token",
+ "tokenEstimateValue": "约 {count} token",
+ "offset": "位置",
+ "contentHash": "内容哈希",
"actions": "操作",
"view": "查看",
"edit": "编辑",
@@ -23,6 +59,7 @@
"search": "搜索分块",
"searchPlaceholder": "输入关键词搜索分块内容...",
"showing": "显示",
+ "showingRange": "显示 {start} - {end} / {total} 个分块",
"deleteConfirm": "确定要删除该文本块吗?",
"deleteSuccess": "文本块删除成功",
"deleteFailed": "文本块删除失败"
@@ -49,7 +86,39 @@
"index": "序号",
"content": "内容",
"charCount": "字符数",
+ "tokenEstimate": "估算 Token",
+ "titlePath": "标题路径",
+ "section": "章节",
+ "pageNumber": "页码",
+ "offset": "位置",
+ "contentHash": "内容哈希",
+ "adjacentChunks": "相邻分块",
+ "previousChunk": "上一块: {id}",
+ "nextChunk": "下一块: {id}",
+ "parentChunk": "父分块",
"vecDocId": "向量ID",
+ "context": "相邻上下文",
+ "previous": "上一块",
+ "current": "当前块",
+ "next": "下一块",
+ "contextMissing": "暂无相邻分块",
"close": "关闭"
+ },
+ "actions": {
+ "retry": "重试",
+ "retryRebuild": "重试重建",
+ "retryRebuildConfirm": "确定要重新构建该文档索引吗?"
+ },
+ "messages": {
+ "loadDocumentFailed": "加载文档详情失败",
+ "loadChunksFailed": "加载分块列表失败",
+ "loadChunkContextFailed": "加载相邻上下文失败",
+ "rebuildStarted": "文档重建任务已开始",
+ "rebuildCompleted": "文档重建完成",
+ "rebuildFailed": "文档重建失败",
+ "rebuildFailedWithReason": "文档重建失败: {reason}",
+ "focusChunkLoaded": "已打开检索命中的分块",
+ "focusChunkFailed": "打开检索命中的分块失败",
+ "focusChunkNotFound": "未找到检索命中的分块"
}
}
diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json
index cac88bacd1..87d74926db 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json
@@ -11,7 +11,9 @@
"documents": "文档",
"chunks": "分块",
"sessionConfig": "会话配置",
- "initError": "初始化失败"
+ "initError": "初始化失败",
+ "noDescription": "暂无描述",
+ "switchToLegacy": "切换到旧版知识库"
},
"card": {
"edit": "编辑",
@@ -27,13 +29,16 @@
"descriptionLabel": "描述",
"descriptionPlaceholder": "简单描述这个知识库的用途...",
"emojiLabel": "图标",
- "embeddingModelLabel": "嵌入模型 (Embedding Model)",
- "rerankModelLabel": "重排序模型 (Rerank Model, 可选)",
+ "embeddingModelLabel": "嵌入模型",
+ "rerankModelLabel": "重排序模型(可选)",
"providerInfo": "提供商: {id} | 维度: {dimensions}",
"rerankProviderInfo": "提供商: {id}",
+ "nameHint": "如果后续修改知识库名称,请同步更新仍按名称引用的配置。",
+ "embeddingModelHint": "嵌入模型选择后无法修改,如需更换请创建新的知识库。",
"cancel": "取消",
"submit": "创建",
- "nameRequired": "请输入知识库名称"
+ "nameRequired": "请输入知识库名称",
+ "embeddingRequired": "请选择嵌入模型"
},
"edit": {
"title": "编辑知识库",
@@ -63,6 +68,7 @@
"updateFailed": "更新失败",
"deleteSuccess": "知识库删除成功",
"deleteFailed": "删除失败",
- "loadError": "加载知识库列表失败"
+ "loadError": "加载知识库列表失败",
+ "providersLoadError": "加载模型提供商失败"
}
}
diff --git a/dashboard/src/main.ts b/dashboard/src/main.ts
index ce5514207c..eb2f15c205 100644
--- a/dashboard/src/main.ts
+++ b/dashboard/src/main.ts
@@ -2,7 +2,7 @@ import { createApp } from 'vue';
import { createPinia } from 'pinia';
import App from './App.vue';
import { router } from './router';
-import vuetify from './plugins/vuetify';
+import vuetify, { getVuetifyLocale } from './plugins/vuetify';
import confirmPlugin from './plugins/confirmPlugin';
import { setupI18n } from './i18n/composables';
import '@/scss/style.scss';
@@ -47,12 +47,18 @@ import { waitForRouterReadyInBackground } from './utils/routerReadiness.mjs';
},
};
+const syncVuetifyLocale = (event: Event) => {
+ const locale = (event as CustomEvent<{ locale?: string }>).detail?.locale;
+ vuetify.locale.current.value = getVuetifyLocale(locale);
+};
+
// 初始化新的i18n系统,等待完成后再挂载应用
setupI18n().then(async () => {
console.log('🌍 新i18n系统初始化完成');
-
+
const app = createApp(App);
const pinia = createPinia();
+ window.addEventListener('astrbot-locale-changed', syncVuetifyLocale);
app.use(pinia);
app.use(router);
app.use(print);
@@ -86,6 +92,7 @@ setupI18n().then(async () => {
// 即使i18n初始化失败,也要挂载应用(使用回退机制)
const app = createApp(App);
const pinia = createPinia();
+ window.addEventListener('astrbot-locale-changed', syncVuetifyLocale);
app.use(pinia);
app.use(router);
app.use(print);
diff --git a/dashboard/src/plugins/vuetify.ts b/dashboard/src/plugins/vuetify.ts
index e38fd388e6..474f1ca02c 100644
--- a/dashboard/src/plugins/vuetify.ts
+++ b/dashboard/src/plugins/vuetify.ts
@@ -1,32 +1,91 @@
import { createVuetify } from 'vuetify';
+import { en, ru, zhHans } from 'vuetify/locale';
import '@/assets/mdi-subset/materialdesignicons-subset.css';
import * as components from 'vuetify/components';
import * as directives from 'vuetify/directives';
import { PurpleTheme } from '@/theme/LightTheme';
-import { PurpleThemeDark } from "@/theme/DarkTheme";
+import { PurpleThemeDark } from '@/theme/DarkTheme';
+
+const zhHansMessages = {
+ ...zhHans,
+ open: '打开',
+ dismiss: '关闭',
+ dataFooter: {
+ ...zhHans.dataFooter,
+ itemsPerPageText: '每页条数:',
+ firstPage: '第一页',
+ lastPage: '最后一页',
+ },
+ input: {
+ ...zhHans.input,
+ clear: '清空 {0}',
+ prependAction: '{0} 前置操作',
+ appendAction: '{0} 后置操作',
+ otp: '请输入第 {0} 位验证码',
+ },
+ pagination: {
+ ...zhHans.pagination,
+ ariaLabel: {
+ ...zhHans.pagination.ariaLabel,
+ first: '第一页',
+ last: '最后一页',
+ },
+ },
+ stepper: {
+ next: '下一步',
+ prev: '上一步',
+ },
+ loading: '加载中...',
+};
+
+const vuetifyLocaleMap: Record<string, string> = {
+ 'zh-CN': 'zhHans',
+ 'en-US': 'en',
+ 'ru-RU': 'ru',
+};
+
+export const getVuetifyLocale = (locale?: string | null) => {
+ if (!locale) {
+ return 'zhHans';
+ }
+ return vuetifyLocaleMap[locale] || 'zhHans';
+};
export default createVuetify({
components,
directives,
+ locale: {
+ locale: getVuetifyLocale(
+ typeof localStorage === 'undefined'
+ ? null
+ : localStorage.getItem('astrbot-locale'),
+ ),
+ fallback: 'en',
+ messages: {
+ en,
+ ru,
+ zhHans: zhHansMessages,
+ },
+ },
theme: {
defaultTheme: 'PurpleTheme',
themes: {
PurpleTheme,
- PurpleThemeDark
- }
+ PurpleThemeDark,
+ },
},
defaults: {
VBtn: {},
VCard: {
- rounded: 'lg'
+ rounded: 'lg',
},
VTextField: {
- rounded: 'lg'
+ rounded: 'lg',
},
VTooltip: {
// set v-tooltip default location to top
- location: 'top'
- }
- }
+ location: 'top',
+ },
+ },
});
diff --git a/dashboard/src/views/knowledge-base/DocumentDetail.vue b/dashboard/src/views/knowledge-base/DocumentDetail.vue
index 921315e627..212c848e09 100644
--- a/dashboard/src/views/knowledge-base/DocumentDetail.vue
+++ b/dashboard/src/views/knowledge-base/DocumentDetail.vue
@@ -9,7 +9,9 @@
/>
@@ -18,18 +20,29 @@
+
+
+ {{ loadError }}
+
+ {{ t("actions.retry") }}
+
+
+
+
- {{ t('info.title') }}
+ {{ t("info.title") }}
mdi-label
-
{{ t('info.name') }}
+
+ {{ t("info.name") }}
+
{{ document.doc_name }}
@@ -40,8 +53,10 @@
{{ getFileIcon(document.file_type) }}
-
{{ t('info.type') }}
-
{{ document.file_type || '-' }}
+
+ {{ t("info.type") }}
+
+
{{ document.file_type || "-" }}
@@ -49,8 +64,12 @@
mdi-file-chart
-
{{ t('info.size') }}
-
{{ formatFileSize(document.file_size) }}
+
+ {{ t("info.size") }}
+
+
+ {{ formatFileSize(document.file_size) }}
+
@@ -58,7 +77,9 @@
mdi-text-box
-
{{ t('info.chunkCount') }}
+
+ {{ t("info.chunkCount") }}
+
{{ document.chunk_count || 0 }}
@@ -67,8 +88,12 @@
mdi-calendar
-
{{ t('info.createdAt') }}
-
{{ formatDate(document.created_at) }}
+
+ {{ t("info.createdAt") }}
+
+
+ {{ formatDate(document.created_at) }}
+
@@ -76,88 +101,325 @@
+
+ {{ t("processing.title") }}
+
+
+
+
+
+ {{ getDocumentStatusIcon(document.status) }}
+
+
+
+ {{ t("processing.status") }}
+
+
+ {{ getDocumentStatusText(document.status) }}
+
+
+
+
+
+
+
mdi-source-branch
+
+
+ {{ t("processing.sourceType") }}
+
+
+ {{ getSourceTypeText(document.source_type) }}
+
+
+
+
+
+
+
mdi-counter
+
+
+ {{ t("processing.version") }}
+
+
+ {{ document.version || 1 }}
+
+
+
+
+
+
+
mdi-calendar-check
+
+
+ {{ t("processing.indexedAt") }}
+
+
+ {{ formatDate(document.indexed_at) }}
+
+
+
+
+
+
+
+
+
+
+
+
+
mdi-file-cog-outline
+
+
+
+
+
+
mdi-text-box-check-outline
+
+
+
+
+
+
mdi-file-replace-outline
+
+
+
+
+
+
+
+
+ {{ t("actions.retryRebuild") }}
+
+
+
+
+
+
-
- {{ t('chunks.title') }}
-
- {{ totalChunks }} {{ t('chunks.title') }}
-
-
-
+ />
-
-
-
- #{{ item.chunk_index + 1 }}
-
-
-
-
-
- {{ item.content }}
-
-
+
-
+