diff --git a/astrbot/core/db/vec_db/faiss_impl/document_storage.py b/astrbot/core/db/vec_db/faiss_impl/document_storage.py index 58ec8dc1c5..84069ba52f 100644 --- a/astrbot/core/db/vec_db/faiss_impl/document_storage.py +++ b/astrbot/core/db/vec_db/faiss_impl/document_storage.py @@ -1,5 +1,6 @@ import json import os +from asyncio import Lock from contextlib import asynccontextmanager from datetime import datetime from pathlib import Path @@ -17,6 +18,7 @@ build_fts5_or_query, load_stopwords, to_fts5_search_text, + tokenize_text, ) FTS_TABLE_NAME = "documents_fts" @@ -58,44 +60,49 @@ def __init__(self, db_path: str) -> None: self._fts_contentless_delete = False self._fts_index_ready = False self._stopwords: set[str] | None = None + self._fts_rebuild_lock = Lock() async def initialize(self) -> None: """Initialize the SQLite database and create the documents table if it doesn't exist.""" await self.connect() async with self.engine.begin() as conn: # type: ignore await self._ensure_documents_table(conn) - - try: - await conn.execute( - text( - "ALTER TABLE documents ADD COLUMN kb_doc_id TEXT " - "GENERATED ALWAYS AS (json_extract(metadata, '$.kb_doc_id')) STORED", - ), - ) - await conn.execute( - text( - "ALTER TABLE documents ADD COLUMN user_id TEXT " - "GENERATED ALWAYS AS (json_extract(metadata, '$.user_id')) STORED", - ), - ) - - # Create indexes - await conn.execute( - text( - "CREATE INDEX IF NOT EXISTS idx_documents_kb_doc_id ON documents(kb_doc_id)", - ), - ) - await conn.execute( - text( - "CREATE INDEX IF NOT EXISTS idx_documents_user_id ON documents(user_id)", - ), - ) - except BaseException: - pass + await self._ensure_generated_columns(conn) await self._initialize_fts5(conn) await conn.commit() + async def _table_columns(self, executor, table_name: str) -> set[str]: + result = await executor.execute(text(f"PRAGMA table_xinfo({table_name})")) + return {row[1] for row in result.fetchall()} + + async def _ensure_generated_columns(self, executor) -> None: + 
generated_columns = { + "kb_doc_id": "json_extract(metadata, '$.kb_doc_id')", + "user_id": "json_extract(metadata, '$.user_id')", + "kb_id": "json_extract(metadata, '$.kb_id')", + } + columns = await self._table_columns(executor, "documents") + for column_name, expression in generated_columns.items(): + if column_name in columns: + continue + await executor.execute( + text( + f"ALTER TABLE documents ADD COLUMN {column_name} TEXT " + f"GENERATED ALWAYS AS ({expression}) VIRTUAL", + ), + ) + columns.add(column_name) + + index_statements = [ + "CREATE INDEX IF NOT EXISTS idx_documents_kb_doc_id " + "ON documents(kb_doc_id)", + "CREATE INDEX IF NOT EXISTS idx_documents_user_id ON documents(user_id)", + "CREATE INDEX IF NOT EXISTS idx_documents_kb_id ON documents(kb_id)", + ] + for statement in index_statements: + await executor.execute(text(statement)) + async def _ensure_documents_table(self, executor) -> None: """Create the document table from the SQLModel definition.""" result = await executor.execute( @@ -302,11 +309,11 @@ async def get_documents( async with self.get_session() as session: query = select(Document) - - for key, val in metadata_filters.items(): - query = query.where( - text(f"json_extract(metadata, '$.{key}') = :filter_{key}"), - ).params(**{f"filter_{key}": val}) + query = await self._apply_metadata_filters( + session, + query, + metadata_filters, + ) if ids is not None and len(ids) > 0: valid_ids = [int(i) for i in ids if i != -1] @@ -468,11 +475,11 @@ async def delete_documents(self, metadata_filters: dict) -> None: async with self.get_session() as session, session.begin(): query = select(Document) - - for key, val in metadata_filters.items(): - query = query.where( - text(f"json_extract(metadata, '$.{key}') = :filter_{key}"), - ).params(**{f"filter_{key}": val}) + query = await self._apply_metadata_filters( + session, + query, + metadata_filters, + ) result = await session.execute(query) documents = result.scalars().all() @@ -499,15 +506,144 @@ 
async def count_documents(self, metadata_filters: dict | None = None) -> int: query = select(func.count(col(Document.id))) if metadata_filters: - for key, val in metadata_filters.items(): - query = query.where( - text(f"json_extract(metadata, '$.{key}') = :filter_{key}"), - ).params(**{f"filter_{key}": val}) + query = await self._apply_metadata_filters( + session, + query, + metadata_filters, + ) result = await session.execute(query) count = result.scalar_one_or_none() return count if count is not None else 0 + async def search_documents( + self, + query_text: str, + metadata_filters: dict | None = None, + offset: int = 0, + limit: int = 100, + ) -> tuple[list[dict], int] | None: + """Search documents with FTS5 and optional metadata filters. + + Returns None when FTS5 is unavailable so callers can choose whether to + fall back to an alternate search strategy. + """ + if limit <= 0: + return [], 0 + if not await self.ensure_fts_index(): + return None + + match_query = build_fts5_or_query(tokenize_text(query_text, self.stopwords)) + if not match_query: + return [], 0 + + metadata_filters = metadata_filters or {} + async with self.get_session() as session: + filters_sql, filter_params = await self._metadata_filter_sql( + session, + metadata_filters, + table_alias="d", + ) + where_clause = f"{FTS_TABLE_NAME} MATCH :query" + if filters_sql: + where_clause = f"{where_clause} AND {' AND '.join(filters_sql)}" + params = { + "query": match_query, + "limit": int(limit), + "offset": int(offset), + **filter_params, + } + try: + count_result = await session.execute( + text( + f""" + SELECT count(*) + FROM {FTS_TABLE_NAME} + JOIN documents d ON d.id = {FTS_TABLE_NAME}.rowid + WHERE {where_clause} + """, + ), + params, + ) + total = int(count_result.scalar_one_or_none() or 0) + result = await session.execute( + text( + f""" + SELECT + d.id AS id, + d.doc_id AS doc_id, + d.text AS text, + d.metadata AS metadata, + d.created_at AS created_at, + d.updated_at AS updated_at, + 
bm25({FTS_TABLE_NAME}) AS score + FROM {FTS_TABLE_NAME} + JOIN documents d ON d.id = {FTS_TABLE_NAME}.rowid + WHERE {where_clause} + ORDER BY score ASC, d.id ASC + LIMIT :limit + OFFSET :offset + """, + ), + params, + ) + except Exception as e: + logger.warning( + f"FTS5 document search failed for {self.db_path}: {e}", + ) + self.fts5_available = False + return None + + rows = result.mappings().all() + return [ + { + "id": row["id"], + "doc_id": row["doc_id"], + "text": row["text"], + "metadata": row["metadata"], + "created_at": row["created_at"], + "updated_at": row["updated_at"], + "score": float(row["score"]), + } + for row in rows + ], total + + async def _apply_metadata_filters( + self, + session: AsyncSession, + query, + metadata_filters: dict, + ): + filters_sql, params = await self._metadata_filter_sql( + session, + metadata_filters, + ) + for filter_sql in filters_sql: + query = query.where(text(filter_sql)) + if params: + query = query.params(**params) + return query + + async def _metadata_filter_sql( + self, + session: AsyncSession, + metadata_filters: dict, + table_alias: str | None = None, + ) -> tuple[list[str], dict]: + columns = await self._table_columns(session, "documents") + prefix = f"{table_alias}." if table_alias else "" + filters_sql = [] + params = {} + for key, val in metadata_filters.items(): + if key in {"kb_id", "kb_doc_id", "user_id"} and key in columns: + filters_sql.append(f"{prefix}{key} = :filter_{key}") + else: + filters_sql.append( + f"json_extract({prefix}metadata, '$.{key}') = :filter_{key}" + ) + params[f"filter_{key}"] = val + return filters_sql, params + async def ensure_fts_index(self) -> bool: """Ensure the FTS5 sparse index exists and matches the documents table.""" if not self.fts5_available: @@ -517,22 +653,30 @@ async def ensure_fts_index(self) -> bool: assert self.engine is not None, "Database connection is not initialized." 
- async with self.get_session() as session: - doc_count = await self._count_documents_in_session(session) - fts_count = await self._count_fts_rows(session) - if doc_count == fts_count: - self._fts_index_ready = True + async with self._fts_rebuild_lock: + if self._fts_index_ready: return True - logger.info( - f"Rebuilding FTS5 sparse index for {self.db_path}: " - f"documents={doc_count}, fts_rows={fts_count}", - ) - await self.rebuild_fts_index() - return self.fts5_available + async with self.get_session() as session: + doc_count = await self._count_documents_in_session(session) + fts_count = await self._count_fts_rows(session) + if doc_count == fts_count: + self._fts_index_ready = True + return True + + logger.info( + f"Rebuilding FTS5 sparse index for {self.db_path}: " + f"documents={doc_count}, fts_rows={fts_count}", + ) + await self._rebuild_fts_index_unlocked() + return self.fts5_available async def rebuild_fts_index(self) -> None: """Rebuild the contentless FTS5 sparse index from documents.""" + async with self._fts_rebuild_lock: + await self._rebuild_fts_index_unlocked() + + async def _rebuild_fts_index_unlocked(self) -> None: if not self.fts5_available: return @@ -577,7 +721,7 @@ async def search_sparse( sparse retrieval implementation. 
""" if limit <= 0: - return [] + return None if not await self.ensure_fts_index(): return None diff --git a/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py b/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py index dc6977cf8a..d7d9479046 100644 --- a/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py +++ b/astrbot/core/db/vec_db/faiss_impl/embedding_storage.py @@ -4,21 +4,180 @@ raise ImportError( "faiss 未安装。请使用 'pip install faiss-cpu' 或 'pip install faiss-gpu' 安装。", ) +import asyncio import os +import shutil +from datetime import datetime, timezone +from pathlib import Path import numpy as np +def _safe_normalize_l2(vectors: np.ndarray) -> None: + """L2 归一化,对零向量抛出明确错误 + + 正常的 embedding 模型不应产生零向量。零向量无法归一化(会产生 NaN), + 说明 embedding provider 返回了异常数据,应当尽早暴露问题。 + """ + # 检测全零行 + if vectors.ndim == 2: + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + zero_count = int((norms < 1e-12).sum()) + if zero_count > 0: + raise ValueError( + f"向量归一化失败:检测到 {zero_count} 个零向量。" + "Embedding Provider 返回了全零向量,这可能说明 API 密钥无效、" + "模型不支持当前输入、或服务端异常。请检查 Embedding Provider 配置。" + ) + elif vectors.ndim == 1: + if np.linalg.norm(vectors) < 1e-12: + raise ValueError( + "向量归一化失败:检测到零向量。" + "Embedding Provider 返回了全零向量,这可能说明 API 密钥无效、" + "模型不支持当前输入、或服务端异常。请检查 Embedding Provider 配置。" + ) + + faiss.normalize_L2(vectors) + + class EmbeddingStorage: - def __init__(self, dimension: int, path: str | None = None) -> None: + def __init__( + self, + dimension: int, + path: str | None = None, + index_type: str = "flat", + ) -> None: self.dimension = dimension self.path = path self.index = None + self.index_type = index_type # "flat" | "hnsw" + self._write_lock = asyncio.Lock() if path and os.path.exists(path): self.index = faiss.read_index(path) + # 验证加载的索引维度是否匹配 + loaded_dim = self.index.d + if loaded_dim != self.dimension: + raise ValueError( + f"索引维度不匹配: 磁盘索引维度={loaded_dim}, " + f"当前 Embedding Provider 维度={self.dimension}。" + f"请确认 Embedding Provider 与已有索引一致," + 
f"或删除旧索引后重新创建知识库。" + ) + self._migrate_l2_to_ip_if_needed() else: - base_index = faiss.IndexFlatL2(dimension) + self.index = self._create_index() + + def _create_index(self): + """根据 index_type 创建 FAISS 索引""" + if self.index_type == "hnsw": + # HNSW32 with Inner Product metric for cosine similarity + base_index = faiss.index_factory( + self.dimension, + "HNSW32", + faiss.METRIC_INNER_PRODUCT, + ) + return faiss.IndexIDMap(base_index) + # 默认: flat (精确搜索) + return faiss.IndexIDMap(faiss.IndexFlatIP(self.dimension)) + + def _migrate_l2_to_ip_if_needed(self) -> None: + """检测并迁移旧版 L2 索引到 IP (余弦相似度) + + 旧版使用 IndexFlatL2,新版使用 IndexFlatIP + 归一化向量。 + 迁移过程:保留原 external ids → reconstruct 所有向量 → L2 归一化 → 重建为 IP 索引。 + """ + assert self.index is not None + # IndexIDMap 包装了 base index,需要解包检查 + base_index = self.index.index if hasattr(self.index, "index") else self.index + if getattr(base_index, "metric_type", None) != faiss.METRIC_L2: + return # 已经是 IP 或其他类型,无需迁移 + + import warnings + + ntotal = self.index.ntotal + if ntotal == 0: + warnings.warn( + "检测到空的旧版 L2 索引,将重建为 IP 索引。", + stacklevel=2, + ) + base_index = faiss.IndexFlatIP(self.dimension) self.index = faiss.IndexIDMap(base_index) + return + + warnings.warn( + f"检测到旧版 L2 索引 (含 {ntotal} 个向量),正在自动迁移到 IP 索引..." 
+ "这可能需要几秒钟。迁移后旧索引将被覆盖。", + stacklevel=2, + ) + + # 重建所有向量并归一化 + # 注意: IndexIDMap.reconstruct 在某些 FAISS 构建版本中不可用 + try: + ids = self._get_index_ids() + vectors = np.zeros((ntotal, self.dimension), dtype=np.float32) + reconstruct_index = ( + self.index.index if hasattr(self.index, "index") else self.index + ) + for pos in range(ntotal): + vectors[pos] = reconstruct_index.reconstruct(pos) + except Exception as exc: + raise RuntimeError( + "无法从旧索引重建向量(reconstruct 不可用)," + "已保留旧索引文件未覆盖。请重新上传文档或手动重建知识库索引。" + ) from exc + + _safe_normalize_l2(vectors) + + # 重建为 IP 索引 + new_index = faiss.IndexIDMap(faiss.IndexFlatIP(self.dimension)) + new_index.add_with_ids(vectors, ids) + + self._backup_existing_index_before_migration() + self.index = new_index + # 立即保存迁移后的索引 + faiss.write_index(self.index, self.path) + + def _backup_existing_index_before_migration(self) -> Path: + if self.path is None: + raise RuntimeError("无法备份旧索引:索引文件路径为空,已保留旧索引未覆盖。") + + index_path = Path(self.path) + if not index_path.exists(): + raise RuntimeError( + f"无法备份旧索引:索引文件不存在 {index_path},已保留旧索引未覆盖。" + ) + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + backup_path = index_path.with_name(f"{index_path.name}.bak.{timestamp}") + counter = 1 + while backup_path.exists(): + backup_path = index_path.with_name( + f"{index_path.name}.bak.{timestamp}.{counter}" + ) + counter += 1 + + try: + shutil.copy2(index_path, backup_path) + except OSError as exc: + raise RuntimeError( + f"无法备份旧索引到 {backup_path},已保留旧索引未覆盖。" + ) from exc + + return backup_path + + def _get_index_ids(self) -> np.ndarray: + assert self.index is not None + ntotal = self.index.ntotal + id_map = getattr(self.index, "id_map", None) + if id_map is None: + return np.arange(ntotal, dtype=np.int64) + + ids = faiss.vector_to_array(id_map).astype(np.int64) + if len(ids) != ntotal: + raise RuntimeError( + f"FAISS IDMap 数量异常: ntotal={ntotal}, id_map={len(ids)}", + ) + return ids async def insert(self, vector: np.ndarray, id: int) -> 
None: """插入向量 @@ -30,13 +189,16 @@ async def insert(self, vector: np.ndarray, id: int) -> None: ValueError: 如果向量的维度与存储的维度不匹配 """ - assert self.index is not None, "FAISS index is not initialized." - if vector.shape[0] != self.dimension: - raise ValueError( - f"向量维度不匹配, 期望: {self.dimension}, 实际: {vector.shape[0]}", - ) - self.index.add_with_ids(vector.reshape(1, -1), np.array([id])) - await self.save_index() + async with self._write_lock: + assert self.index is not None, "FAISS index is not initialized." + if vector.shape[0] != self.dimension: + raise ValueError( + f"向量维度不匹配, 期望: {self.dimension}, 实际: {vector.shape[0]}", + ) + v_2d = vector.reshape(1, -1) + _safe_normalize_l2(v_2d) + self.index.add_with_ids(v_2d, np.array([id])) + await self._save_index_locked() async def insert_batch(self, vectors: np.ndarray, ids: list[int]) -> None: """批量插入向量 @@ -48,13 +210,15 @@ async def insert_batch(self, vectors: np.ndarray, ids: list[int]) -> None: ValueError: 如果向量的维度与存储的维度不匹配 """ - assert self.index is not None, "FAISS index is not initialized." - if vectors.shape[1] != self.dimension: - raise ValueError( - f"向量维度不匹配, 期望: {self.dimension}, 实际: {vectors.shape[1]}", - ) - self.index.add_with_ids(vectors, np.array(ids)) - await self.save_index() + async with self._write_lock: + assert self.index is not None, "FAISS index is not initialized." + if vectors.shape[1] != self.dimension: + raise ValueError( + f"向量维度不匹配, 期望: {self.dimension}, 实际: {vectors.shape[1]}", + ) + _safe_normalize_l2(vectors) + self.index.add_with_ids(vectors, np.array(ids)) + await self._save_index_locked() async def search(self, vector: np.ndarray, k: int) -> tuple: """搜索最相似的向量 @@ -67,7 +231,7 @@ async def search(self, vector: np.ndarray, k: int) -> tuple: """ assert self.index is not None, "FAISS index is not initialized." 
- faiss.normalize_L2(vector) + _safe_normalize_l2(vector) distances, indices = self.index.search(vector, k) return distances, indices @@ -78,18 +242,25 @@ async def delete(self, ids: list[int]) -> None: ids (list[int]): 要删除的向量ID列表 """ - assert self.index is not None, "FAISS index is not initialized." - id_array = np.array(ids, dtype=np.int64) - self.index.remove_ids(id_array) - await self.save_index() - - async def save_index(self) -> None: - """保存索引 + async with self._write_lock: + assert self.index is not None, "FAISS index is not initialized." + id_array = np.array(ids, dtype=np.int64) + self.index.remove_ids(id_array) + await self._save_index_locked() - Args: - path (str): 保存索引的路径 + async def _save_index_locked(self) -> None: + """内部方法:在已持有 _write_lock 的情况下保存索引到磁盘。 + 调用者必须已经获取 _write_lock。 """ if self.index is None: return - faiss.write_index(self.index, self.path) + await asyncio.to_thread(faiss.write_index, self.index, self.path) + + async def save_index(self) -> None: + """保存索引(在单独线程中执行以避免阻塞事件循环) + + 公共方法,自动获取写锁以确保线程安全。 + """ + async with self._write_lock: + await self._save_index_locked() diff --git a/astrbot/core/db/vec_db/faiss_impl/vec_db.py b/astrbot/core/db/vec_db/faiss_impl/vec_db.py index 0474683754..1cafd1c45d 100644 --- a/astrbot/core/db/vec_db/faiss_impl/vec_db.py +++ b/astrbot/core/db/vec_db/faiss_impl/vec_db.py @@ -1,5 +1,7 @@ import time import uuid +from collections import OrderedDict +from hashlib import sha256 import numpy as np @@ -12,6 +14,50 @@ from .embedding_storage import EmbeddingStorage +class EmbeddingCache: + """基于 LRU 的文本 → 嵌入向量缓存(线程安全) + + 使用 SHA256 哈希文本作为缓存 key,避免对相同内容重复调用 embedding API。 + """ + + def __init__(self, max_size: int = 10000) -> None: + import asyncio + + self._cache: OrderedDict[str, np.ndarray] = OrderedDict() + self._max_size = max_size + self._lock = asyncio.Lock() + + @staticmethod + def _hash(text: str) -> str: + return sha256(text.encode()).hexdigest() + + async def get(self, text: str) -> np.ndarray | None: 
+ async with self._lock: + key = self._hash(text) + if key in self._cache: + self._cache.move_to_end(key) + return self._cache[key].copy() + return None + + async def put(self, text: str, embedding: np.ndarray) -> None: + async with self._lock: + key = self._hash(text) + if key not in self._cache: + if len(self._cache) >= self._max_size: + self._cache.popitem(last=False) + else: + self._cache.move_to_end(key) + self._cache[key] = embedding.copy() + + async def clear(self) -> None: + async with self._lock: + self._cache.clear() + + async def __len__(self) -> int: + async with self._lock: + return len(self._cache) + + class FaissVecDB(BaseVecDB): """A class to represent a vector database.""" @@ -21,6 +67,7 @@ def __init__( index_store_path: str, embedding_provider: EmbeddingProvider, rerank_provider: RerankProvider | None = None, + index_type: str = "flat", ) -> None: self.doc_store_path = doc_store_path self.index_store_path = index_store_path @@ -29,9 +76,11 @@ def __init__( self.embedding_storage = EmbeddingStorage( embedding_provider.get_dim(), index_store_path, + index_type=index_type, ) self.embedding_provider = embedding_provider self.rerank_provider = rerank_provider + self.embedding_cache = EmbeddingCache() async def initialize(self) -> None: await self.document_storage.initialize() @@ -81,6 +130,9 @@ async def insert_batch( ) return [] + # 空列表快速返回后,确保不再处理零向量 + assert len(contents) > 0, "contents must not be empty" + content_count = len(contents) if len(metadatas) != content_count: raise KnowledgeBaseUploadError( @@ -107,54 +159,65 @@ async def insert_batch( }, ) + # 检查嵌入缓存,分离已缓存的文本和需要计算的文本 start = time.time() - logger.debug(f"Generating embeddings for {len(contents)} contents...") - vectors = await self.embedding_provider.get_embeddings_batch( - contents, - batch_size=batch_size, - tasks_limit=tasks_limit, - max_retries=max_retries, - progress_callback=progress_callback, - ) - end = time.time() + cached_vectors: dict[int, np.ndarray] = {} + 
uncached_indices: list[int] = [] + uncached_texts: list[str] = [] + + for idx, text in enumerate(contents): + cached = await self.embedding_cache.get(text) + if cached is not None: + cached_vectors[idx] = cached + else: + uncached_indices.append(idx) + uncached_texts.append(text) + + cache_hits = len(cached_vectors) + cache_misses = len(uncached_texts) logger.debug( - f"Generated embeddings for {len(contents)} contents in {end - start:.2f} seconds.", + f"Embedding cache: {cache_hits} hits, {cache_misses} misses " + f"out of {len(contents)} contents.", ) - if len(vectors) != content_count: - raise KnowledgeBaseUploadError( - stage="embedding", - user_message=( - "向量化失败:嵌入模型返回的向量数量与文本分块数量不一致" - f"(期望 {content_count},实际 {len(vectors)})。" - "这通常说明当前 Embedding 接口未完整返回批量结果," - "或该服务不兼容当前批量请求格式。" - ), - details={ - "expected_contents": content_count, - "actual_vectors": len(vectors), - }, + + # 只对未缓存的文本生成嵌入 + vectors = [np.empty(0, dtype=np.float32) for _ in contents] + if uncached_texts: + new_embeddings = await self.embedding_provider.get_embeddings_batch( + uncached_texts, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, ) + # 验证返回数量 + if len(new_embeddings) != len(uncached_texts): + raise KnowledgeBaseUploadError( + stage="embedding", + user_message=( + "向量化失败:嵌入模型返回的向量数量与文本分块数量不一致" + f"(期望 {len(uncached_texts)},实际 {len(new_embeddings)})。" + "这通常说明当前 Embedding 接口未完整返回批量结果," + "或该服务不兼容当前批量请求格式。" + ), + details={ + "expected_contents": len(uncached_texts), + "actual_vectors": len(new_embeddings), + }, + ) + for i, idx in enumerate(uncached_indices): + vectors[idx] = np.asarray(new_embeddings[i], dtype=np.float32) + await self.embedding_cache.put(uncached_texts[i], vectors[idx]) + + for idx, cached_vec in cached_vectors.items(): + vectors[idx] = cached_vec - # 使用 DocumentStorage 的批量插入方法 - int_ids = await self.document_storage.insert_documents_batch( - ids, - contents, - metadatas, + end = time.time() + 
logger.debug( + f"Embeddings ready for {len(contents)} contents " + f"in {end - start:.2f}s (cached: {cache_hits}, fresh: {cache_misses}).", ) - if len(int_ids) != content_count: - raise KnowledgeBaseUploadError( - stage="storage", - user_message=( - f"存储失败:写入文档索引后返回的内部 ID 数量与文本分块数量不一致" - f"(期望 {content_count},实际 {len(int_ids)})。" - ), - details={ - "expected_contents": content_count, - "actual_int_ids": len(int_ids), - }, - ) - # 批量插入向量到 FAISS try: vectors_array = np.asarray(vectors, dtype=np.float32) except (TypeError, ValueError) as exc: @@ -187,9 +250,63 @@ async def insert_batch( "actual_dimension": int(vectors_array.shape[1]), }, ) - await self.embedding_storage.insert_batch(vectors_array, int_ids) + + int_ids = await self.document_storage.insert_documents_batch( + ids, + contents, + metadatas, + ) + if len(int_ids) != content_count: + await self._cleanup_batch_insert(int_ids=[], doc_ids=ids) + raise KnowledgeBaseUploadError( + stage="storage", + user_message=( + f"存储失败:写入文档索引后返回的内部 ID 数量与文本分块数量不一致" + f"(期望 {content_count},实际 {len(int_ids)})。" + ), + details={ + "expected_contents": content_count, + "actual_int_ids": len(int_ids), + }, + ) + + try: + await self.embedding_storage.insert_batch(vectors_array, int_ids) + except Exception: + logger.warning( + "Failed to insert FAISS vectors; cleaning up inserted document rows.", + exc_info=True, + ) + await self._cleanup_batch_insert(int_ids=int_ids, doc_ids=ids) + raise return int_ids + async def _cleanup_batch_insert( + self, + *, + int_ids: list[int], + doc_ids: list[str], + ) -> None: + """Best-effort cleanup for a failed batch insert.""" + if int_ids: + try: + await self.embedding_storage.delete(int_ids) + except Exception: + logger.warning( + "Failed to clean up FAISS vectors after batch insert failure.", + exc_info=True, + ) + + for doc_id in doc_ids: + try: + await self.document_storage.delete_document_by_doc_id(doc_id) + except Exception: + logger.warning( + f"Failed to clean up document row {doc_id} " + 
"after batch insert failure.", + exc_info=True, + ) + async def retrieve( self, query: str, @@ -211,15 +328,24 @@ async def retrieve( List[Result]: 查询结果 """ - embedding = await self.embedding_provider.get_embedding(query) + # 先查缓存,再调 embedding provider + cached = await self.embedding_cache.get(query) + if cached is not None: + embedding = cached + else: + embedding = await self.embedding_provider.get_embedding(query) + await self.embedding_cache.put( + query, + np.asarray(embedding, dtype=np.float32), + ) scores, indices = await self.embedding_storage.search( vector=np.array([embedding]).astype("float32"), k=fetch_k if metadata_filters else k, ) if len(indices[0]) == 0 or indices[0][0] == -1: return [] - # normalize scores - scores[0] = 1.0 - (scores[0] / 2.0) + # 将内积分数 (余弦相似度, 范围 [-1, 1]) 映射到 [0, 1] + scores[0] = (scores[0] + 1.0) / 2.0 # NOTE: maybe the size is less than k. fetched_docs = await self.document_storage.get_documents( metadata_filters=metadata_filters or {}, @@ -255,17 +381,18 @@ async def retrieve( return top_k_results - async def delete(self, doc_id: str) -> None: + async def delete(self, doc_id: str) -> bool: """删除一条文档块(chunk)""" # 获得对应的 int id result = await self.document_storage.get_document_by_doc_id(doc_id) int_id = result["id"] if result else None if int_id is None: - return + return False # 使用 DocumentStorage 的删除方法 await self.document_storage.delete_document_by_doc_id(doc_id) await self.embedding_storage.delete([int_id]) + return True async def close(self) -> None: await self.document_storage.close() diff --git a/astrbot/core/knowledge_base/capabilities.py b/astrbot/core/knowledge_base/capabilities.py new file mode 100644 index 0000000000..9367604514 --- /dev/null +++ b/astrbot/core/knowledge_base/capabilities.py @@ -0,0 +1,110 @@ +"""Knowledge base capabilities and default limits.""" + +from typing import Any + +ALLOWED_UPLOAD_EXTENSIONS = frozenset( + { + "adoc", + "docx", + "epub", + "md", + "markdown", + "pdf", + "rst", + "txt", + "xls", 
+ "xlsx", + }, +) + +MAX_UPLOAD_FILE_SIZE = 128 * 1024 * 1024 +MAX_UPLOAD_FILES = 10 +MAX_BATCH_DELETE_DOCUMENTS = 100 +MAX_BATCH_REBUILD_DOCUMENTS = 100 +MAX_RETRIEVE_TOP_K = 100 +DEFAULT_KB_PAGE_SIZE = 20 +DEFAULT_DOCUMENT_PAGE_SIZE = 10 +DEFAULT_CHUNK_PAGE_SIZE = 10 +DEFAULT_BULK_PAGE_SIZE = 100 +DOCUMENT_PAGE_SIZE_OPTIONS = (10, 20, 50, 100) +CHUNK_PAGE_SIZE_OPTIONS = (10, 25, 50, 100) + +DOCUMENT_FILTER_STATUSES = ( + "pending", + "parsing", + "chunking", + "embedding", + "ready", + "failed", +) +DOCUMENT_FILTER_SOURCE_TYPES = ("file", "url", "import") + +FEATURE_SPARSE_RETRIEVAL = True +FEATURE_RERANK = True +FEATURE_URL_IMPORT = True +FEATURE_DOCUMENT_REBUILD = True +FEATURE_KB_REBUILD = True +FEATURE_CONSISTENCY_CHECK = True +FEATURE_CONSISTENCY_REPAIR = True +FEATURE_BATCH_DELETE = True +FEATURE_BATCH_REBUILD = True + +DEFAULT_CHUNK_SIZE = 512 +DEFAULT_CHUNK_OVERLAP = 50 +DEFAULT_TOP_K_DENSE = 50 +DEFAULT_TOP_K_SPARSE = 50 +DEFAULT_TOP_M_FINAL = 5 +DEFAULT_INDEX_TYPE = "flat" +DEFAULT_UPLOAD_BATCH_SIZE = 32 +DEFAULT_UPLOAD_TASKS_LIMIT = 3 +DEFAULT_UPLOAD_MAX_RETRIES = 3 + + +def get_knowledge_base_capabilities() -> dict[str, Any]: + """Return API-safe knowledge base capabilities.""" + return { + "upload": { + "allowed_extensions": sorted(ALLOWED_UPLOAD_EXTENSIONS), + "max_file_size_bytes": MAX_UPLOAD_FILE_SIZE, + "max_files_per_upload": MAX_UPLOAD_FILES, + }, + "defaults": { + "chunk_size": DEFAULT_CHUNK_SIZE, + "chunk_overlap": DEFAULT_CHUNK_OVERLAP, + "batch_size": DEFAULT_UPLOAD_BATCH_SIZE, + "tasks_limit": DEFAULT_UPLOAD_TASKS_LIMIT, + "max_retries": DEFAULT_UPLOAD_MAX_RETRIES, + "top_k_dense": DEFAULT_TOP_K_DENSE, + "top_k_sparse": DEFAULT_TOP_K_SPARSE, + "top_m_final": DEFAULT_TOP_M_FINAL, + "index_type": DEFAULT_INDEX_TYPE, + }, + "limits": { + "max_retrieve_top_k": MAX_RETRIEVE_TOP_K, + "max_batch_delete_documents": MAX_BATCH_DELETE_DOCUMENTS, + "max_batch_rebuild_documents": MAX_BATCH_REBUILD_DOCUMENTS, + }, + "pagination": { + 
"document_page_size_options": list(DOCUMENT_PAGE_SIZE_OPTIONS), + "chunk_page_size_options": list(CHUNK_PAGE_SIZE_OPTIONS), + "default_kb_page_size": DEFAULT_KB_PAGE_SIZE, + "default_document_page_size": DEFAULT_DOCUMENT_PAGE_SIZE, + "default_chunk_page_size": DEFAULT_CHUNK_PAGE_SIZE, + "bulk_page_size": DEFAULT_BULK_PAGE_SIZE, + }, + "document_filters": { + "statuses": list(DOCUMENT_FILTER_STATUSES), + "source_types": list(DOCUMENT_FILTER_SOURCE_TYPES), + }, + "features": { + "sparse_retrieval": FEATURE_SPARSE_RETRIEVAL, + "rerank": FEATURE_RERANK, + "url_import": FEATURE_URL_IMPORT, + "document_rebuild": FEATURE_DOCUMENT_REBUILD, + "kb_rebuild": FEATURE_KB_REBUILD, + "consistency_check": FEATURE_CONSISTENCY_CHECK, + "consistency_repair": FEATURE_CONSISTENCY_REPAIR, + "batch_delete": FEATURE_BATCH_DELETE, + "batch_rebuild": FEATURE_BATCH_REBUILD, + }, + } diff --git a/astrbot/core/knowledge_base/chunking/markdown.py b/astrbot/core/knowledge_base/chunking/markdown.py index 9ace43110d..e8813bf470 100644 --- a/astrbot/core/knowledge_base/chunking/markdown.py +++ b/astrbot/core/knowledge_base/chunking/markdown.py @@ -16,10 +16,35 @@ class _Section: """解析后的 Markdown 章节""" heading_path: list[str] + title_path: list[str] + section_index: int | None text: str has_body: bool +@dataclass +class MarkdownChunk: + """A Markdown chunk with source structure metadata.""" + + text: str + title_path: list[str] | None = None + section_index: int | None = None + + +@dataclass +class _ChunkDraft: + text: str + has_body: bool + title_path: list[str] | None + section_index: int | None + + +@dataclass +class _MarkdownBlock: + kind: str + text: str + + class MarkdownChunker(BaseChunker): """Markdown 感知分块器 @@ -72,31 +97,29 @@ async def chunk(self, text: str, **kwargs) -> list[str]: list[str]: 分块后的文本列表 """ + chunks = await self.chunk_with_metadata(text, **kwargs) + return [chunk.text for chunk in chunks] + + async def chunk_with_metadata(self, text: str, **kwargs) -> list[MarkdownChunk]: + 
"""Split Markdown text and keep per-chunk structure metadata.""" + text = self._strip_front_matter(text) if not text or not text.strip(): return [] chunk_size = kwargs.get("chunk_size", self.chunk_size) chunk_overlap = kwargs.get("chunk_overlap", self.chunk_overlap) - # 解析 Markdown 结构 sections = self._parse_sections(text) if not sections: - # 没有识别到标题结构,回退到递归分割 - return await self._fallback_chunker.chunk( + chunks = await self._split_section_preserving_blocks( text, chunk_size=chunk_size, chunk_overlap=chunk_overlap ) + return [MarkdownChunk(text=chunk) for chunk in chunks] - # 将 sections 转换为 raw chunks raw_chunks = await self._sections_to_chunks(sections, chunk_size, chunk_overlap) - - # 合并纯标题节到下一个有内容的 chunk merged = self._merge_heading_only_chunks(raw_chunks, chunk_size) - - # 合并过短的相邻 chunk - merged = self._merge_short_chunks(merged, chunk_size) - - return merged + return self._merge_short_chunks(merged, chunk_size) def _estimate_prefix_length(self, heading_path: list[str]) -> int: """估算标题上下文前缀的最大长度(用于扣除子块可用空间)""" @@ -109,13 +132,15 @@ def _estimate_prefix_length(self, heading_path: list[str]) -> int: async def _sections_to_chunks( self, sections: list[_Section], chunk_size: int, chunk_overlap: int - ) -> list[tuple[str, bool]]: + ) -> list[_ChunkDraft]: """将解析后的 sections 转换为 (chunk_text, has_body) 列表""" - raw_chunks: list[tuple[str, bool]] = [] + raw_chunks: list[_ChunkDraft] = [] for section in sections: section_text = section.text heading_path = section.heading_path + title_path = self._normalize_title_path(section.title_path) + section_index = section.section_index has_body = section.has_body # 构建带上下文的文本 @@ -123,23 +148,30 @@ async def _sections_to_chunks( full_text = context_prefix + section_text if len(full_text) <= chunk_size: - raw_chunks.append((full_text.strip(), has_body)) + raw_chunks.append( + _ChunkDraft( + text=full_text.strip(), + has_body=has_body, + title_path=title_path, + section_index=section_index, + ) + ) else: - # 章节过长,内部递归分割 - # 
扣除前缀长度,确保添加前缀后不超过 chunk_size - prefix_len = self._estimate_prefix_length(heading_path) - effective_chunk_size = max(chunk_size // 4, chunk_size - prefix_len) - - sub_chunks = await self._fallback_chunker.chunk( + sub_chunks = await self._split_section_preserving_blocks( section_text, - chunk_size=effective_chunk_size, + heading_path=heading_path, + chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) for i, sub_chunk in enumerate(sub_chunks): - chunk_text = self._apply_heading_context( - heading_path, sub_chunk, is_continuation=(i > 0) + raw_chunks.append( + _ChunkDraft( + text=sub_chunk, + has_body=True, + title_path=title_path, + section_index=section_index, + ) ) - raw_chunks.append((chunk_text, True)) return raw_chunks @@ -161,75 +193,818 @@ def _apply_heading_context( return f"{self.continuation_prefix} {title}\n\n{content}".strip() return f"{title}\n\n{content}".strip() - def _merge_heading_only_chunks( - self, raw_chunks: list[tuple[str, bool]], chunk_size: int + async def _split_section_preserving_blocks( + self, + text: str, + *, + chunk_size: int, + chunk_overlap: int, + heading_path: list[str] | None = None, + ) -> list[str]: + heading_path = heading_path or [] + prefix_len = self._estimate_prefix_length(heading_path) + effective_chunk_size = max(chunk_size // 4, chunk_size - prefix_len) + blocks = self._parse_markdown_blocks(text) + if not blocks: + chunks = await self._fallback_chunker.chunk( + text, + chunk_size=effective_chunk_size, + chunk_overlap=chunk_overlap, + ) + return [ + self._apply_heading_context(heading_path, chunk, i > 0) + for i, chunk in enumerate(chunks) + if chunk.strip() + ] + + chunks: list[str] = [] + current = "" + piece_index = 0 + + for block in blocks: + pieces = await self._split_block(block, effective_chunk_size, chunk_overlap) + for piece in pieces: + piece = piece.strip() + if not piece: + continue + if not current: + current = piece + continue + combined = current + "\n\n" + piece + if len(combined) <= 
effective_chunk_size: + current = combined + continue + + chunks.append( + self._apply_heading_context( + heading_path, + current, + piece_index > 0, + ) + ) + piece_index += 1 + current = piece + + if current: + chunks.append( + self._apply_heading_context( + heading_path, + current, + piece_index > 0, + ) + ) + + return chunks + + async def _split_block( + self, block: _MarkdownBlock, chunk_size: int, chunk_overlap: int + ) -> list[str]: + text = block.text.strip() + if not text: + return [] + if len(text) <= chunk_size: + return [text] + + if block.kind == "table": + return self._split_table_block(text, chunk_size) + if block.kind == "code": + return self._split_fenced_code_block(text, chunk_size) + if block.kind == "math": + return self._split_wrapped_line_block(text, chunk_size) + if block.kind in {"blockquote", "list", "html"}: + return self._split_line_block(text, chunk_size) + if block.kind in {"paragraph", "text"}: + return self._split_text_preserving_inline_spans(text, chunk_size) + + return await self._fallback_chunker.chunk( + text, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + + def _parse_markdown_blocks(self, text: str) -> list[_MarkdownBlock]: + lines = text.splitlines(keepends=True) + blocks: list[_MarkdownBlock] = [] + i = 0 + while i < len(lines): + line = lines[i] + if not line.strip(): + i += 1 + continue + + if self._is_fence_start(line): + block_lines, i = self._collect_fenced_code_block(lines, i) + blocks.append(_MarkdownBlock("code", "".join(block_lines).strip())) + continue + + if self._is_math_block_start(line): + block_lines, i = self._collect_math_block(lines, i) + blocks.append(_MarkdownBlock("math", "".join(block_lines).strip())) + continue + + if self._is_markdown_table_start(lines, i): + block_lines, i = self._collect_markdown_table(lines, i) + blocks.append(_MarkdownBlock("table", "".join(block_lines).strip())) + continue + + if self._is_html_block_start(line): + block_lines, i = self._collect_html_block(lines, i) + 
blocks.append(_MarkdownBlock("html", "".join(block_lines).strip())) + continue + + if line.lstrip().startswith(">"): + block_lines, i = self._collect_prefixed_block( + lines, + i, + lambda candidate: candidate.lstrip().startswith(">"), + ) + blocks.append( + _MarkdownBlock("blockquote", "".join(block_lines).strip()) + ) + continue + + if self._is_list_item(line): + block_lines, i = self._collect_list_block(lines, i) + blocks.append(_MarkdownBlock("list", "".join(block_lines).strip())) + continue + + if self._is_link_reference(line): + block_lines, i = self._collect_prefixed_block( + lines, + i, + self._is_link_reference, + ) + blocks.append( + _MarkdownBlock("link_reference", "".join(block_lines).strip()) + ) + continue + + block_lines, i = self._collect_paragraph(lines, i) + blocks.append(_MarkdownBlock("paragraph", "".join(block_lines).strip())) + + return [block for block in blocks if block.text.strip()] + + @staticmethod + def _strip_front_matter(text: str) -> str: + if not text.startswith(("---\n", "+++\n")): + return text + + marker = text[:3] + lines = text.splitlines(keepends=True) + for idx in range(1, min(len(lines), 200)): + if lines[idx].strip() == marker: + return "".join(lines[idx + 1 :]).lstrip("\n") + return text + + @staticmethod + def _is_fence_start(line: str) -> bool: + stripped = line.lstrip() + indent = len(line) - len(stripped) + return indent <= 3 and ( + stripped.startswith("```") or stripped.startswith("~~~") + ) + + @staticmethod + def _fence_marker(line: str) -> tuple[str, int] | None: + stripped = line.lstrip() + if stripped.startswith("```"): + return "`", len(stripped) - len(stripped.lstrip("`")) + if stripped.startswith("~~~"): + return "~", len(stripped) - len(stripped.lstrip("~")) + return None + + def _collect_fenced_code_block( + self, lines: list[str], start: int + ) -> tuple[list[str], int]: + marker = self._fence_marker(lines[start]) + if marker is None: + return [lines[start]], start + 1 + fence_char, fence_len = marker + 
block_lines = [lines[start]] + i = start + 1 + while i < len(lines): + block_lines.append(lines[i]) + candidate = lines[i].lstrip() + indent = len(lines[i]) - len(candidate) + if ( + indent <= 3 + and candidate.startswith(fence_char * fence_len) + and set(candidate.strip()) <= {fence_char} + ): + i += 1 + break + i += 1 + return block_lines, i + + @staticmethod + def _is_table_separator(line: str) -> bool: + stripped = line.strip() + if "|" not in stripped: + return False + cells = [cell.strip() for cell in stripped.strip("|").split("|")] + if not cells: + return False + return all(re.fullmatch(r":?-{3,}:?", cell or "") for cell in cells) + + @staticmethod + def _is_table_row(line: str) -> bool: + stripped = line.strip() + return bool(stripped) and "|" in stripped + + def _is_markdown_table_start(self, lines: list[str], index: int) -> bool: + return ( + index + 1 < len(lines) + and self._is_table_row(lines[index]) + and self._is_table_separator(lines[index + 1]) + ) + + def _collect_markdown_table( + self, lines: list[str], start: int + ) -> tuple[list[str], int]: + block_lines = [lines[start], lines[start + 1]] + i = start + 2 + while i < len(lines) and self._is_table_row(lines[i]): + block_lines.append(lines[i]) + i += 1 + return block_lines, i + + @staticmethod + def _is_html_block_start(line: str) -> bool: + stripped = line.lstrip().lower() + return stripped.startswith( + ( + " str | None: + stripped = line.lstrip().lower() + for tag in ("table", "pre", "code", "blockquote", "details", "div"): + if stripped.startswith(f"<{tag}"): + return f"" + return None + + def _collect_html_block( + self, lines: list[str], start: int + ) -> tuple[list[str], int]: + closing_tag = self._html_closing_tag(lines[start]) + block_lines = [lines[start]] + i = start + 1 + if closing_tag is None or closing_tag in lines[start].lower(): + return block_lines, i + + while i < len(lines): + block_lines.append(lines[i]) + if closing_tag in lines[i].lower(): + i += 1 + break + i += 1 + 
return block_lines, i + + @staticmethod + def _is_list_item(line: str) -> bool: + return bool(re.match(r"^\s{0,3}(?:[-*+]|\d+[.)])\s+", line)) + + @staticmethod + def _is_link_reference(line: str) -> bool: + return bool(re.match(r"^\s{0,3}\[[^\]]+\]:\s+\S+", line)) + + def _collect_prefixed_block( + self, + lines: list[str], + start: int, + predicate, + ) -> tuple[list[str], int]: + block_lines = [] + i = start + while i < len(lines) and (predicate(lines[i]) or not lines[i].strip()): + if ( + not lines[i].strip() + and i + 1 < len(lines) + and not predicate(lines[i + 1]) + ): + break + block_lines.append(lines[i]) + i += 1 + return block_lines, i + + def _collect_list_block( + self, lines: list[str], start: int + ) -> tuple[list[str], int]: + block_lines = [lines[start]] + i = start + 1 + while i < len(lines): + line = lines[i] + if self._is_fence_start(line) or self._is_markdown_table_start(lines, i): + break + if self._is_list_item(line) or line.startswith((" ", "\t")): + block_lines.append(line) + i += 1 + continue + if not line.strip() and i + 1 < len(lines): + next_line = lines[i + 1] + if self._is_list_item(next_line) or next_line.startswith((" ", "\t")): + block_lines.append(line) + i += 1 + continue + break + return block_lines, i + + def _collect_paragraph(self, lines: list[str], start: int) -> tuple[list[str], int]: + block_lines = [] + i = start + while i < len(lines): + line = lines[i] + if not line.strip(): + break + if i != start and ( + self._is_fence_start(line) + or self._is_math_block_start(line) + or self._is_markdown_table_start(lines, i) + or self._is_html_block_start(line) + or self._is_list_item(line) + or line.lstrip().startswith(">") + or self._is_link_reference(line) + ): + break + block_lines.append(line) + i += 1 + return block_lines, i + + def _split_table_block(self, text: str, chunk_size: int) -> list[str]: + lines = text.splitlines() + if len(lines) <= 2: + return [text] + + header = lines[:2] + rows = lines[2:] + chunks = [] + 
current_rows: list[str] = [] + + for row in rows: + candidate_lines = header + current_rows + [row] + candidate = "\n".join(candidate_lines) + if current_rows and len(candidate) > chunk_size: + chunks.append("\n".join(header + current_rows)) + current_rows = [row] + else: + current_rows.append(row) + + if current_rows: + chunks.append("\n".join(header + current_rows)) + + return chunks or [text] + + @staticmethod + def _is_math_block_start(line: str) -> bool: + stripped = line.strip() + return ( + stripped.startswith("$$") + or stripped.startswith(r"\[") + or bool( + re.match( + r"^\\begin\{(?:equation|align|gather|multline|cases)\*?\}", stripped + ) + ) + ) + + @staticmethod + def _math_block_closer(line: str) -> str: + stripped = line.strip() + if stripped.startswith("$$"): + return "$$" + if stripped.startswith(r"\["): + return r"\]" + + env_match = re.match(r"^\\begin\{([^}]+)\}", stripped) + if env_match: + return rf"\end{{{env_match.group(1)}}}" + return "" + + def _collect_math_block( + self, lines: list[str], start: int + ) -> tuple[list[str], int]: + opener_line = lines[start] + closer = self._math_block_closer(opener_line) + block_lines = [opener_line] + if not closer: + return block_lines, start + 1 + + opener_stripped = opener_line.strip() + if ( + closer in opener_stripped[len(closer) :] + if closer in {"$$", r"\]"} + else closer in opener_stripped + ): + return block_lines, start + 1 + + i = start + 1 + while i < len(lines): + block_lines.append(lines[i]) + if closer in lines[i].strip(): + i += 1 + break + i += 1 + return block_lines, i + + @staticmethod + def _split_wrapped_line_block(text: str, chunk_size: int) -> list[str]: + lines = text.splitlines() + if len(lines) <= 2: + return [text] + + opener = lines[0] + closer = lines[-1] + body = lines[1:-1] + chunks = [] + current: list[str] = [] + + for line in body: + candidate = "\n".join([opener, *current, line, closer]) + if current and len(candidate) > chunk_size: + chunks.append("\n".join([opener, 
*current, closer])) + current = [line] + else: + current.append(line) + + if current: + chunks.append("\n".join([opener, *current, closer])) + + return chunks or [text] + + @staticmethod + def _split_fenced_code_block(text: str, chunk_size: int) -> list[str]: + lines = text.splitlines() + if len(lines) <= 2: + return [text] + + opener = lines[0] + closer = lines[-1] if lines[-1].lstrip().startswith(("```", "~~~")) else "" + body = lines[1:-1] if closer else lines[1:] + chunks = [] + current: list[str] = [] + + for line in body: + candidate_lines = [opener, *current, line] + if closer: + candidate_lines.append(closer) + candidate = "\n".join(candidate_lines) + if current and len(candidate) > chunk_size: + chunk_lines = [opener, *current] + if closer: + chunk_lines.append(closer) + chunks.append("\n".join(chunk_lines)) + current = [line] + else: + current.append(line) + + if current: + chunk_lines = [opener, *current] + if closer: + chunk_lines.append(closer) + chunks.append("\n".join(chunk_lines)) + + return chunks or [text] + + @staticmethod + def _split_line_block(text: str, chunk_size: int) -> list[str]: + lines = text.splitlines() + chunks = [] + current: list[str] = [] + for line in lines: + candidate = "\n".join([*current, line]) + if current and len(candidate) > chunk_size: + chunks.append("\n".join(current)) + current = [line] + else: + current.append(line) + if current: + chunks.append("\n".join(current)) + return chunks or [text] + + def _split_text_preserving_inline_spans( + self, text: str, chunk_size: int ) -> list[str]: + tokens = self._tokenize_protected_inline_spans(text) + chunks = [] + current = "" + for token in tokens: + if not token: + continue + candidate = current + token if current else token.lstrip() + if current and len(candidate) > chunk_size: + chunks.append(current.strip()) + current = token.lstrip() + else: + current = candidate + + if len(current) > chunk_size and not self._is_inline_protected_token( + current + ): + split_chunks = 
self._split_long_plain_token(current, chunk_size) + chunks.extend(split_chunks[:-1]) + current = split_chunks[-1] if split_chunks else "" + + if current.strip(): + chunks.append(current.strip()) + return [chunk for chunk in chunks if chunk] + + def _tokenize_protected_inline_spans(self, text: str) -> list[str]: + spans = self._find_protected_inline_spans(text) + tokens: list[str] = [] + cursor = 0 + for start, end in spans: + if start > cursor: + tokens.extend(re.findall(r"\S+\s*|\s+", text[cursor:start])) + tokens.append(text[start:end]) + cursor = end + if cursor < len(text): + tokens.extend(re.findall(r"\S+\s*|\s+", text[cursor:])) + return tokens + + def _find_protected_inline_spans(self, text: str) -> list[tuple[int, int]]: + spans: list[tuple[int, int]] = [] + i = 0 + while i < len(text): + end = self._match_markdown_link(text, i) + if end is None: + end = self._match_autolink(text, i) + if end is None: + end = self._match_inline_math(text, i) + if end is not None: + if not spans or i >= spans[-1][1]: + spans.append((i, end)) + i = end + continue + i += 1 + return spans + + @staticmethod + def _match_markdown_link(text: str, start: int) -> int | None: + marker_start = start + if text.startswith("![", start): + start += 1 + elif text[start] != "[": + return None + + label_end = text.find("]", start + 1) + if label_end == -1 or label_end + 1 >= len(text): + return None + + next_char = text[label_end + 1] + if next_char == "(": + link_end = text.find(")", label_end + 2) + return link_end + 1 if link_end != -1 else None + if next_char == "[": + ref_end = text.find("]", label_end + 2) + return ref_end + 1 if ref_end != -1 else None + + return None if marker_start == start else None + + @staticmethod + def _match_autolink(text: str, start: int) -> int | None: + if text.startswith(("", start + 1) + return end + 1 if end != -1 else None + + if not ( + text.startswith("http://", start) or text.startswith("https://", start) + ): + return None + + end = start + while 
end < len(text) and not text[end].isspace(): + end += 1 + while end > start and text[end - 1] in ".,;:!?)>]": + end -= 1 + return end + + @staticmethod + def _match_inline_math(text: str, start: int) -> int | None: + if text.startswith(r"\(", start): + end = text.find(r"\)", start + 2) + return end + 2 if end != -1 else None + + if text[start] != "$": + return None + if text.startswith("$$", start): + return None + if start > 0 and text[start - 1] == "\\": + return None + if start + 1 >= len(text) or text[start + 1].isspace(): + return None + + i = start + 1 + while i < len(text): + if text[i] == "$" and text[i - 1] != "\\": + if i > start + 1 and not text[i - 1].isspace(): + return i + 1 + return None + i += 1 + return None + + @staticmethod + def _is_inline_protected_token(token: str) -> bool: + stripped = token.strip() + return ( + stripped.startswith("[") + or stripped.startswith("![") + or stripped.startswith(" list[str]: + if chunk_size <= 0: + return [text] + return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)] + + def _merge_heading_only_chunks( + self, raw_chunks: list[_ChunkDraft], chunk_size: int + ) -> list[MarkdownChunk]: """合并没有实质正文的 chunk 到下一个有正文的 chunk""" - merged: list[str] = [] - pending = "" + merged: list[MarkdownChunk] = [] + pending_text = "" + pending_title_path: list[str] | None = None + pending_section_index: int | None = None - for chunk_text, has_body in raw_chunks: + for chunk in raw_chunks: + chunk_text = chunk.text if not chunk_text: continue - if not has_body: + if not chunk.has_body: # 纯标题节,暂存;但如果 pending 已经够长,先 flush - if pending and len(pending) + len(chunk_text) + 2 > chunk_size: - merged.append(pending.strip()) - pending = "" - pending += chunk_text + "\n\n" + if ( + pending_text + and len(pending_text) + len(chunk_text) + 2 > chunk_size + ): + merged.append( + MarkdownChunk( + text=pending_text.strip(), + title_path=pending_title_path, + section_index=pending_section_index, + ) + ) + pending_text = "" + 
pending_title_path = None + pending_section_index = None + pending_text += chunk_text + "\n\n" + pending_title_path = chunk.title_path or pending_title_path + pending_section_index = chunk.section_index else: - if pending: - combined = pending + chunk_text + if pending_text: + combined = pending_text + chunk_text if len(combined) <= chunk_size: - merged.append(combined.strip()) + merged.append( + MarkdownChunk( + text=combined.strip(), + title_path=chunk.title_path or pending_title_path, + section_index=chunk.section_index, + ) + ) else: - merged.append(pending.strip()) - merged.append(chunk_text.strip()) - pending = "" + merged.append( + MarkdownChunk( + text=pending_text.strip(), + title_path=pending_title_path, + section_index=pending_section_index, + ) + ) + merged.append( + MarkdownChunk( + text=chunk_text.strip(), + title_path=chunk.title_path, + section_index=chunk.section_index, + ) + ) + pending_text = "" + pending_title_path = None + pending_section_index = None else: - merged.append(chunk_text.strip()) + merged.append( + MarkdownChunk( + text=chunk_text.strip(), + title_path=chunk.title_path, + section_index=chunk.section_index, + ) + ) # 处理尾部残留的 pending - if pending: - pending_text = pending.strip() - if merged and len(merged[-1] + "\n\n" + pending_text) <= chunk_size: - merged[-1] = merged[-1] + "\n\n" + pending_text + if pending_text: + trailing_text = pending_text.strip() + if merged and len(merged[-1].text + "\n\n" + trailing_text) <= chunk_size: + merged[-1] = MarkdownChunk( + text=merged[-1].text + "\n\n" + trailing_text, + title_path=self._merge_title_paths( + [merged[-1].title_path, pending_title_path] + ), + section_index=self._merge_section_indexes( + [merged[-1].section_index, pending_section_index] + ), + ) else: - merged.append(pending_text) + merged.append( + MarkdownChunk( + text=trailing_text, + title_path=pending_title_path, + section_index=pending_section_index, + ) + ) - return [c for c in merged if c.strip()] + return [chunk for 
chunk in merged if chunk.text.strip()] - def _merge_short_chunks(self, chunks: list[str], chunk_size: int) -> list[str]: + def _merge_short_chunks( + self, chunks: list[MarkdownChunk], chunk_size: int + ) -> list[MarkdownChunk]: """合并过短的相邻 chunk(低于 min_chunk_size)""" if self.min_chunk_size <= 0 or len(chunks) <= 1: return chunks - final: list[str] = [] - buf = "" + final: list[MarkdownChunk] = [] + buf: MarkdownChunk | None = None - for c in chunks: + for chunk in chunks: if buf: - combined = buf + "\n\n" + c + combined = buf.text + "\n\n" + chunk.text if len(combined) <= chunk_size: - buf = combined + buf = MarkdownChunk( + text=combined, + title_path=self._merge_title_paths( + [buf.title_path, chunk.title_path] + ), + section_index=self._merge_section_indexes( + [buf.section_index, chunk.section_index] + ), + ) else: final.append(buf) - buf = c if len(c) < self.min_chunk_size else "" - if len(c) >= self.min_chunk_size: - final.append(c) - elif len(c) < self.min_chunk_size: - buf = c + if len(chunk.text) < self.min_chunk_size: + buf = chunk + else: + buf = None + final.append(chunk) + elif len(chunk.text) < self.min_chunk_size: + buf = chunk else: - final.append(c) + final.append(chunk) if buf: - if final and len(final[-1] + "\n\n" + buf) <= chunk_size: - final[-1] = final[-1] + "\n\n" + buf + if final and len(final[-1].text + "\n\n" + buf.text) <= chunk_size: + final[-1] = MarkdownChunk( + text=final[-1].text + "\n\n" + buf.text, + title_path=self._merge_title_paths( + [final[-1].title_path, buf.title_path] + ), + section_index=self._merge_section_indexes( + [final[-1].section_index, buf.section_index] + ), + ) else: final.append(buf) return final + @staticmethod + def _normalize_title_path(title_path: list[str]) -> list[str] | None: + path = [title.strip() for title in title_path if title and title.strip()] + return path or None + + @staticmethod + def _merge_title_paths(paths: list[list[str] | None]) -> list[str] | None: + non_empty_paths = [path for path in 
paths if path] + if not non_empty_paths: + return None + + common = list(non_empty_paths[0]) + for path in non_empty_paths[1:]: + prefix: list[str] = [] + for left, right in zip(common, path, strict=False): + if left != right: + break + prefix.append(left) + common = prefix + if not common: + return None + return common + + @staticmethod + def _merge_section_indexes(indexes: list[int | None]) -> int | None: + non_empty_indexes = [index for index in indexes if index is not None] + if not non_empty_indexes: + return None + first_index = non_empty_indexes[0] + if all(index == first_index for index in non_empty_indexes): + return first_index + return None + def _parse_sections(self, text: str) -> list[_Section]: """解析 Markdown 文本为章节列表 @@ -264,11 +1039,21 @@ def _parse_sections(self, text: str) -> list[_Section]: return [] sections: list[_Section] = [] + section_index = 0 # 处理第一个标题之前的内容(如果有) preamble = text[: headings[0]["start"]].strip() if preamble: - sections.append(_Section(heading_path=[], text=preamble, has_body=True)) + sections.append( + _Section( + heading_path=[], + title_path=[], + section_index=section_index, + text=preamble, + has_body=True, + ) + ) + section_index += 1 # 维护标题栈来追踪层级路径 heading_stack: list[dict] = [] @@ -297,14 +1082,18 @@ def _parse_sections(self, text: str) -> list[_Section]: # 构建标题路径 heading_path = [h["title"] for h in heading_stack[:-1]] + title_path = [h["title"] for h in heading_stack] sections.append( _Section( heading_path=heading_path, + title_path=title_path, + section_index=section_index, text=section_text, has_body=bool(body), ) ) + section_index += 1 return sections diff --git a/astrbot/core/knowledge_base/document_metadata.py b/astrbot/core/knowledge_base/document_metadata.py new file mode 100644 index 0000000000..4c78efe410 --- /dev/null +++ b/astrbot/core/knowledge_base/document_metadata.py @@ -0,0 +1,61 @@ +"""Helpers for knowledge-base document governance metadata.""" + +import hashlib +import re +import uuid +from pathlib 
import Path + +from .chunking.base import BaseChunker +from .parsers.base import BaseParser + +DEFAULT_PARSER_VERSION = "1" +DEFAULT_CHUNKER_VERSION = "1" + + +def build_content_hash(content: bytes | str | list[str]) -> str: + """Return a stable SHA256 hash for source content.""" + digest = hashlib.sha256() + if isinstance(content, bytes): + digest.update(content) + elif isinstance(content, str): + digest.update(content.encode("utf-8")) + else: + for chunk in content: + digest.update(chunk.encode("utf-8")) + digest.update(b"\x00") + return digest.hexdigest() + + +def get_parser_name(parser: BaseParser | None) -> str | None: + if parser is None: + return None + return parser.__class__.__name__ + + +def get_chunker_name(chunker: BaseChunker | None) -> str | None: + if chunker is None: + return None + return chunker.__class__.__name__ + + +def sanitize_source_filename(file_name: str | None, fallback_suffix: str = "") -> str: + """Return a filename safe for storage inside a KB-owned directory.""" + raw = (file_name or "").replace("\\", "/").split("/")[-1].replace("\x00", "") + safe = re.sub(r"[^A-Za-z0-9._ -]", "_", raw).strip(" .") + if not safe: + safe = f"document_{uuid.uuid4().hex[:8]}{fallback_suffix}" + return safe[:255] + + +def build_stored_source_path( + files_dir: Path, + *, + doc_id: str, + file_name: str, + file_type: str, +) -> Path: + suffix = Path(file_name).suffix + if not suffix and file_type: + suffix = f".{file_type}" + safe_name = sanitize_source_filename(file_name, fallback_suffix=suffix) + return files_dir / doc_id / safe_name diff --git a/astrbot/core/knowledge_base/kb_db_sqlite.py b/astrbot/core/knowledge_base/kb_db_sqlite.py index 2734ccb8d9..10f82e5635 100644 --- a/astrbot/core/knowledge_base/kb_db_sqlite.py +++ b/astrbot/core/knowledge_base/kb_db_sqlite.py @@ -1,8 +1,11 @@ +import asyncio +import json from contextlib import asynccontextmanager +from datetime import datetime, timezone from pathlib import Path from typing import TYPE_CHECKING 
-from sqlalchemy import delete, event, func, select, text, update +from sqlalchemy import delete, event, func, or_, select, text, update from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from sqlalchemy.pool import NullPool from sqlmodel import col, desc @@ -11,6 +14,7 @@ from astrbot.core.knowledge_base.models import ( BaseKBModel, KBDocument, + KBIngestionTask, KBMedia, KnowledgeBase, ) @@ -19,6 +23,8 @@ if TYPE_CHECKING: from astrbot.core.db.vec_db.faiss_impl import FaissVecDB +_UNSET = object() + def _configure_sqlite_connection(dbapi_connection, connection_record) -> None: cursor = dbapi_connection.cursor() @@ -106,6 +112,15 @@ async def migrate_to_v1(self) -> None: async with self.get_db() as session: session: AsyncSession async with session.begin(): + await self._ensure_column( + session, + table_name="knowledge_bases", + column_name="index_type", + column_sql="index_type TEXT DEFAULT 'flat'", + ) + await self._ensure_document_governance_columns(session) + await self._ensure_ingestion_task_table(session) + # 创建知识库表索引 await session.execute( text( @@ -157,6 +172,24 @@ async def migrate_to_v1(self) -> None: "ON kb_documents(created_at)", ), ) + await session.execute( + text( + "CREATE INDEX IF NOT EXISTS idx_doc_content_hash " + "ON kb_documents(content_hash)", + ), + ) + await session.execute( + text( + "CREATE INDEX IF NOT EXISTS idx_doc_status " + "ON kb_documents(status)", + ), + ) + await session.execute( + text( + "CREATE INDEX IF NOT EXISTS idx_doc_parent_doc_id " + "ON kb_documents(parent_doc_id)", + ), + ) # 创建多媒体表索引 await session.execute( @@ -182,9 +215,126 @@ async def migrate_to_v1(self) -> None: "ON kb_media(media_type)", ), ) + await self._ensure_ingestion_task_indexes(session) await session.commit() + async def _ensure_column( + self, + session: AsyncSession, + *, + table_name: str, + column_name: str, + column_sql: str, + ) -> None: + """Add a column when upgrading an existing SQLite table.""" + result = 
await session.execute(text(f"PRAGMA table_xinfo({table_name})")) + columns = {row[1] for row in result.fetchall()} + if column_name in columns: + return + logger.info( + f"知识库数据库迁移: 为表 {table_name} 添加列 {column_name}", + ) + await session.execute(text(f"ALTER TABLE {table_name} ADD COLUMN {column_sql}")) + + async def _ensure_document_governance_columns( + self, + session: AsyncSession, + ) -> None: + columns = { + "source_type": "source_type TEXT NOT NULL DEFAULT 'file'", + "source_uri": "source_uri TEXT", + "content_hash": "content_hash VARCHAR(64)", + "parser_name": "parser_name VARCHAR(100)", + "parser_version": "parser_version VARCHAR(50)", + "chunker_name": "chunker_name VARCHAR(100)", + "chunker_version": "chunker_version VARCHAR(50)", + "status": "status TEXT NOT NULL DEFAULT 'ready'", + "error_stage": "error_stage VARCHAR(50)", + "error_message": "error_message TEXT", + "version": "version INTEGER NOT NULL DEFAULT 1", + "parent_doc_id": "parent_doc_id VARCHAR(36)", + "indexed_at": "indexed_at DATETIME", + } + for column_name, column_sql in columns.items(): + await self._ensure_column( + session, + table_name="kb_documents", + column_name=column_name, + column_sql=column_sql, + ) + + async def _ensure_ingestion_task_table(self, session: AsyncSession) -> None: + await session.execute( + text( + """ + CREATE TABLE IF NOT EXISTS kb_ingestion_tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_id VARCHAR(36) NOT NULL UNIQUE, + kb_id VARCHAR(36) NOT NULL, + task_type VARCHAR(30) NOT NULL, + status VARCHAR(20) NOT NULL DEFAULT 'pending', + progress_stage VARCHAR(50), + progress_current INTEGER NOT NULL DEFAULT 0, + progress_total INTEGER NOT NULL DEFAULT 100, + progress TEXT, + result TEXT, + error TEXT, + created_at DATETIME NOT NULL, + updated_at DATETIME NOT NULL + ) + """, + ), + ) + + async def _ensure_ingestion_task_indexes(self, session: AsyncSession) -> None: + indexes = { + "idx_task_task_id": "task_id", + "idx_task_kb_id": "kb_id", + "idx_task_type": 
"task_type", + "idx_task_status": "status", + "idx_task_created_at": "created_at", + } + for index_name, column_name in indexes.items(): + await session.execute( + text( + f"CREATE INDEX IF NOT EXISTS {index_name} " + f"ON kb_ingestion_tasks({column_name})", + ), + ) + + @staticmethod + def _encode_json(value) -> str | None: + if value is None: + return None + return json.dumps(value, ensure_ascii=False, default=str) + + @staticmethod + def _decode_json(value: str | None): + if value is None: + return None + try: + return json.loads(value) + except json.JSONDecodeError: + return value + + @classmethod + def _task_to_dict(cls, task: KBIngestionTask) -> dict: + return { + "task_id": task.task_id, + "kb_id": task.kb_id, + "task_type": task.task_type, + "status": task.status, + "progress_stage": task.progress_stage, + "progress_current": task.progress_current, + "progress_total": task.progress_total, + "progress": cls._decode_json(task.progress), + "result": cls._decode_json(task.result), + "error": cls._decode_json(task.error), + "created_at": task.created_at.isoformat(), + "updated_at": task.updated_at.isoformat(), + } + async def close(self) -> None: """关闭数据库连接""" await self.engine.dispose() @@ -204,15 +354,22 @@ async def get_kb_by_name(self, kb_name: str) -> KnowledgeBase | None: result = await session.execute(stmt) return result.scalar_one_or_none() - async def list_kbs(self, offset: int = 0, limit: int = 100) -> list[KnowledgeBase]: + async def list_kbs( + self, + offset: int = 0, + limit: int | None = None, + ) -> list[KnowledgeBase]: """列出所有知识库""" async with self.get_db() as session: stmt = ( select(KnowledgeBase) .offset(offset) - .limit(limit) - .order_by(desc(KnowledgeBase.created_at)) + .order_by( + desc(KnowledgeBase.created_at), + ) ) + if limit is not None: + stmt = stmt.limit(limit) result = await session.execute(stmt) return list(result.scalars().all()) @@ -223,6 +380,146 @@ async def count_kbs(self) -> int: result = await session.execute(stmt) return 
result.scalar() or 0 + # ===== 任务查询 ===== + + async def create_ingestion_task( + self, + *, + task_id: str, + kb_id: str, + task_type: str, + status: str = "pending", + progress_stage: str | None = None, + progress_current: int = 0, + progress_total: int = 100, + progress: dict | None = None, + ) -> dict: + task = KBIngestionTask( + task_id=task_id, + kb_id=kb_id, + task_type=task_type, + status=status, + progress_stage=progress_stage, + progress_current=progress_current, + progress_total=progress_total, + progress=self._encode_json(progress), + ) + async with self.get_db() as session: + session.add(task) + await session.commit() + await session.refresh(task) + return self._task_to_dict(task) + + async def update_ingestion_task( + self, + task_id: str, + *, + status: str | object = _UNSET, + progress_stage: str | None | object = _UNSET, + progress_current: int | object = _UNSET, + progress_total: int | object = _UNSET, + progress: dict | None | object = _UNSET, + result: dict | None | object = _UNSET, + error: str | None | object = _UNSET, + ) -> dict | None: + async with self.get_db() as session: + stmt = select(KBIngestionTask).where( + col(KBIngestionTask.task_id) == task_id, + ) + query_result = await session.execute(stmt) + task = query_result.scalar_one_or_none() + if task is None: + return None + + if status is not _UNSET: + task.status = status # type: ignore[assignment] + if progress_stage is not _UNSET: + task.progress_stage = progress_stage # type: ignore[assignment] + if progress_current is not _UNSET: + task.progress_current = progress_current # type: ignore[assignment] + if progress_total is not _UNSET: + task.progress_total = progress_total # type: ignore[assignment] + if progress is not _UNSET: + task.progress = self._encode_json(progress) + if result is not _UNSET: + task.result = self._encode_json(result) + if error is not _UNSET: + task.error = self._encode_json(error) + task.updated_at = datetime.now(timezone.utc) + + session.add(task) + await 
session.commit() + await session.refresh(task) + return self._task_to_dict(task) + + async def get_ingestion_task(self, task_id: str) -> dict | None: + async with self.get_db() as session: + stmt = select(KBIngestionTask).where( + col(KBIngestionTask.task_id) == task_id, + ) + result = await session.execute(stmt) + task = result.scalar_one_or_none() + return self._task_to_dict(task) if task is not None else None + + @staticmethod + def _build_ingestion_task_conditions( + *, + kb_id: str | None = None, + status: str | None = None, + task_type: str | None = None, + ) -> list: + conditions = [] + if kb_id is not None: + conditions.append(col(KBIngestionTask.kb_id) == kb_id) + if status is not None: + conditions.append(col(KBIngestionTask.status) == status) + if task_type is not None: + conditions.append(col(KBIngestionTask.task_type) == task_type) + return conditions + + async def list_ingestion_tasks( + self, + *, + kb_id: str | None = None, + status: str | None = None, + task_type: str | None = None, + offset: int = 0, + limit: int = 100, + ) -> list[dict]: + conditions = self._build_ingestion_task_conditions( + kb_id=kb_id, + status=status, + task_type=task_type, + ) + + async with self.get_db() as session: + stmt = ( + select(KBIngestionTask) + .where(*conditions) + .offset(offset) + .limit(limit) + .order_by(desc(KBIngestionTask.created_at)) + ) + result = await session.execute(stmt) + return [self._task_to_dict(task) for task in result.scalars().all()] + + async def count_ingestion_tasks( + self, + *, + kb_id: str | None = None, + status: str | None = None, + task_type: str | None = None, + ) -> int: + conditions = self._build_ingestion_task_conditions( + kb_id=kb_id, + status=status, + task_type=task_type, + ) + async with self.get_db() as session: + stmt = select(func.count(col(KBIngestionTask.id))).where(*conditions) + result = await session.execute(stmt) + return result.scalar() or 0 + # ===== 文档查询 ===== async def get_document_by_id(self, doc_id: str) -> 
@staticmethod
def _build_document_filters(
    *,
    kb_id: str,
    search: str | None = None,
    status: str | None = None,
    source_type: str | None = None,
) -> list:
    """Build the WHERE conditions shared by document listing and counting.

    Args:
        kb_id: Knowledge base the documents must belong to (always applied).
        search: Optional substring matched case-insensitively against the
            document name or file type. It is matched literally: ``%``, ``_``
            and ``\\`` in the input are escaped so they do not act as LIKE
            wildcards.
        status: Optional exact document status.
        source_type: Optional exact source type.

    Returns:
        A list of SQLAlchemy conditions to be passed to ``.where(*conditions)``.
    """
    conditions = [col(KBDocument.kb_id) == kb_id]
    if search:
        # Escape the escape character first, then the LIKE metacharacters, so
        # a search such as "100%" or "a_b" only matches that literal text.
        escaped = (
            search.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        pattern = f"%{escaped}%"
        conditions.append(
            or_(
                col(KBDocument.doc_name).ilike(pattern, escape="\\"),
                col(KBDocument.file_type).ilike(pattern, escape="\\"),
            ),
        )
    if status:
        conditions.append(col(KBDocument.status) == status)
    if source_type:
        conditions.append(col(KBDocument.source_type) == source_type)
    return conditions
async def count_documents_by_kb( + self, + kb_id: str, + search: str | None = None, + status: str | None = None, + source_type: str | None = None, + ) -> int: """统计知识库的文档数量""" async with self.get_db() as session: - stmt = select(func.count(col(KBDocument.id))).where( - col(KBDocument.kb_id) == kb_id, + conditions = self._build_document_filters( + kb_id=kb_id, + search=search, + status=status, + source_type=source_type, ) + stmt = select(func.count(col(KBDocument.id))).where(*conditions) result = await session.execute(stmt) return result.scalar() or 0 @@ -317,17 +677,98 @@ async def get_documents_with_metadata_batch( return metadata_map - async def delete_document_by_id(self, doc_id: str, vec_db: "FaissVecDB") -> None: + async def delete_document_by_id( + self, + doc_id: str, + vec_db: "FaissVecDB", + kb_id: str | None = None, + ) -> bool: """删除单个文档及其相关数据""" - # 在知识库表中删除 + doc = await self.get_document_by_id(doc_id) + if not doc or (kb_id is not None and doc.kb_id != kb_id): + return False + + metadata_filters = {"kb_doc_id": doc_id} + if kb_id is not None: + metadata_filters["kb_id"] = kb_id + + # 先删向量库;如果失败,保留 metadata 以便重试/修复。 + await vec_db.delete_documents(metadata_filters=metadata_filters) + async with self.get_db() as session, session.begin(): - # 删除文档记录 delete_stmt = delete(KBDocument).where(col(KBDocument.doc_id) == doc_id) + if kb_id is not None: + delete_stmt = delete_stmt.where(col(KBDocument.kb_id) == kb_id) await session.execute(delete_stmt) - await session.commit() + await session.execute(delete(KBMedia).where(col(KBMedia.doc_id) == doc_id)) + + return True + + async def delete_documents_by_ids( + self, + doc_ids: list[str], + vec_db: "FaissVecDB", + kb_id: str | None = None, + ) -> dict[str, bool]: + """批量删除文档及其向量数据。 + + 先删除向量数据,再删除 metadata;单个文档的 vec_db 删除失败 + 不影响其他文档(best-effort),失败项保留 metadata 以便重试。 + """ + if not doc_ids: + return {} + + requested_doc_ids = list(dict.fromkeys(doc_ids)) + results = dict.fromkeys(requested_doc_ids, False) + + 
candidates = requested_doc_ids + if kb_id is not None: + async with self.get_db() as session: + stmt = select(KBDocument.doc_id).where( + col(KBDocument.doc_id).in_(requested_doc_ids), + col(KBDocument.kb_id) == kb_id, + ) + result = await session.execute(stmt) + candidates = [row[0] for row in result.fetchall()] + + if not candidates: + return results + + async def _delete_one(doc_id: str) -> tuple[str, bool]: + metadata_filters = {"kb_doc_id": doc_id} + if kb_id is not None: + metadata_filters["kb_id"] = kb_id + try: + await vec_db.delete_documents(metadata_filters=metadata_filters) + return doc_id, True + except Exception as e: + logger.error( + f"删除文档 {doc_id} 的向量数据失败: {e}", + ) + return doc_id, False + + vec_results = await asyncio.gather( + *[_delete_one(doc_id) for doc_id in candidates], + ) + successful_doc_ids = [] + for doc_id, success in vec_results: + results[doc_id] = success + if success: + successful_doc_ids.append(doc_id) + + if successful_doc_ids: + async with self.get_db() as session, session.begin(): + delete_stmt = delete(KBDocument).where( + col(KBDocument.doc_id).in_(successful_doc_ids), + ) + if kb_id is not None: + delete_stmt = delete_stmt.where(col(KBDocument.kb_id) == kb_id) + await session.execute(delete_stmt) + await session.execute( + delete(KBMedia).where(col(KBMedia.doc_id).in_(successful_doc_ids)), + ) - # 在 vec db 中删除相关向量 - await vec_db.delete_documents(metadata_filters={"kb_doc_id": doc_id}) + return results # ===== 多媒体查询 ===== @@ -347,7 +788,7 @@ async def get_media_by_id(self, media_id: str) -> KBMedia | None: async def update_kb_stats(self, kb_id: str, vec_db: "FaissVecDB") -> None: """更新知识库统计信息""" - chunk_cnt = await vec_db.count_documents() + chunk_cnt = await vec_db.count_documents(metadata_filter={"kb_id": kb_id}) async with self.get_db() as session, session.begin(): update_stmt = ( @@ -363,3 +804,84 @@ async def update_kb_stats(self, kb_id: str, vec_db: "FaissVecDB") -> None: await session.execute(update_stmt) await 
async def get_kb_stats(self, kb_id: str) -> dict | None:
    """Return persisted document statistics for a knowledge base.

    Returns ``None`` when no knowledge base with ``kb_id`` exists. Otherwise
    the result combines the counters stored on the knowledge base row with
    aggregates computed from its document and media rows: per-status document
    counts, summed chunk counts, media count, stored source-file count and
    total stored bytes.
    """
    doc_in_kb = col(KBDocument.kb_id) == kb_id
    media_in_kb = col(KBMedia.kb_id) == kb_id
    has_stored_file = col(KBDocument.file_path) != ""

    async with self.get_db() as session:

        async def fetch_int(statement) -> int:
            # Run a single-value aggregate and coerce a missing value to 0.
            outcome = await session.execute(statement)
            return int(outcome.scalar() or 0)

        kb_rows = await session.execute(
            select(KnowledgeBase).where(col(KnowledgeBase.kb_id) == kb_id),
        )
        kb = kb_rows.scalar_one_or_none()
        if kb is None:
            return None

        status_rows = await session.execute(
            select(KBDocument.status, func.count(col(KBDocument.id)))
            .where(doc_in_kb)
            .group_by(KBDocument.status),
        )
        status_counts = {}
        for status, count in status_rows.all():
            # Rows without a status are reported under "unknown".
            status_counts[status or "unknown"] = count

        document_chunk_count = await fetch_int(
            select(func.coalesce(func.sum(col(KBDocument.chunk_count)), 0)).where(
                doc_in_kb,
            ),
        )
        media_count = await fetch_int(
            select(func.count(col(KBMedia.id))).where(media_in_kb),
        )
        source_file_count = await fetch_int(
            select(func.count(col(KBDocument.id))).where(
                doc_in_kb,
                col(KBDocument.source_type) == "file",
                has_stored_file,
            ),
        )
        document_storage_bytes = await fetch_int(
            select(func.coalesce(func.sum(col(KBDocument.file_size)), 0)).where(
                doc_in_kb,
                has_stored_file,
            ),
        )
        media_storage_bytes = await fetch_int(
            select(func.coalesce(func.sum(col(KBMedia.file_size)), 0)).where(
                media_in_kb,
            ),
        )

        in_progress_statuses = ("parsing", "chunking", "embedding")
        processing_document_count = sum(
            status_counts.get(name, 0) for name in in_progress_statuses
        )

        return {
            "kb_id": kb.kb_id,
            "kb_name": kb.kb_name,
            "doc_count": kb.doc_count,
            "chunk_count": kb.chunk_count,
            "document_count": sum(status_counts.values()),
            "ready_document_count": status_counts.get("ready", 0),
            "failed_document_count": status_counts.get("failed", 0),
            "pending_document_count": status_counts.get("pending", 0),
            "processing_document_count": processing_document_count,
            "indexed_chunk_count": kb.chunk_count,
            "document_chunk_count": document_chunk_count,
            "media_count": media_count,
            "source_file_count": source_file_count,
            "storage_bytes": document_storage_bytes + media_storage_bytes,
            "status_counts": status_counts,
            "created_at": kb.created_at.isoformat(),
            "updated_at": kb.updated_at.isoformat(),
        }
build_content_hash, + build_stored_source_path, + get_chunker_name, + get_parser_name, +) from .kb_db_sqlite import KBSQLiteDatabase from .models import KBDocument, KBMedia, KnowledgeBase -from .parsers.url_parser import extract_text_from_url +from .parsers.base import TextSegment +from .parsers.url_parser import URLExtractor, extract_text_from_url from .parsers.util import select_parser from .prompts import TEXT_REPAIR_SYSTEM_PROMPT if TYPE_CHECKING: from astrbot.core.db.vec_db.faiss_impl.vec_db import FaissVecDB + from astrbot.core.provider.manager import ProviderManager + + +DOCUMENT_REBUILD_PAGE_SIZE = 100 +CONSISTENCY_CHECK_PAGE_SIZE = 1000 +CONSISTENCY_REPAIR_TYPES = frozenset( + { + "orphan_vectors", + "chunk_count_mismatches", + }, +) +NON_PERSISTED_FAILURE_STAGES = frozenset({"deduplication"}) +MARKDOWN_AWARE_EXTENSIONS = frozenset( + { + ".adoc", + ".docx", + ".epub", + ".md", + ".markdown", + ".mdx", + ".mkd", + ".rst", + ".xls", + ".xlsx", + }, +) class RateLimiter: @@ -40,18 +82,20 @@ def __init__(self, max_rpm: int) -> None: self.max_per_minute = max_rpm self.interval = 60.0 / max_rpm if max_rpm > 0 else 0 self.last_call_time = 0 + self._lock = asyncio.Lock() async def __aenter__(self): if self.interval == 0: return - now = time.monotonic() - elapsed = now - self.last_call_time + async with self._lock: + now = time.monotonic() + elapsed = now - self.last_call_time - if elapsed < self.interval: - await asyncio.sleep(self.interval - elapsed) + if elapsed < self.interval: + await asyncio.sleep(self.interval - elapsed) - self.last_call_time = time.monotonic() + self.last_call_time = time.monotonic() async def __aexit__(self, exc_type, exc_val, exc_tb): pass @@ -114,6 +158,114 @@ def _compact_chunks(chunks: list[str]) -> list[str]: return [chunk.strip() for chunk in chunks if chunk and chunk.strip()] +def _estimate_text_tokens(text: str) -> int: + chinese_count = sum(1 for char in text if "\u4e00" <= char <= "\u9fff") + other_count = len(text) - 
chinese_count + return int(chinese_count * 0.6 + other_count * 0.3) + + +def _build_chunk_metadata( + *, + kb_id: str, + doc_id: str, + chunks_text: list[str], + chunk_ids: list[str], + chunk_extra_metadatas: list[dict] | None = None, +) -> list[dict]: + if chunk_extra_metadatas is not None and len(chunk_extra_metadatas) != len( + chunks_text + ): + raise ValueError("chunk_extra_metadatas length must match chunks_text length") + + metadatas = [] + start_offset = 0 + for idx, chunk_text in enumerate(chunks_text): + end_offset = start_offset + len(chunk_text) + metadata = { + "kb_id": kb_id, + "kb_doc_id": doc_id, + "chunk_index": idx, + "section_index": idx, + "content_hash": build_content_hash(chunk_text), + "char_count": len(chunk_text), + "token_count_estimate": _estimate_text_tokens(chunk_text), + "start_offset": start_offset, + "end_offset": end_offset, + "previous_chunk_id": chunk_ids[idx - 1] if idx > 0 else None, + "next_chunk_id": chunk_ids[idx + 1] if idx < len(chunk_ids) - 1 else None, + } + if chunk_extra_metadatas is not None: + metadata.update(chunk_extra_metadatas[idx]) + metadatas.append(metadata) + start_offset = end_offset + return metadatas + + +async def _chunk_text_with_metadata( + *, + chunker: BaseChunker, + text: str, + chunk_size: int, + chunk_overlap: int, + extra_metadata: dict | None = None, +) -> tuple[list[str], list[dict] | None]: + chunks_text = await chunker.chunk( + text, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + chunks_text = _compact_chunks(chunks_text) + if not chunks_text: + return [], [] if extra_metadata is not None else None + if extra_metadata is None: + return chunks_text, None + return chunks_text, [dict(extra_metadata) for _ in chunks_text] + + +async def _chunk_text_segments_with_metadata( + *, + chunker: BaseChunker, + text_segments: list[TextSegment], + chunk_size: int, + chunk_overlap: int, +) -> tuple[list[str], list[dict]]: + chunks_text: list[str] = [] + chunk_extra_metadatas: list[dict] = [] + 
def _build_duplicate_document_error(
    *,
    file_name: str,
    content_hash: str,
    existing_doc: KBDocument,
) -> KnowledgeBaseUploadError:
    """Create the upload error reported when identical content already exists.

    The error uses the ``deduplication`` stage and carries the uploaded file
    name, the content hash and the id/name of the existing document in its
    details.
    """
    duplicate_name = existing_doc.doc_name
    user_message = (
        f"重复文档:{file_name} 与已存在文档 {duplicate_name} 内容相同。"
    )
    details = {
        "file_name": file_name,
        "content_hash": content_hash,
        "existing_doc_id": existing_doc.doc_id,
        "existing_doc_name": duplicate_name,
    }
    return KnowledgeBaseUploadError(
        stage="deduplication",
        user_message=user_message,
        details=details,
    )
@staticmethod
def _get_upload_failure_stage(error: Exception) -> str:
    """Return the pipeline stage recorded on an upload error.

    ``KnowledgeBaseUploadError`` instances report their own ``stage``; any
    other exception is classified as ``"unknown"``.
    """
    return error.stage if isinstance(error, KnowledgeBaseUploadError) else "unknown"
session.begin(): + session.add(failed_doc) + await session.commit() + await session.refresh(failed_doc) + except Exception as persist_err: + logger.warning( + f"记录失败文档 {doc_id} 的元数据失败: {persist_err}", + ) + return False + + try: + await self.kb_db.update_kb_stats( + kb_id=self.kb.kb_id, + vec_db=self.vec_db, # type: ignore[arg-type] + ) + await self.refresh_kb() + await self.refresh_document(doc_id) + except Exception as stats_err: + logger.warning( + f"刷新失败文档 {doc_id} 的知识库统计失败: {stats_err}", + ) + return True + + @staticmethod + def _build_url_file_name(url: str) -> str: + file_name = url.split("/")[-1] or f"document_from_{url}" + if not Path(file_name).suffix: + file_name += ".url" + return file_name + + async def _persist_failed_url_document( + self, + *, + url: str, + text_content: str | None, + parent_doc_id: str | None, + document_version: int, + error: Exception, + ) -> bool: + return await self._persist_failed_document( + doc_id=str(uuid.uuid4()), + file_name=self._build_url_file_name(url), + file_type="url", + file_size=len(text_content) if text_content else 0, + stored_file_path=None, + source_type="url", + source_uri=url, + content_hash=( + build_content_hash(text_content) if text_content is not None else None + ), + parser_name=URLExtractor.__name__, + chunker_name=get_chunker_name(self.chunker), + parent_doc_id=parent_doc_id, + document_version=document_version, + error=error, + ) + async def upload_document( self, file_name: str, file_content: bytes | None, file_type: str, - chunk_size: int = 512, - chunk_overlap: int = 50, - batch_size: int = 32, - tasks_limit: int = 3, - max_retries: int = 3, + chunk_size: int = DEFAULT_CHUNK_SIZE, + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, + batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, + tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT, + max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES, progress_callback=None, pre_chunked_text: list[str] | None = None, + source_type: str | None = None, + source_uri: str | None = 
None, + source_content_hash: str | None = None, + source_parser_name: str | None = None, + source_chunker_name: str | None = None, + parent_doc_id: str | None = None, + document_version: int = 1, + skip_duplicate_check: bool = False, ) -> KBDocument: """上传并处理文档(带原子性保证和失败清理) @@ -242,20 +541,37 @@ async def upload_document( await self._ensure_vec_db() doc_id = str(uuid.uuid4()) media_paths: list[Path] = [] + stored_file_path: Path | None = None file_size = 0 - - # file_path = self.kb_files_dir / f"{doc_id}.{file_type}" - # async with aiofiles.open(file_path, "wb") as f: - # await f.write(file_content) + vectors_stored = False # 标记向量是否已写入, 用于失败回滚 + metadata_stored = False + failed_metadata_stored = False + effective_source_type = source_type or ( + "import" if pre_chunked_text is not None else "file" + ) + effective_source_uri = source_uri or file_name + content_hash: str | None = source_content_hash + parser_name: str | None = source_parser_name + chunker_name: str | None = source_chunker_name try: chunks_text = [] + chunk_extra_metadatas: list[dict] | None = None saved_media = [] if pre_chunked_text is not None: # 如果提供了预分块文本,直接使用 chunks_text = _compact_chunks(pre_chunked_text) file_size = sum(len(chunk) for chunk in chunks_text) + if content_hash is None: + content_hash = build_content_hash(chunks_text) + if chunker_name is None: + chunker_name = "pre_chunked" + if not skip_duplicate_check: + await self._ensure_not_duplicate_document( + file_name=file_name, + content_hash=content_hash, + ) logger.info(f"使用预分块文本进行上传,共 {len(chunks_text)} 个块。") else: # 否则,执行标准的文件解析和分块流程 @@ -265,6 +581,22 @@ async def upload_document( ) file_size = len(file_content) + content_hash = build_content_hash(file_content) + if not skip_duplicate_check: + await self._ensure_not_duplicate_document( + file_name=file_name, + content_hash=content_hash, + ) + + stored_file_path = build_stored_source_path( + self.kb_files_dir, + doc_id=doc_id, + file_name=file_name, + file_type=file_type, + ) + 
stored_file_path.parent.mkdir(parents=True, exist_ok=True) + async with aiofiles.open(stored_file_path, "wb") as f: + await f.write(file_content) # 阶段1: 解析文档 if progress_callback: @@ -272,6 +604,7 @@ async def upload_document( try: parser = await select_parser(f".{file_type}") + parser_name = get_parser_name(parser) parse_result = await parser.parse(file_content, file_name) except KnowledgeBaseUploadError: raise @@ -286,6 +619,7 @@ async def upload_document( ) from exc text_content = parse_result.text media_items = parse_result.media + text_segments = getattr(parse_result, "text_segments", None) if not text_content or not text_content.strip(): raise KnowledgeBaseUploadError( stage="parsing", @@ -316,24 +650,58 @@ async def upload_document( await progress_callback("chunking", 0, 100) try: - # 根据文件类型选择分块器:Markdown 文件使用结构感知分块 + # Use structure-aware chunking for Markdown and MarkItDown output. effective_chunker = self.chunker file_ext = Path(file_name).suffix.lower() if file_name else "" - if file_ext in (".md", ".markdown", ".mkd", ".mdx"): + if file_ext in MARKDOWN_AWARE_EXTENSIONS: effective_chunker = MarkdownChunker( chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) logger.info( - f"检测到 Markdown 文件 '{file_name}',使用 MarkdownChunker 进行结构化分块" + f"检测到 Markdown 兼容文档 '{file_name}',使用 MarkdownChunker 进行结构化分块" ) - chunks_text = await effective_chunker.chunk( - text_content, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - ) - chunks_text = _compact_chunks(chunks_text) + chunker_name = get_chunker_name(effective_chunker) + if isinstance(effective_chunker, MarkdownChunker): + structured_chunks = await effective_chunker.chunk_with_metadata( + text_content, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + chunks_text = [] + chunk_extra_metadatas = [] + for chunk in structured_chunks: + chunk_text = chunk.text.strip() + if not chunk_text: + continue + chunks_text.append(chunk_text) + chunk_extra_metadatas.append( + { + "title_path": chunk.title_path, 
+ "section_index": chunk.section_index, + } + ) + elif text_segments: + ( + chunks_text, + chunk_extra_metadatas, + ) = await _chunk_text_segments_with_metadata( + chunker=effective_chunker, + text_segments=text_segments, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + else: + ( + chunks_text, + chunk_extra_metadatas, + ) = await _chunk_text_with_metadata( + chunker=effective_chunker, + text=text_content, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) except KnowledgeBaseUploadError: raise except Exception as exc: @@ -363,16 +731,16 @@ async def upload_document( ) contents = [] - metadatas = [] for idx, chunk_text in enumerate(chunks_text): contents.append(chunk_text) - metadatas.append( - { - "kb_id": self.kb.kb_id, - "kb_doc_id": doc_id, - "chunk_index": idx, - }, - ) + chunk_ids = [str(uuid.uuid4()) for _ in chunks_text] + metadatas = _build_chunk_metadata( + kb_id=self.kb.kb_id, + doc_id=doc_id, + chunks_text=chunks_text, + chunk_ids=chunk_ids, + chunk_extra_metadatas=chunk_extra_metadatas, + ) if progress_callback: await progress_callback("chunking", 100, 100) @@ -386,11 +754,13 @@ async def embedding_progress_callback(current, total) -> None: await self.vec_db.insert_batch( contents=contents, metadatas=metadatas, + ids=chunk_ids, batch_size=batch_size, tasks_limit=tasks_limit, max_retries=max_retries, progress_callback=embedding_progress_callback, ) + vectors_stored = True except KnowledgeBaseUploadError: raise except Exception as exc: @@ -407,10 +777,20 @@ async def embedding_progress_callback(current, total) -> None: doc_name=file_name, file_type=file_type, file_size=file_size, - # file_path=str(file_path), - file_path="", + file_path=str(stored_file_path) if stored_file_path else "", + source_type=effective_source_type, + source_uri=effective_source_uri, + content_hash=content_hash, + parser_name=parser_name, + parser_version=DEFAULT_PARSER_VERSION if parser_name else None, + chunker_name=chunker_name, + 
chunker_version=DEFAULT_CHUNKER_VERSION if chunker_name else None, + status="ready", + indexed_at=datetime.now(timezone.utc), + version=document_version, + parent_doc_id=parent_doc_id, chunk_count=len(chunks_text), - media_count=0, + media_count=len(saved_media), ) try: async with self.kb_db.get_db() as session: @@ -419,6 +799,7 @@ async def embedding_progress_callback(current, total) -> None: for media in saved_media: session.add(media) await session.commit() + metadata_stored = True await session.refresh(doc) except KnowledgeBaseUploadError: @@ -453,15 +834,57 @@ async def embedding_progress_callback(current, total) -> None: logger.warning(f"上传文档失败: {e}", extra={"details": e.details}) else: logger.error(f"上传文档失败: {e}", exc_info=True) - # if file_path.exists(): - # file_path.unlink() - for media_path in media_paths: + # 回滚已写入的向量, 防止孤数据 + if vectors_stored and not metadata_stored: try: - if media_path.exists(): - media_path.unlink() - except Exception as me: - logger.warning(f"清理多媒体文件失败 {media_path}: {me}") + vec_db: FaissVecDB = self.vec_db # type: ignore + await vec_db.delete_documents( + metadata_filters={"kb_doc_id": doc_id}, + ) + logger.info(f"已清理文档 {doc_id} 的孤数据向量") + except Exception as cleanup_err: + logger.error( + f"清理文档 {doc_id} 向量回滚失败: {cleanup_err}", + ) + + if not metadata_stored: + failed_metadata_stored = await self._persist_failed_document( + doc_id=doc_id, + file_name=file_name, + file_type=file_type, + file_size=file_size, + stored_file_path=stored_file_path, + source_type=effective_source_type, + source_uri=effective_source_uri, + content_hash=content_hash, + parser_name=parser_name, + chunker_name=chunker_name, + parent_doc_id=parent_doc_id, + document_version=document_version, + error=e, + ) + + if ( + stored_file_path + and stored_file_path.exists() + and not metadata_stored + and not failed_metadata_stored + ): + try: + stored_file_path.unlink() + if stored_file_path.parent != self.kb_files_dir: + stored_file_path.parent.rmdir() + except 
async def get_document(self, doc_id: str) -> KBDocument | None:
    """Fetch a single document belonging to this knowledge base.

    Returns ``None`` when the document is found but its ``kb_id`` differs from
    this helper's knowledge base; otherwise returns whatever the database
    lookup produced.
    """
    found = await self.kb_db.get_document_by_id(doc_id)
    belongs_elsewhere = bool(found) and found.kb_id != self.kb.kb_id
    return None if belongs_elsewhere else found
doc + for doc_id in dict.fromkeys(doc_ids) + if (doc := await self.get_document(doc_id)) is not None + } + media_by_doc_id = { + doc_id: await self.kb_db.list_media_by_doc(doc_id) for doc_id in docs_by_id + } + results = await self.kb_db.delete_documents_by_ids( + doc_ids=doc_ids, + vec_db=self.vec_db, # type: ignore + kb_id=self.kb.kb_id, + ) + for doc_id, deleted in results.items(): + if deleted and doc_id in docs_by_id: + self._cleanup_document_files( + docs_by_id[doc_id], + media_by_doc_id.get(doc_id, []), + ) + await self.kb_db.update_kb_stats( + kb_id=self.kb.kb_id, + vec_db=self.vec_db, # type: ignore + ) + await self.refresh_kb() + return results + + async def rebuild_document( + self, + doc_id: str, + *, + chunk_size: int | None = None, + chunk_overlap: int | None = None, + batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, + tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT, + max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES, + progress_callback=None, + ) -> KBDocument: + doc = await self.get_document(doc_id) + if not doc: + raise ValueError(f"无法找到 ID 为 {doc_id} 的文档") + next_version = (doc.version or 1) + 1 + parent_doc_id = doc.parent_doc_id or doc.doc_id + effective_chunk_size = ( + chunk_size + if chunk_size is not None + else self.kb.chunk_size or DEFAULT_CHUNK_SIZE + ) + effective_chunk_overlap = ( + chunk_overlap + if chunk_overlap is not None + else self.kb.chunk_overlap or DEFAULT_CHUNK_OVERLAP + ) + + if doc.source_type == "file" and doc.file_path: + source_path = Path(doc.file_path).resolve(strict=False) + files_root = self.kb_files_dir.resolve(strict=False) + if not source_path.is_relative_to(files_root) or not source_path.exists(): + raise ValueError("无法找到可用于重建的原始文件") + + rebuilt_doc = await self.upload_document( + file_name=doc.doc_name, + file_content=source_path.read_bytes(), + file_type=doc.file_type, + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + 
max_retries=max_retries, + progress_callback=progress_callback, + source_type=doc.source_type, + source_uri=doc.source_uri or doc.doc_name, + parent_doc_id=parent_doc_id, + document_version=next_version, + skip_duplicate_check=True, + ) + elif doc.source_type == "url": + if not doc.source_uri: + raise ValueError("无法找到可用于重建的 URL 来源") + rebuilt_doc = await self.upload_from_url( + url=doc.source_uri, + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, + parent_doc_id=parent_doc_id, + document_version=next_version, + skip_duplicate_check=True, + ) + elif doc.source_type == "import": + imported_chunks = await self._get_import_rebuild_chunks(doc.doc_id) + if not imported_chunks: + raise ValueError("无法找到可用于重建的导入文本块") + rebuilt_doc = await self.upload_document( + file_name=doc.doc_name, + file_content=None, + file_type=doc.file_type, + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, + pre_chunked_text=imported_chunks, + source_type="import", + source_uri=doc.source_uri or doc.doc_name, + source_content_hash=build_content_hash(imported_chunks), + source_chunker_name=doc.chunker_name or "pre_chunked", + parent_doc_id=parent_doc_id, + document_version=next_version, + skip_duplicate_check=True, + ) + else: + raise ValueError("当前仅支持重建已保存原始文件、URL 或导入来源的文档") + + try: + await self.delete_document(doc_id) + except Exception as exc: + try: + await self.delete_document(rebuilt_doc.doc_id) + except Exception as cleanup_exc: + logger.error( + f"重建文档 {doc_id} 后清理新版本失败: {cleanup_exc}", + ) + raise KnowledgeBaseUploadError( + stage="rebuild", + user_message=( + "重建失败:新版本已生成,但替换旧文档时失败,已尝试回滚新版本。" + ), + details={ + "doc_id": doc_id, + "new_doc_id": rebuilt_doc.doc_id, + }, + ) from exc + return rebuilt_doc + + 
async def _get_import_rebuild_chunks(self, doc_id: str) -> list[str]: + chunks: list[dict] = [] + offset = 0 + while True: + page = await self.get_chunks_by_doc_id( + doc_id, + offset=offset, + limit=DOCUMENT_REBUILD_PAGE_SIZE, + ) + if not page: + break + chunks.extend(page) + if len(page) < DOCUMENT_REBUILD_PAGE_SIZE: + break + offset += DOCUMENT_REBUILD_PAGE_SIZE + + chunks.sort(key=lambda chunk: int(chunk.get("chunk_index") or 0)) + return [ + chunk["content"] + for chunk in chunks + if isinstance(chunk.get("content"), str) and chunk["content"].strip() + ] + + async def rebuild_all_documents( + self, + *, + chunk_size: int | None = None, + chunk_overlap: int | None = None, + batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, + tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT, + max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES, + progress_callback=None, + ) -> dict: + docs: list[KBDocument] = [] + offset = 0 + while True: + page = await self.list_documents( + offset=offset, + limit=DOCUMENT_REBUILD_PAGE_SIZE, + ) + docs.extend(page) + if len(page) < DOCUMENT_REBUILD_PAGE_SIZE: + break + offset += DOCUMENT_REBUILD_PAGE_SIZE + + rebuilt_docs = [] + failed_docs = [] + + total = len(docs) + for index, doc in enumerate(docs, start=1): + if progress_callback: + await progress_callback("rebuilding", index - 1, total) + try: + rebuilt = await self.rebuild_document( + doc.doc_id, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, + ) + rebuilt_docs.append(rebuilt.model_dump()) + except Exception as e: + logger.error(f"重建文档 {doc.doc_id} 失败: {e}") + failed_docs.append( + { + "doc_id": doc.doc_id, + "doc_name": doc.doc_name, + "error": str(e), + }, + ) + + if progress_callback: + await progress_callback("rebuilding", total, total) + + return { + "rebuilt": rebuilt_docs, + "failed": failed_docs, + "total": total, + "success_count": len(rebuilt_docs), + 
"failed_count": len(failed_docs), + } + + async def rebuild_documents( + self, + doc_ids: list[str], + *, + chunk_size: int | None = None, + chunk_overlap: int | None = None, + batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, + tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT, + max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES, + progress_callback=None, + ) -> dict: + rebuilt_docs = [] + failed_docs = [] + normalized_doc_ids = list(dict.fromkeys(doc_ids)) + + total = len(normalized_doc_ids) + for index, doc_id in enumerate(normalized_doc_ids, start=1): + if progress_callback: + await progress_callback("rebuilding", index - 1, total) + try: + rebuilt = await self.rebuild_document( + doc_id, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, + ) + rebuilt_docs.append(rebuilt.model_dump()) + except Exception as e: + logger.error(f"重建文档 {doc_id} 失败: {e}") + failed_doc = await self.get_document(doc_id) + failed_docs.append( + { + "doc_id": doc_id, + "doc_name": failed_doc.doc_name if failed_doc else doc_id, + "error": str(e), + }, + ) + + if progress_callback: + await progress_callback("rebuilding", total, total) + + return { + "rebuilt": rebuilt_docs, + "failed": failed_docs, + "total": total, + "success_count": len(rebuilt_docs), + "failed_count": len(failed_docs), + } + + def _cleanup_document_files( + self, + doc: KBDocument, + media_items: list[KBMedia], + ) -> None: + file_paths: list[Path] = [] + if doc.file_path: + file_paths.append(Path(doc.file_path)) + file_paths.extend(Path(media.file_path) for media in media_items) + + cleanup_roots = ( + self.kb_files_dir.resolve(strict=False), + self.kb_medias_dir.resolve(strict=False), + ) + for file_path in file_paths: + resolved_path = file_path.resolve(strict=False) + if not any(resolved_path.is_relative_to(root) for root in cleanup_roots): + logger.warning( + f"跳过清理知识库目录外文件: {resolved_path}", + ) + 
continue + try: + if resolved_path.exists(): + resolved_path.unlink() + parent = resolved_path.parent + if any(parent.is_relative_to(root) for root in cleanup_roots): + try: + parent.rmdir() + except OSError: + pass + except Exception as e: + logger.warning(f"清理知识库文件失败 {resolved_path}: {e}") + async def delete_chunk(self, chunk_id: str, doc_id: str) -> None: """删除单个文本块及其相关数据""" vec_db: FaissVecDB = self.vec_db # type: ignore - await vec_db.delete(chunk_id) + deleted = await vec_db.delete(chunk_id) + if not deleted: + raise ValueError(f"无法找到 ID 为 {chunk_id} 的文本块") await self.kb_db.update_kb_stats( kb_id=self.kb.kb_id, vec_db=self.vec_db, # type: ignore @@ -534,20 +1304,102 @@ async def get_chunks_by_doc_id( offset=offset, limit=limit, ) - result = [] - for chunk in chunks: - chunk_md = json.loads(chunk["metadata"]) - result.append( - { - "chunk_id": chunk["doc_id"], - "doc_id": chunk_md["kb_doc_id"], - "kb_id": chunk_md["kb_id"], - "chunk_index": chunk_md["chunk_index"], - "content": chunk["text"], - "char_count": len(chunk["text"]), - }, + return [self._format_chunk_response(chunk) for chunk in chunks] + + async def search_chunks_by_doc_id( + self, + doc_id: str, + search: str | None = None, + offset: int = 0, + limit: int = 100, + ) -> tuple[list[dict], int]: + """Search or list chunks for one document with a matching total.""" + if not search: + chunks = await self.get_chunks_by_doc_id( + doc_id=doc_id, + offset=offset, + limit=limit, + ) + return chunks, await self.get_chunk_count_by_doc_id(doc_id) + + vec_db: FaissVecDB = self.vec_db # type: ignore + search_documents = getattr(vec_db.document_storage, "search_documents", None) + if search_documents is None: + return [], 0 + + result = await search_documents( + search, + metadata_filters={"kb_doc_id": doc_id}, + offset=offset, + limit=limit, + ) + if result is None: + return [], 0 + chunks, total = result + return [self._format_chunk_response(chunk) for chunk in chunks], total + + @staticmethod + def 
_format_chunk_response(chunk: dict) -> dict: + chunk_md = json.loads(chunk["metadata"]) + char_count = chunk_md.get("char_count", len(chunk["text"])) + return { + "chunk_id": chunk["doc_id"], + "doc_id": chunk_md["kb_doc_id"], + "kb_id": chunk_md["kb_id"], + "chunk_index": chunk_md["chunk_index"], + "section_index": chunk_md.get("section_index"), + "content": chunk["text"], + "char_count": char_count, + "token_count_estimate": chunk_md.get("token_count_estimate"), + "content_hash": chunk_md.get("content_hash"), + "start_offset": chunk_md.get("start_offset"), + "end_offset": chunk_md.get("end_offset"), + "previous_chunk_id": chunk_md.get("previous_chunk_id"), + "next_chunk_id": chunk_md.get("next_chunk_id"), + "title_path": chunk_md.get("title_path"), + "page_number": chunk_md.get("page_number"), + "parent_chunk_id": chunk_md.get("parent_chunk_id"), + } + + async def get_chunk_by_id( + self, + chunk_id: str, + doc_id: str | None = None, + ) -> dict | None: + """获取单个文本块及其元数据""" + vec_db: FaissVecDB = self.vec_db # type: ignore + chunk = await vec_db.document_storage.get_document_by_doc_id(chunk_id) + if not chunk: + return None + formatted_chunk = self._format_chunk_response(chunk) + if doc_id and formatted_chunk["doc_id"] != doc_id: + return None + return formatted_chunk + + async def get_chunk_context(self, chunk_id: str, doc_id: str) -> dict: + """获取文本块和相邻上下文块""" + current = await self.get_chunk_by_id(chunk_id, doc_id) + if not current: + raise ValueError(f"无法找到 ID 为 {chunk_id} 的文本块") + + previous_chunk = None + next_chunk = None + if current.get("previous_chunk_id"): + previous_chunk = await self.get_chunk_by_id( + current["previous_chunk_id"], + doc_id, + ) + if current.get("next_chunk_id"): + next_chunk = await self.get_chunk_by_id( + current["next_chunk_id"], + doc_id, ) - return result + + return { + "previous": previous_chunk, + "current": current, + "next": next_chunk, + } async def get_chunk_count_by_doc_id(self, doc_id: str) -> int: """获取文档的块数量""" @@ 
-555,6 +1407,434 @@ async def get_chunk_count_by_doc_id(self, doc_id: str) -> int: count = await vec_db.count_documents(metadata_filter={"kb_doc_id": doc_id}) return count + async def check_consistency(self) -> dict: + """Return a read-only consistency report for document metadata and chunks.""" + docs = await self._list_all_documents_for_consistency() + doc_by_id = {doc.doc_id: doc for doc in docs} + stored_chunks = await self._list_all_chunks_for_consistency() + + chunks_by_doc_id: dict[str, list[dict]] = {} + orphan_vectors: list[dict] = [] + invalid_vector_metadata: list[dict] = [] + + for chunk in stored_chunks: + try: + metadata = self._parse_stored_chunk_metadata(chunk) + except ValueError as exc: + invalid_vector_metadata.append( + self._format_vector_issue(chunk, metadata_error=str(exc)), + ) + continue + + doc_id = metadata.get("kb_doc_id") + if not isinstance(doc_id, str) or not doc_id: + invalid_vector_metadata.append( + self._format_vector_issue( + chunk, + metadata=metadata, + metadata_error="missing kb_doc_id", + ), + ) + continue + + if doc_id not in doc_by_id: + orphan_vectors.append( + self._format_vector_issue(chunk, metadata=metadata), + ) + continue + + chunks_by_doc_id.setdefault(doc_id, []).append(chunk) + + missing_vectors: list[dict] = [] + chunk_count_mismatches: list[dict] = [] + for doc in docs: + expected_chunk_count = int(doc.chunk_count or 0) + actual_chunk_count = len(chunks_by_doc_id.get(doc.doc_id, [])) + if expected_chunk_count > 0 and actual_chunk_count == 0: + missing_vectors.append( + self._format_document_issue( + doc, + expected_chunk_count=expected_chunk_count, + actual_chunk_count=actual_chunk_count, + ), + ) + if expected_chunk_count != actual_chunk_count: + chunk_count_mismatches.append( + self._format_document_issue( + doc, + expected_chunk_count=expected_chunk_count, + actual_chunk_count=actual_chunk_count, + ), + ) + + missing_source_files, unsafe_source_paths, source_file_count = ( + 
self._check_source_file_consistency(docs) + ) + + status_counts: dict[str, int] = {} + for doc in docs: + status = doc.status or "unknown" + status_counts[status] = status_counts.get(status, 0) + 1 + + issues = { + "missing_vectors": missing_vectors, + "orphan_vectors": orphan_vectors, + "missing_source_files": missing_source_files, + "chunk_count_mismatches": chunk_count_mismatches, + "invalid_vector_metadata": invalid_vector_metadata, + "unsafe_source_paths": unsafe_source_paths, + } + issue_counts = {name: len(items) for name, items in issues.items()} + + return { + "kb_id": self.kb.kb_id, + "kb_name": self.kb.kb_name, + "checked_at": datetime.now(timezone.utc).isoformat(), + "summary": { + "sqlite_document_count": len(docs), + "ready_document_count": status_counts.get("ready", 0), + "failed_document_count": status_counts.get("failed", 0), + "document_chunk_count": sum(int(doc.chunk_count or 0) for doc in docs), + "indexed_chunk_count": len(stored_chunks), + "source_file_count": source_file_count, + "status_counts": status_counts, + **issue_counts, + "healthy": all(count == 0 for count in issue_counts.values()), + }, + "issues": issues, + } + + async def repair_consistency( + self, + repair_types: list[str] | None = None, + ) -> dict: + """Repair low-risk consistency issues and report skipped unsafe issues.""" + selected_repair_types = self._normalize_consistency_repair_types(repair_types) + pre_check = await self.check_consistency() + + repaired: list[dict] = [] + skipped: list[dict] = [] + failed: list[dict] = [] + + if "orphan_vectors" in selected_repair_types: + orphan_vectors = pre_check["issues"].get("orphan_vectors", []) + orphan_doc_ids = sorted( + { + issue.get("doc_id") + for issue in orphan_vectors + if isinstance(issue.get("doc_id"), str) and issue.get("doc_id") + }, + ) + for doc_id in orphan_doc_ids: + issue_count = sum( + 1 for issue in orphan_vectors if issue.get("doc_id") == doc_id + ) + try: + await self.vec_db.delete_documents( # type: 
ignore[attr-defined] + metadata_filters={ + "kb_id": self.kb.kb_id, + "kb_doc_id": doc_id, + }, + ) + repaired.append( + { + "type": "orphan_vectors", + "doc_id": doc_id, + "count": issue_count, + "action": "deleted_vectors", + }, + ) + except Exception as exc: + failed.append( + { + "type": "orphan_vectors", + "doc_id": doc_id, + "count": issue_count, + "action": "delete_vectors", + "error": str(exc), + }, + ) + + if "chunk_count_mismatches" in selected_repair_types: + for issue in pre_check["issues"].get("chunk_count_mismatches", []): + doc_id = issue.get("doc_id") + expected_count = int(issue.get("expected_chunk_count") or 0) + actual_count = int(issue.get("actual_chunk_count") or 0) + if not isinstance(doc_id, str) or not doc_id: + skipped.append( + { + "type": "chunk_count_mismatches", + "reason": "missing_doc_id", + "issue": issue, + }, + ) + continue + + if expected_count > actual_count: + skipped.append( + { + "type": "chunk_count_mismatches", + "doc_id": doc_id, + "reason": "missing_vectors_require_rebuild", + "expected_chunk_count": expected_count, + "actual_chunk_count": actual_count, + }, + ) + continue + + try: + await self.refresh_document(doc_id) + repaired.append( + { + "type": "chunk_count_mismatches", + "doc_id": doc_id, + "action": "refreshed_document_chunk_count", + "expected_chunk_count": expected_count, + "actual_chunk_count": actual_count, + }, + ) + except Exception as exc: + failed.append( + { + "type": "chunk_count_mismatches", + "doc_id": doc_id, + "action": "refresh_document", + "expected_chunk_count": expected_count, + "actual_chunk_count": actual_count, + "error": str(exc), + }, + ) + + for issue_type in ( + "missing_vectors", + "missing_source_files", + "invalid_vector_metadata", + "unsafe_source_paths", + ): + for issue in pre_check["issues"].get(issue_type, []): + skipped.append( + { + "type": issue_type, + "doc_id": issue.get("doc_id"), + "chunk_id": issue.get("chunk_id"), + "reason": self._get_consistency_repair_skip_reason( + 
issue_type, + ), + "issue": issue, + }, + ) + + if repaired or failed: + await self.kb_db.update_kb_stats( + kb_id=self.kb.kb_id, + vec_db=self.vec_db, # type: ignore + ) + await self.refresh_kb() + + post_check = await self.check_consistency() + return { + "kb_id": self.kb.kb_id, + "kb_name": self.kb.kb_name, + "repaired_at": datetime.now(timezone.utc).isoformat(), + "repair_types": selected_repair_types, + "summary": { + "repaired_count": len(repaired), + "skipped_count": len(skipped), + "failed_count": len(failed), + "healthy_after_repair": post_check["summary"]["healthy"], + }, + "actions": { + "repaired": repaired, + "skipped": skipped, + "failed": failed, + }, + "pre_check": pre_check, + "post_check": post_check, + } + + @staticmethod + def _normalize_consistency_repair_types( + repair_types: list[str] | None, + ) -> list[str]: + if repair_types is None: + return sorted(CONSISTENCY_REPAIR_TYPES) + + normalized = list( + dict.fromkeys( + repair_type.strip() + for repair_type in repair_types + if isinstance(repair_type, str) and repair_type.strip() + ), + ) + invalid_types = sorted(set(normalized) - CONSISTENCY_REPAIR_TYPES) + if invalid_types: + raise ValueError( + f"不支持的一致性修复类型: {', '.join(invalid_types)}", + ) + return normalized + + @staticmethod + def _get_consistency_repair_skip_reason(issue_type: str) -> str: + skip_reasons = { + "missing_vectors": "document_rebuild_required", + "missing_source_files": "source_file_missing_manual_action_required", + "invalid_vector_metadata": "invalid_metadata_manual_action_required", + "unsafe_source_paths": "unsafe_source_path_manual_action_required", + } + return skip_reasons.get(issue_type, "manual_action_required") + + async def _list_all_documents_for_consistency(self) -> list[KBDocument]: + return await self._collect_paginated_documents( + page_size=CONSISTENCY_CHECK_PAGE_SIZE, + ) + + async def _list_all_chunks_for_consistency(self) -> list[dict]: + return await self._collect_paginated_vector_documents( + 
page_size=CONSISTENCY_CHECK_PAGE_SIZE, + unsupported_message="当前知识库存储后端不支持一致性检查", + ) + + @staticmethod + def _parse_stored_chunk_metadata(chunk: dict) -> dict: + raw_metadata = chunk.get("metadata") + if raw_metadata is None: + return {} + if isinstance(raw_metadata, dict): + return raw_metadata + try: + metadata = json.loads(raw_metadata) + except (TypeError, json.JSONDecodeError) as exc: + raise ValueError("invalid metadata JSON") from exc + if not isinstance(metadata, dict): + raise ValueError("metadata must be a JSON object") + return metadata + + @staticmethod + def _format_vector_issue( + chunk: dict, + *, + metadata: dict | None = None, + metadata_error: str | None = None, + ) -> dict: + issue = { + "chunk_id": chunk.get("doc_id"), + "storage_id": chunk.get("id"), + } + if metadata: + issue.update( + { + "doc_id": metadata.get("kb_doc_id"), + "kb_id": metadata.get("kb_id"), + "chunk_index": metadata.get("chunk_index"), + }, + ) + if metadata_error: + issue["metadata_error"] = metadata_error + return issue + + @staticmethod + def _format_document_issue( + doc: KBDocument, + *, + expected_chunk_count: int | None = None, + actual_chunk_count: int | None = None, + reason: str | None = None, + ) -> dict: + issue = { + "doc_id": doc.doc_id, + "doc_name": doc.doc_name, + "status": doc.status, + "source_type": doc.source_type, + "file_path": doc.file_path, + } + if expected_chunk_count is not None: + issue["expected_chunk_count"] = expected_chunk_count + if actual_chunk_count is not None: + issue["actual_chunk_count"] = actual_chunk_count + if reason: + issue["reason"] = reason + return issue + + def _check_source_file_consistency( + self, + docs: list[KBDocument], + ) -> tuple[list[dict], list[dict], int]: + missing_source_files: list[dict] = [] + unsafe_source_paths: list[dict] = [] + source_file_count = 0 + files_root = self.kb_files_dir.resolve(strict=False) + + for doc in docs: + if doc.source_type != "file": + continue + + if not doc.file_path: + if 
doc.status == "ready": + missing_source_files.append( + self._format_document_issue(doc, reason="empty_file_path"), + ) + continue + + file_path = Path(doc.file_path).resolve(strict=False) + if not file_path.is_relative_to(files_root): + unsafe_source_paths.append( + self._format_document_issue( + doc, + reason="outside_kb_files_dir", + ), + ) + continue + if file_path.exists(): + source_file_count += 1 + else: + missing_source_files.append( + self._format_document_issue(doc, reason="not_found"), + ) + + return missing_source_files, unsafe_source_paths, source_file_count + + async def _collect_paginated_documents(self, *, page_size: int) -> list[KBDocument]: + docs: list[KBDocument] = [] + offset = 0 + while True: + page = await self.list_documents( + offset=offset, + limit=page_size, + ) + docs.extend(page) + if len(page) < page_size: + break + offset += page_size + return docs + + async def _collect_paginated_vector_documents( + self, + *, + page_size: int, + unsupported_message: str, + ) -> list[dict]: + document_storage = getattr(self.vec_db, "document_storage", None) + get_documents = getattr(document_storage, "get_documents", None) + if get_documents is None: + raise ValueError(unsupported_message) + + chunks: list[dict] = [] + offset = 0 + while True: + page_result = get_documents( + metadata_filters={"kb_id": self.kb.kb_id}, + offset=offset, + limit=page_size, + ) + if not hasattr(page_result, "__await__"): + raise ValueError(unsupported_message) + page = await page_result + chunks.extend(page) + if len(page) < page_size: + break + offset += page_size + return chunks + async def _save_media( self, doc_id: str, @@ -589,14 +1869,17 @@ async def _save_media( async def upload_from_url( self, url: str, - chunk_size: int = 512, - chunk_overlap: int = 50, - batch_size: int = 32, - tasks_limit: int = 3, - max_retries: int = 3, + chunk_size: int = DEFAULT_CHUNK_SIZE, + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, + batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, + 
tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT, + max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES, progress_callback=None, enable_cleaning: bool = False, cleaning_provider_id: str | None = None, + parent_doc_id: str | None = None, + document_version: int = 1, + skip_duplicate_check: bool = False, ) -> KBDocument: """从 URL 上传并处理文档(带原子性保证和失败清理) Args: @@ -616,52 +1899,100 @@ async def upload_from_url( ValueError: 如果 URL 为空或无法提取内容 IOError: 如果网络请求失败 """ - # 获取 Tavily API 密钥 - config = self.prov_mgr.acm.default_conf - tavily_keys = config.get("provider_settings", {}).get( - "websearch_tavily_key", [] - ) - if not tavily_keys: - raise ValueError( - "Error: Tavily API key is not configured in provider_settings." + text_content: str | None = None + try: + # 获取 Tavily API 密钥 + config = self.prov_mgr.acm.default_conf + tavily_keys = config.get("provider_settings", {}).get( + "websearch_tavily_key", [] ) + if not tavily_keys: + raise KnowledgeBaseUploadError( + stage="configuration", + user_message=( + "URL 导入失败:Tavily API key 未配置。" + "请先在 provider_settings 中配置 websearch_tavily_key。" + ), + details={"url": url}, + ) - # 阶段1: 从 URL 提取内容 - if progress_callback: - await progress_callback("extracting", 0, 100) + # 阶段1: 从 URL 提取内容 + if progress_callback: + await progress_callback("extracting", 0, 100) - try: - text_content = await extract_text_from_url(url, tavily_keys) - except Exception as e: - logger.error(f"Failed to extract content from URL {url}: {e}") - raise OSError(f"Failed to extract content from URL {url}: {e}") from e + try: + text_content = await extract_text_from_url(url, tavily_keys) + except KnowledgeBaseUploadError: + raise + except Exception as e: + logger.error(f"Failed to extract content from URL {url}: {e}") + raise KnowledgeBaseUploadError( + stage="extracting", + user_message=( + "URL 导入失败:无法提取网页内容。" + "请确认 URL 可访问且 Tavily 配置有效。" + ), + details={"url": url}, + ) from e - if not text_content: - raise ValueError(f"No content extracted from URL: {url}") + if not 
text_content or not text_content.strip(): + raise KnowledgeBaseUploadError( + stage="extracting", + user_message=( + "URL 导入失败:未能从网页中提取可索引文本。" + "请确认页面存在正文内容,或尝试更换 URL。" + ), + details={"url": url}, + ) - if progress_callback: - await progress_callback("extracting", 100, 100) + if progress_callback: + await progress_callback("extracting", 100, 100) - # 阶段2: (可选)清洗内容并分块 - final_chunks = await self._clean_and_rechunk_content( - content=text_content, - url=url, - progress_callback=progress_callback, - enable_cleaning=enable_cleaning, - cleaning_provider_id=cleaning_provider_id, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - ) + # 阶段2: (可选)清洗内容并分块 + try: + final_chunks = await self._clean_and_rechunk_content( + content=text_content, + url=url, + progress_callback=progress_callback, + enable_cleaning=enable_cleaning, + cleaning_provider_id=cleaning_provider_id, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + except KnowledgeBaseUploadError: + raise + except Exception as e: + stage = "cleaning" if enable_cleaning else "chunking" + raise KnowledgeBaseUploadError( + stage=stage, + user_message=( + "URL 导入失败:网页内容切分失败。" + "请稍后重试,或调整分块参数后再次导入。" + ), + details={"url": url}, + ) from e - if enable_cleaning and not final_chunks: - raise ValueError( - "内容清洗后未提取到有效文本。请尝试关闭内容清洗功能,或更换更高性能的LLM模型后重试。" + if enable_cleaning and not final_chunks: + raise KnowledgeBaseUploadError( + stage="cleaning", + user_message=( + "URL 导入失败:内容清洗后未提取到有效文本。" + "请尝试关闭内容清洗功能,或更换更高性能的 LLM 模型后重试。" + ), + details={"url": url}, + ) + except Exception as e: + await self._persist_failed_url_document( + url=url, + text_content=text_content, + parent_doc_id=parent_doc_id, + document_version=document_version, + error=e, ) + raise # 创建一个虚拟文件名 - file_name = url.split("/")[-1] or f"document_from_{url}" - if not Path(file_name).suffix: - file_name += ".url" + file_name = self._build_url_file_name(url) # 复用现有的 upload_document 方法,但传入预分块文本 return await self.upload_document( @@ -675,6 +2006,14 @@ 
async def upload_from_url( max_retries=max_retries, progress_callback=progress_callback, pre_chunked_text=final_chunks, + source_type="url", + source_uri=url, + source_content_hash=build_content_hash(text_content), + source_parser_name=URLExtractor.__name__, + source_chunker_name=get_chunker_name(self.chunker), + parent_doc_id=parent_doc_id, + document_version=document_version, + skip_duplicate_check=skip_duplicate_check, ) async def _clean_and_rechunk_content( @@ -685,8 +2024,8 @@ async def _clean_and_rechunk_content( enable_cleaning: bool = False, cleaning_provider_id: str | None = None, repair_max_rpm: int = 60, - chunk_size: int = 512, - chunk_overlap: int = 50, + chunk_size: int = DEFAULT_CHUNK_SIZE, + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, ) -> list[str]: """ 对从 URL 获取的内容进行清洗、修复、翻译和重新分块。 diff --git a/astrbot/core/knowledge_base/kb_mgr.py b/astrbot/core/knowledge_base/kb_mgr.py index 3285d42c79..dc1dab016e 100644 --- a/astrbot/core/knowledge_base/kb_mgr.py +++ b/astrbot/core/knowledge_base/kb_mgr.py @@ -1,22 +1,80 @@ +import asyncio +import time from pathlib import Path +from typing import TYPE_CHECKING + +from sqlalchemy import delete +from sqlmodel import col from astrbot.core import logger -from astrbot.core.provider.manager import ProviderManager from astrbot.core.utils.astrbot_path import get_astrbot_knowledge_base_path # from .chunking.fixed_size import FixedSizeChunker +from .capabilities import ( + DEFAULT_CHUNK_OVERLAP, + DEFAULT_CHUNK_SIZE, + DEFAULT_INDEX_TYPE, + DEFAULT_TOP_K_DENSE, + DEFAULT_TOP_K_SPARSE, + DEFAULT_TOP_M_FINAL, + DEFAULT_UPLOAD_BATCH_SIZE, + DEFAULT_UPLOAD_MAX_RETRIES, + DEFAULT_UPLOAD_TASKS_LIMIT, +) from .chunking.recursive import RecursiveCharacterChunker from .kb_db_sqlite import KBSQLiteDatabase from .kb_helper import KBHelper -from .models import KBDocument, KnowledgeBase +from .models import ( + KBDocument, + KBMedia, + KnowledgeBase, +) from .retrieval.manager import RetrievalManager, RetrievalResult from 
.retrieval.rank_fusion import RankFusion from .retrieval.sparse_retriever import SparseRetriever +if TYPE_CHECKING: + from astrbot.core.provider.manager import ProviderManager + FILES_PATH = get_astrbot_knowledge_base_path() DB_PATH = Path(FILES_PATH) / "kb.db" """Knowledge Base storage root directory""" CHUNKER = RecursiveCharacterChunker() +_UNSET = object() +INIT_RETRY_COOLDOWN_SECONDS = 60.0 +INIT_RETRY_MAX_ATTEMPTS = 3 +VALID_INDEX_TYPES = {"flat", "hnsw"} + + +def _validate_kb_options( + *, + chunk_size: int | None, + chunk_overlap: int | None, + top_k_dense: int | None, + top_k_sparse: int | None, + top_m_final: int | None, + index_type: str | None, +) -> None: + if chunk_size is not None and chunk_size <= 0: + raise ValueError("chunk_size 必须大于 0") + if chunk_overlap is not None and chunk_overlap < 0: + raise ValueError("chunk_overlap 不能为负数") + if ( + chunk_size is not None + and chunk_overlap is not None + and chunk_overlap >= chunk_size + ): + raise ValueError("chunk_overlap 必须小于 chunk_size") + if top_k_dense is not None and top_k_dense <= 0: + raise ValueError("top_k_dense 必须大于 0") + if top_k_sparse is not None and top_k_sparse <= 0: + raise ValueError("top_k_sparse 必须大于 0") + if top_m_final is not None and top_m_final <= 0: + raise ValueError("top_m_final 必须大于 0") + if index_type is not None and index_type not in VALID_INDEX_TYPES: + raise ValueError( + f"index_type 必须是 {', '.join(sorted(VALID_INDEX_TYPES))} 之一" + ) class KnowledgeBaseManager: @@ -25,13 +83,86 @@ class KnowledgeBaseManager: def __init__( self, - provider_manager: ProviderManager, + provider_manager: "ProviderManager", ) -> None: DB_PATH.parent.mkdir(parents=True, exist_ok=True) self.provider_manager = provider_manager self._session_deleted_callback_registered = False self.kb_insts: dict[str, KBHelper] = {} + self._kb_name_index: dict[str, str] = {} + self._kb_instances_lock = asyncio.Lock() + + def _ensure_kb_name_index(self) -> None: + if not hasattr(self, "kb_insts"): + self.kb_insts = 
def _can_retry_helper_init(self, kb_helper: KBHelper) -> bool:
    """Tell whether a failed helper may attempt initialization again.

    A retry is allowed only when the helper carries an ``init_error``, has
    used fewer than ``INIT_RETRY_MAX_ATTEMPTS`` retries, and at least
    ``INIT_RETRY_COOLDOWN_SECONDS`` have passed on the monotonic clock since
    the last attempt. Missing bookkeeping attributes are read as ``0`` and
    ``0.0`` respectively.
    """
    if kb_helper.init_error:
        attempts_used = getattr(kb_helper, "init_retry_count", 0)
        if attempts_used < INIT_RETRY_MAX_ATTEMPTS:
            previous_attempt = getattr(kb_helper, "last_init_retry_at", 0.0)
            elapsed = time.monotonic() - previous_attempt
            return elapsed >= INIT_RETRY_COOLDOWN_SECONDS
    return False
str) -> None: + self._ensure_kb_name_index() + self.kb_insts.pop(kb_id, None) + self._kb_name_index = { + name: indexed_kb_id + for name, indexed_kb_id in self._kb_name_index.items() + if indexed_kb_id != kb_id + } async def initialize(self) -> None: """初始化知识库模块""" @@ -76,11 +207,13 @@ async def load_kbs(self) -> None: await kb_helper.initialize() except Exception as e: kb_helper.init_error = str(e) + kb_helper.init_retry_count = 0 + kb_helper.last_init_retry_at = time.monotonic() logger.error( f"知识库 {record.kb_name}({record.kb_id}) 初始化失败: {e}", exc_info=True, ) - self.kb_insts[record.kb_id] = kb_helper + self._set_kb_instance(record.kb_id, kb_helper) async def create_kb( self, @@ -94,206 +227,335 @@ async def create_kb( top_k_dense: int | None = None, top_k_sparse: int | None = None, top_m_final: int | None = None, + index_type: str | None = None, ) -> KBHelper: """创建新的知识库实例""" if embedding_provider_id is None: raise ValueError("创建知识库时必须提供embedding_provider_id") + effective_chunk_size = ( + chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE + ) + effective_chunk_overlap = ( + chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP + ) + effective_top_k_dense = ( + top_k_dense if top_k_dense is not None else DEFAULT_TOP_K_DENSE + ) + effective_top_k_sparse = ( + top_k_sparse if top_k_sparse is not None else DEFAULT_TOP_K_SPARSE + ) + effective_top_m_final = ( + top_m_final if top_m_final is not None else DEFAULT_TOP_M_FINAL + ) + effective_index_type = ( + index_type if index_type is not None else DEFAULT_INDEX_TYPE + ) + _validate_kb_options( + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + top_k_dense=effective_top_k_dense, + top_k_sparse=effective_top_k_sparse, + top_m_final=effective_top_m_final, + index_type=effective_index_type, + ) kb = KnowledgeBase( kb_name=kb_name, description=description, emoji=emoji or "📚", embedding_provider_id=embedding_provider_id, rerank_provider_id=rerank_provider_id, - 
async def get_kb(self, kb_id: str) -> KBHelper | None:
    """Return the helper registered for ``kb_id``, or ``None`` if absent.

    The lookup runs while holding the instance lock. A helper that is found
    is first passed to ``_retry_helper_init_if_due`` and then returned,
    whether or not a retry actually took place.
    """
    async with self._ensure_kb_instances_lock():
        found = self._get_kb_unlocked(kb_id)
        if found is None:
            return None
        await self._retry_helper_init_if_due(found)
        return found
async def update_kb(
    self,
    kb_id: str,
    kb_name: str | None = None,
    description: str | None = None,
    emoji: str | None = None,
    embedding_provider_id: str | None = None,
    rerank_provider_id: str | None | object = _UNSET,
    chunk_size: int | None = None,
    chunk_overlap: int | None = None,
    top_k_dense: int | None = None,
    top_k_sparse: int | None = None,
    top_m_final: int | None = None,
    index_type: str | None = None,
) -> KBHelper | None:
    """Update a knowledge base and swap in a freshly initialized helper.

    Args:
        kb_id: ID of the knowledge base to update.
        rerank_provider_id: Leave at ``_UNSET`` to keep the current value;
            pass ``None`` explicitly to clear it.
        Other keyword arguments: ``None`` means "keep the current value".

    Returns:
        ``None`` when ``kb_id`` is unknown. The new helper when the update
        succeeds. The *old* helper, with all settings rolled back, when the
        new helper fails to initialize.

    Raises:
        ValueError: If the merged settings fail ``_validate_kb_options``;
            nothing has been modified at that point.
        Exception: Whatever the database layer raises while persisting; the
            in-memory settings are rolled back before re-raising.
    """
    requested = {
        "kb_name": kb_name,
        "description": description,
        "emoji": emoji,
        "embedding_provider_id": embedding_provider_id,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "top_k_dense": top_k_dense,
        "top_k_sparse": top_k_sparse,
        "top_m_final": top_m_final,
        "index_type": index_type,
    }
    tracked_fields = (
        "kb_name",
        "description",
        "emoji",
        "embedding_provider_id",
        "rerank_provider_id",
        "chunk_size",
        "chunk_overlap",
        "top_k_dense",
        "top_k_sparse",
        "top_m_final",
        "index_type",
    )

    async with self._ensure_kb_instances_lock():
        kb_helper = self._get_kb_unlocked(kb_id)
        if not kb_helper:
            return None

        kb = kb_helper.kb
        previous_state = {field: getattr(kb, field) for field in tracked_fields}
        previous_init_error = kb_helper.init_error

        def restore_previous_state() -> None:
            for field, value in previous_state.items():
                setattr(kb, field, value)

        candidate_state = previous_state.copy()
        candidate_state.update(
            {field: value for field, value in requested.items() if value is not None}
        )
        # None is a meaningful value here (clears the reranker), so a
        # sentinel distinguishes "not provided" from "set to None".
        if rerank_provider_id is not _UNSET:
            candidate_state["rerank_provider_id"] = rerank_provider_id

        # Validate the merged view before touching the live model.
        _validate_kb_options(
            chunk_size=candidate_state["chunk_size"],
            chunk_overlap=candidate_state["chunk_overlap"],
            top_k_dense=candidate_state["top_k_dense"],
            top_k_sparse=candidate_state["top_k_sparse"],
            top_m_final=candidate_state["top_m_final"],
            index_type=candidate_state["index_type"],
        )
        for field, value in candidate_state.items():
            setattr(kb, field, value)

        # Build a new helper first. Keep current vec_db alive until new init succeeds.
        new_helper = KBHelper(
            kb_db=self.kb_db,
            kb=kb,
            provider_manager=self.provider_manager,
            kb_root_dir=FILES_PATH,
            chunker=CHUNKER,
        )

        try:
            await new_helper.initialize()
        except Exception as e:
            # Roll back in-memory settings and keep current helper available.
            restore_previous_state()
            kb_helper.init_error = previous_init_error
            logger.error(
                f"知识库 {kb.kb_name}({kb.kb_id}) 重新初始化失败,继续使用旧实例: {e}",
                exc_info=True,
            )
            return kb_helper

        try:
            async with self.kb_db.get_db() as session:
                session.add(kb)
                await session.commit()
                await session.refresh(kb)
        except Exception:
            # Persisting failed: the old helper stays registered, so the
            # shared model must not keep values that were never stored.
            restore_previous_state()
            kb_helper.init_error = previous_init_error
            # NOTE(review): assumes terminating the unused new helper does
            # not disturb the old helper's resources, mirroring the
            # old_helper.terminate() call on the success path - confirm.
            try:
                await new_helper.terminate()
            except Exception as cleanup_err:
                logger.warning(
                    f"知识库 {kb.kb_name}({kb.kb_id}) 更新保存失败后清理新实例失败: {cleanup_err}",
                )
            raise

        old_helper = kb_helper
        self._set_kb_instance(kb_id, new_helper)
        await old_helper.terminate()
        new_helper.init_error = None
        return new_helper
unavailable_kbs.append((kb_id, kb_helper.init_error)) + logger.warning(f"知识库 {kb_id} 不可用: {kb_helper.init_error}") + continue + resolved_kb_ids.append(kb_helper.kb.kb_id) + kb_id_helper_map[kb_helper.kb.kb_id] = kb_helper + elif kb_names: + for kb_name in kb_names: + if kb_helper := await self.get_kb_by_name(kb_name): + if kb_helper.init_error: + unavailable_kbs.append((kb_name, kb_helper.init_error)) + logger.warning( + f"知识库 {kb_name} 不可用: {kb_helper.init_error}", + ) + continue + resolved_kb_ids.append(kb_helper.kb.kb_id) + kb_id_helper_map[kb_helper.kb.kb_id] = kb_helper + else: + return {} # all requested KBs are unavailable - if not kb_ids and unavailable_kbs: + if not resolved_kb_ids and unavailable_kbs: errors = "; ".join(f"{n}: {e}" for n, e in unavailable_kbs) raise ValueError(f"所有请求的知识库均不可用: {errors}") - if not kb_ids: + if not resolved_kb_ids: return {} - results = await self.retrieval_manager.retrieve( - query=query, - kb_ids=kb_ids, - kb_id_helper_map=kb_id_helper_map, - top_k_fusion=top_k_fusion, - top_m_final=top_m_final, - ) + trace_payload = None + if include_trace: + retrieval_response = await self.retrieval_manager.retrieve_with_trace( + query=query, + kb_ids=resolved_kb_ids, + kb_id_helper_map=kb_id_helper_map, + top_k_fusion=top_k_fusion, + top_m_final=top_m_final, + retrieval_overrides=retrieval_overrides, + ) + results = retrieval_response.results + trace_payload = retrieval_response.trace.to_dict() + else: + results = await self.retrieval_manager.retrieve( + query=query, + kb_ids=resolved_kb_ids, + kb_id_helper_map=kb_id_helper_map, + top_k_fusion=top_k_fusion, + top_m_final=top_m_final, + retrieval_overrides=retrieval_overrides, + ) if not results: - return None + empty_response = { + "context_text": "", + "results": [], + } + if include_trace: + empty_response["trace"] = trace_payload or { + "dense": [], + "sparse": [], + "fusion": [], + "dedup": [], + "rerank": [], + "final": [], + } + return empty_response if include_trace else None 
def _format_source_label(self, result: RetrievalResult) -> str:
    """Build the human-readable source line for one retrieval result.

    The label always starts with ``"<kb_name> / <doc_name>"``. When the
    source carries a non-empty list ``title_path``, a ``page_number`` or a
    ``section_index``, those are appended in that order inside parentheses,
    separated by ``"; "``.
    """
    location = self._format_result_source(result)
    label = f"{result.kb_name} / {result.doc_name}"

    qualifiers = []
    headings = location.get("title_path")
    if isinstance(headings, list) and headings:
        qualifiers.append(" > ".join(str(heading) for heading in headings))
    page = location.get("page_number")
    if page is not None:
        qualifiers.append(f"第 {page} 页")
    section = location.get("section_index")
    if section is not None:
        qualifiers.append(f"章节 {section}")

    if not qualifiers:
        return label
    return f"{label} ({'; '.join(qualifiers)})"
-359,11 +652,11 @@ async def upload_from_url( self, kb_id: str, url: str, - chunk_size: int = 512, - chunk_overlap: int = 50, - batch_size: int = 32, - tasks_limit: int = 3, - max_retries: int = 3, + chunk_size: int = DEFAULT_CHUNK_SIZE, + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, + batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, + tasks_limit: int = DEFAULT_UPLOAD_TASKS_LIMIT, + max_retries: int = DEFAULT_UPLOAD_MAX_RETRIES, progress_callback=None, ) -> KBDocument: """从 URL 上传文档到指定的知识库 diff --git a/astrbot/core/knowledge_base/models.py b/astrbot/core/knowledge_base/models.py index da919a384a..cd0e8290f0 100644 --- a/astrbot/core/knowledge_base/models.py +++ b/astrbot/core/knowledge_base/models.py @@ -3,6 +3,15 @@ from sqlmodel import Field, MetaData, SQLModel, Text, UniqueConstraint +from .capabilities import ( + DEFAULT_CHUNK_OVERLAP, + DEFAULT_CHUNK_SIZE, + DEFAULT_INDEX_TYPE, + DEFAULT_TOP_K_DENSE, + DEFAULT_TOP_K_SPARSE, + DEFAULT_TOP_M_FINAL, +) + class BaseKBModel(SQLModel, table=False): metadata = MetaData() @@ -34,12 +43,14 @@ class KnowledgeBase(BaseKBModel, table=True): embedding_provider_id: str | None = Field(default=None, max_length=100) rerank_provider_id: str | None = Field(default=None, max_length=100) # 分块配置参数 - chunk_size: int | None = Field(default=512, nullable=True) - chunk_overlap: int | None = Field(default=50, nullable=True) + chunk_size: int | None = Field(default=DEFAULT_CHUNK_SIZE, nullable=True) + chunk_overlap: int | None = Field(default=DEFAULT_CHUNK_OVERLAP, nullable=True) + # 索引类型: "flat" (精确) 或 "hnsw" (近似最近邻,适合大规模) + index_type: str | None = Field(default=DEFAULT_INDEX_TYPE, max_length=10) # 检索配置参数 - top_k_dense: int | None = Field(default=50, nullable=True) - top_k_sparse: int | None = Field(default=50, nullable=True) - top_m_final: int | None = Field(default=5, nullable=True) + top_k_dense: int | None = Field(default=DEFAULT_TOP_K_DENSE, nullable=True) + top_k_sparse: int | None = Field(default=DEFAULT_TOP_K_SPARSE, nullable=True) 
class KBIngestionTask(BaseKBModel, table=True):
    """Persistent knowledge-base ingestion task state."""

    __tablename__ = "kb_ingestion_tasks"  # type: ignore

    # Auto-incrementing surrogate primary key.
    id: int | None = Field(
        primary_key=True,
        sa_column_kwargs={"autoincrement": True},
        default=None,
    )
    # Unique task identifier; defaults to a random UUID4 string.
    task_id: str = Field(
        max_length=36,
        nullable=False,
        unique=True,
        default_factory=lambda: str(uuid.uuid4()),
        index=True,
    )
    kb_id: str = Field(max_length=36, nullable=False, index=True)
    task_type: str = Field(max_length=30, nullable=False, index=True)
    status: str = Field(default="pending", max_length=20, nullable=False, index=True)
    progress_stage: str | None = Field(default=None, max_length=50)
    progress_current: int = Field(default=0, nullable=False)
    progress_total: int = Field(default=100, nullable=False)
    # NOTE(review): progress/result/error are free-form text columns,
    # presumably serialized payloads - confirm against the writers.
    progress: str | None = Field(default=None, sa_type=Text)
    result: str | None = Field(default=None, sa_type=Text)
    error: str | None = Field(default=None, sa_type=Text)
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        # Must be a callable: a plain datetime.now(...) value would be
        # evaluated once at import time and every UPDATE would then write
        # that same frozen timestamp.
        sa_column_kwargs={"onupdate": lambda: datetime.now(timezone.utc)},
    )
b/astrbot/core/knowledge_base/parsers/pdf_parser.py @@ -11,6 +11,7 @@ BaseParser, MediaItem, ParseResult, + TextSegment, ) @@ -35,13 +36,20 @@ async def parse(self, file_content: bytes, file_name: str) -> ParseResult: reader = PdfReader(pdf_file) text_parts = [] + text_segments = [] media_items = [] # 提取文本 - for page in reader.pages: + for page_number, page in enumerate(reader.pages, start=1): text = page.extract_text() if text: text_parts.append(text) + text_segments.append( + TextSegment( + text=text, + metadata={"page_number": page_number}, + ) + ) # 提取图片 image_counter = 0 @@ -98,4 +106,8 @@ async def parse(self, file_content: bytes, file_name: str) -> ParseResult: continue full_text = "\n\n".join(text_parts) - return ParseResult(text=full_text, media=media_items) + return ParseResult( + text=full_text, + media=media_items, + text_segments=text_segments, + ) diff --git a/astrbot/core/knowledge_base/retrieval/__init__.py b/astrbot/core/knowledge_base/retrieval/__init__.py index b7c88075d5..26508c31f2 100644 --- a/astrbot/core/knowledge_base/retrieval/__init__.py +++ b/astrbot/core/knowledge_base/retrieval/__init__.py @@ -3,7 +3,12 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from .manager import RetrievalManager, RetrievalResult + from .manager import ( + RetrievalManager, + RetrievalResult, + RetrievalTrace, + RetrievalWithTrace, + ) from .rank_fusion import FusedResult, RankFusion from .sparse_retriever import SparseResult, SparseRetriever @@ -12,18 +17,32 @@ "RankFusion", "RetrievalManager", "RetrievalResult", + "RetrievalTrace", + "RetrievalWithTrace", "SparseResult", "SparseRetriever", ] def __getattr__(name: str): - if name in {"RetrievalManager", "RetrievalResult"}: - from .manager import RetrievalManager, RetrievalResult + if name in { + "RetrievalManager", + "RetrievalResult", + "RetrievalTrace", + "RetrievalWithTrace", + }: + from .manager import ( + RetrievalManager, + RetrievalResult, + RetrievalTrace, + RetrievalWithTrace, + ) return { 
@dataclass
class RetrievalTrace:
    """Detailed retrieval pipeline trace for diagnostics.

    Holds one list of dict entries per pipeline stage.
    """

    dense: list[dict]
    sparse: list[dict]
    fusion: list[dict]
    dedup: list[dict]
    dedup_removed: list[dict]
    rerank: list[dict]
    final: list[dict]

    def to_dict(self) -> dict:
        """Return the stages as a plain dict keyed by stage name.

        The values are the very lists stored on the instance, not copies.
        """
        stages = (
            "dense",
            "sparse",
            "fusion",
            "dedup",
            "dedup_removed",
            "rerank",
            "final",
        )
        return {stage: getattr(self, stage) for stage in stages}
+ +@dataclass +class RetrievalWithTrace: + """Retrieval results with optional pipeline diagnostics.""" + + results: list[RetrievalResult] + trace: RetrievalTrace + + class RetrievalManager: """检索管理器 @@ -67,7 +112,8 @@ async def retrieve( kb_ids: list[str], kb_id_helper_map: dict[str, KBHelper], top_k_fusion: int = 20, - top_m_final: int = 5, + top_m_final: int = DEFAULT_TOP_M_FINAL, + retrieval_overrides: RetrievalOverrides | None = None, ) -> list[RetrievalResult]: """混合检索 @@ -90,24 +136,11 @@ async def retrieve( if not kb_ids: return [] - kb_options: dict = {} - new_kb_ids = [] - for kb_id in kb_ids: - kb_helper = kb_id_helper_map.get(kb_id) - if kb_helper: - kb = kb_helper.kb - kb_options[kb_id] = { - "top_k_dense": kb.top_k_dense or 50, - "top_k_sparse": kb.top_k_sparse or 50, - "top_m_final": kb.top_m_final or 5, - "vec_db": kb_helper.vec_db, - "rerank_provider_id": kb.rerank_provider_id, - } - new_kb_ids.append(kb_id) - else: - logger.warning(f"知识库 ID {kb_id} 实例未找到, 已跳过该知识库的检索") - - kb_ids = new_kb_ids + kb_ids, kb_options = self._build_kb_options( + kb_ids, + kb_id_helper_map, + retrieval_overrides=retrieval_overrides, + ) # 1. 稠密检索 time_start = time.time() @@ -140,15 +173,302 @@ async def retrieve( sparse_results=sparse_results, top_k=top_k_fusion, ) + deduped_results = self._deduplicate_fused_results(fused_results) time_end = time.time() logger.debug( - f"Rank fusion took {time_end - time_start:.2f}s and returned {len(fused_results)} results.", + f"Rank fusion took {time_end - time_start:.2f}s and returned " + f"{len(fused_results)} results; dedup kept {len(deduped_results)}.", ) # 4. 转换为 RetrievalResult (批量获取元数据) - doc_ids = {fr.doc_id for fr in fused_results} + doc_ids = {fr.doc_id for fr in deduped_results} + metadata_map = await self.kb_db.get_documents_with_metadata_batch(doc_ids) + retrieval_results = self._build_retrieval_results( + fused_results=deduped_results, + metadata_map=metadata_map, + ) + + # 5. 
Rerank + first_rerank = self._get_first_rerank_provider(kb_ids, kb_options) + if first_rerank and retrieval_results: + try: + retrieval_results = await self._rerank( + query=query, + results=retrieval_results, + top_k=top_m_final, + rerank_provider=first_rerank, + ) + except Exception as e: + logger.warning(f"Rerank 执行失败,已跳过重排序并使用融合结果: {e}") + + return retrieval_results[:top_m_final] + + async def retrieve_with_trace( + self, + query: str, + kb_ids: list[str], + kb_id_helper_map: dict[str, KBHelper], + top_k_fusion: int = 20, + top_m_final: int = DEFAULT_TOP_M_FINAL, + retrieval_overrides: RetrievalOverrides | None = None, + ) -> RetrievalWithTrace: + """Hybrid retrieval with detailed stage diagnostics.""" + if not kb_ids: + return RetrievalWithTrace( + results=[], + trace=RetrievalTrace( + dense=[], + sparse=[], + fusion=[], + dedup=[], + dedup_removed=[], + rerank=[], + final=[], + ), + ) + + kb_ids, kb_options = self._build_kb_options( + kb_ids, + kb_id_helper_map, + retrieval_overrides=retrieval_overrides, + ) + + dense_results = await self._dense_retrieve( + query=query, + kb_ids=kb_ids, + kb_options=kb_options, + ) + sparse_results = await self.sparse_retriever.retrieve( + query=query, + kb_ids=kb_ids, + kb_options=kb_options, + ) + fused_results = await self.rank_fusion.fuse( + dense_results=dense_results, + sparse_results=sparse_results, + top_k=top_k_fusion, + ) + deduped_results, dedup_removed_results = ( + self._deduplicate_fused_results_with_trace( + fused_results, + ) + ) + + doc_ids = self._collect_trace_doc_ids( + dense_results=dense_results, + sparse_results=sparse_results, + fused_results=fused_results, + ) metadata_map = await self.kb_db.get_documents_with_metadata_batch(doc_ids) + doc_lookup = { + doc_id: { + "doc_name": metadata["document"].doc_name, + "kb_name": metadata["knowledge_base"].kb_name, + } + for doc_id, metadata in metadata_map.items() + } + + retrieval_results = self._build_retrieval_results( + fused_results=deduped_results, + 
metadata_map=metadata_map, + ) + rerank_results: list[RetrievalResult] = [] + first_rerank = self._get_first_rerank_provider(kb_ids, kb_options) + if first_rerank and retrieval_results: + try: + retrieval_results = await self._rerank( + query=query, + results=retrieval_results, + top_k=top_m_final, + rerank_provider=first_rerank, + ) + rerank_results = retrieval_results + except Exception as e: + logger.warning(f"Rerank 执行失败,已跳过重排序并使用融合结果: {e}") + + final_results = retrieval_results[:top_m_final] + trace = RetrievalTrace( + dense=self._serialize_dense_trace(dense_results, doc_lookup), + sparse=self._serialize_sparse_trace(sparse_results, doc_lookup), + fusion=self._serialize_fusion_trace(fused_results, doc_lookup), + dedup=self._serialize_fusion_trace(deduped_results, doc_lookup), + dedup_removed=self._serialize_dedup_removed_trace( + dedup_removed_results, + doc_lookup, + ), + rerank=self._serialize_retrieval_trace(rerank_results, "rerank"), + final=self._serialize_retrieval_trace(final_results, "final"), + ) + return RetrievalWithTrace(results=final_results, trace=trace) + + def _build_kb_options( + self, + kb_ids: list[str], + kb_id_helper_map: dict[str, KBHelper], + *, + retrieval_overrides: RetrievalOverrides | None = None, + ) -> tuple[list[str], dict]: + kb_options: dict = {} + valid_kb_ids = [] + for kb_id in kb_ids: + kb_helper = kb_id_helper_map.get(kb_id) + if not kb_helper: + logger.warning(f"知识库 ID {kb_id} 实例未找到, 已跳过该知识库的检索") + continue + kb = kb_helper.kb + kb_option = { + "top_k_dense": kb.top_k_dense or DEFAULT_TOP_K_DENSE, + "top_k_sparse": kb.top_k_sparse or DEFAULT_TOP_K_SPARSE, + "top_m_final": kb.top_m_final or DEFAULT_TOP_M_FINAL, + "vec_db": kb_helper.vec_db, + "rerank_provider_id": kb.rerank_provider_id, + } + if retrieval_overrides: + for field_name in ( + "top_k_dense", + "top_k_sparse", + "top_m_final", + "rerank_provider_id", + ): + if field_name in retrieval_overrides: + kb_option[field_name] = retrieval_overrides[field_name] + 
kb_options[kb_id] = kb_option + valid_kb_ids.append(kb_id) + return valid_kb_ids, kb_options + + def _collect_trace_doc_ids( + self, + *, + dense_results: list[Result], + sparse_results, + fused_results, + ) -> set[str]: + doc_ids = {result.doc_id for result in sparse_results} + doc_ids.update(result.doc_id for result in fused_results) + for result in dense_results: + metadata = self._safe_metadata(result.data.get("metadata")) + doc_id = metadata.get("kb_doc_id") + if doc_id: + doc_ids.add(doc_id) + return doc_ids + + def _deduplicate_fused_results( + self, + fused_results: list[FusedResult], + ) -> list[FusedResult]: + deduped_results, _ = self._deduplicate_fused_results_with_trace(fused_results) + return deduped_results + + def _deduplicate_fused_results_with_trace( + self, + fused_results: list[FusedResult], + ) -> tuple[list[FusedResult], list[dict]]: + selected: list[FusedResult] = [] + removed: list[dict] = [] + signatures: list[tuple[FusedResult, str, frozenset[str]]] = [] + + for result in fused_results: + normalized = self._normalize_content_for_dedup(result.content) + if not normalized: + selected.append(result) + continue + + shingles = self._build_content_shingles(normalized) + duplicate_of = self._find_duplicate_signature( + normalized, + shingles, + signatures, + ) + if duplicate_of: + selected_result, selected_normalized, selected_shingles = duplicate_of + removed.append( + { + "result": result, + "duplicate_of": selected_result, + "similarity": self._dedup_similarity( + normalized, + shingles, + selected_normalized, + selected_shingles, + ), + }, + ) + continue + + selected.append(result) + signatures.append((result, normalized, shingles)) + + return selected, removed + + @staticmethod + def _normalize_content_for_dedup(content: str) -> str: + return "".join(str(content or "").lower().split()) + + @staticmethod + def _build_content_shingles( + normalized_content: str, + size: int = DEDUP_SHINGLE_SIZE, + ) -> frozenset[str]: + if not 
normalized_content: + return frozenset() + if len(normalized_content) <= size: + return frozenset({normalized_content}) + return frozenset( + normalized_content[index : index + size] + for index in range(len(normalized_content) - size + 1) + ) + + @staticmethod + def _is_duplicate_signature( + normalized: str, + shingles: frozenset[str], + existing: tuple[FusedResult, str, frozenset[str]], + ) -> bool: + _, existing_normalized, existing_shingles = existing + return ( + RetrievalManager._dedup_similarity( + normalized, + shingles, + existing_normalized, + existing_shingles, + ) + >= DEDUP_JACCARD_THRESHOLD + ) + + @staticmethod + def _dedup_similarity( + normalized: str, + shingles: frozenset[str], + existing_normalized: str, + existing_shingles: frozenset[str], + ) -> float: + if normalized == existing_normalized: + return 1.0 + if not shingles or not existing_shingles: + return 0.0 + union = len(shingles | existing_shingles) + if union == 0: + return 0.0 + return len(shingles & existing_shingles) / union + + def _find_duplicate_signature( + self, + normalized: str, + shingles: frozenset[str], + signatures: list[tuple[FusedResult, str, frozenset[str]]], + ) -> tuple[FusedResult, str, frozenset[str]] | None: + for signature in signatures: + if self._is_duplicate_signature(normalized, shingles, signature): + return signature + return None + + def _build_retrieval_results( + self, + *, + fused_results, + metadata_map: dict, + ) -> list[RetrievalResult]: retrieval_results = [] for fr in fused_results: metadata_dict = metadata_map.get(fr.doc_id) @@ -163,13 +483,22 @@ async def retrieve( content=fr.content, score=fr.score, metadata={ + **(fr.metadata or {}), "chunk_index": fr.chunk_index, "char_count": len(fr.content), + "dense_rank": fr.dense_rank, + "sparse_rank": fr.sparse_rank, + "dense_score": fr.dense_score, + "sparse_score": fr.sparse_score, + "rrf_score": fr.rrf_score + if fr.rrf_score is not None + else fr.score, }, ), ) + return retrieval_results - # 5. 
Rerank + def _get_first_rerank_provider(self, kb_ids: list[str], kb_options: dict): first_rerank = None for kb_id in kb_ids: vec_db = kb_options[kb_id]["vec_db"] @@ -188,18 +517,186 @@ async def retrieve( ): first_rerank = rerank_provider break - if first_rerank and retrieval_results: - try: - retrieval_results = await self._rerank( - query=query, - results=retrieval_results, - top_k=top_m_final, - rerank_provider=first_rerank, - ) - except Exception as e: - logger.warning(f"Rerank 执行失败,已跳过重排序并使用融合结果: {e}") + return first_rerank - return retrieval_results[:top_m_final] + @staticmethod + def _content_preview(content: str, limit: int = 240) -> str: + if len(content) <= limit: + return content + return f"{content[:limit]}..." + + def _serialize_dense_trace( + self, + dense_results: list[Result], + doc_lookup: dict[str, dict], + ) -> list[dict]: + trace = [] + for rank, result in enumerate(dense_results, 1): + chunk_id = result.data.get("doc_id") + metadata = self._safe_metadata(result.data.get("metadata")) + doc_id = metadata.get("kb_doc_id") + source = doc_lookup.get(doc_id, {}) + trace.append( + { + "rank": rank, + "chunk_id": chunk_id, + "doc_id": doc_id, + "doc_name": source.get("doc_name"), + "kb_id": metadata.get("kb_id"), + "kb_name": source.get("kb_name"), + "chunk_index": metadata.get("chunk_index", 0), + "score": result.similarity, + "dense_score": result.similarity, + "title_path": metadata.get("title_path"), + "page_number": metadata.get("page_number"), + "section_index": metadata.get("section_index"), + "content_preview": self._content_preview( + result.data.get("text", ""), + ), + }, + ) + return trace + + def _serialize_sparse_trace( + self, + sparse_results, + doc_lookup: dict[str, dict], + ) -> list[dict]: + trace = [] + for rank, result in enumerate(sparse_results, 1): + source = doc_lookup.get(result.doc_id, {}) + trace.append( + { + "rank": rank, + "chunk_id": result.chunk_id, + "doc_id": result.doc_id, + "doc_name": source.get("doc_name"), + 
"kb_id": result.kb_id, + "kb_name": source.get("kb_name"), + "chunk_index": result.chunk_index, + "score": result.score, + "sparse_score": result.score, + "title_path": (result.metadata or {}).get("title_path"), + "page_number": (result.metadata or {}).get("page_number"), + "section_index": (result.metadata or {}).get("section_index"), + "content_preview": self._content_preview(result.content), + }, + ) + return trace + + def _serialize_fusion_trace( + self, + fused_results, + doc_lookup: dict[str, dict], + ) -> list[dict]: + trace = [] + for rank, result in enumerate(fused_results, 1): + source = doc_lookup.get(result.doc_id, {}) + trace.append( + { + "rank": rank, + "chunk_id": result.chunk_id, + "doc_id": result.doc_id, + "doc_name": source.get("doc_name"), + "kb_id": result.kb_id, + "kb_name": source.get("kb_name"), + "chunk_index": result.chunk_index, + "score": result.score, + "dense_rank": result.dense_rank, + "sparse_rank": result.sparse_rank, + "dense_score": result.dense_score, + "sparse_score": result.sparse_score, + "rrf_score": result.rrf_score + if result.rrf_score is not None + else result.score, + "title_path": (result.metadata or {}).get("title_path"), + "page_number": (result.metadata or {}).get("page_number"), + "section_index": (result.metadata or {}).get("section_index"), + "content_preview": self._content_preview(result.content), + }, + ) + return trace + + def _serialize_dedup_removed_trace( + self, + removed_results: list[dict], + doc_lookup: dict[str, dict], + ) -> list[dict]: + trace = [] + for rank, removed in enumerate(removed_results, 1): + result = removed["result"] + duplicate_of = removed["duplicate_of"] + source = doc_lookup.get(result.doc_id, {}) + trace.append( + { + "rank": rank, + "chunk_id": result.chunk_id, + "doc_id": result.doc_id, + "doc_name": source.get("doc_name"), + "kb_id": result.kb_id, + "kb_name": source.get("kb_name"), + "chunk_index": result.chunk_index, + "score": result.score, + "dense_rank": result.dense_rank, 
+ "sparse_rank": result.sparse_rank, + "dense_score": result.dense_score, + "sparse_score": result.sparse_score, + "rrf_score": result.rrf_score + if result.rrf_score is not None + else result.score, + "duplicate_of_chunk_id": duplicate_of.chunk_id, + "duplicate_of_doc_id": duplicate_of.doc_id, + "dedup_similarity": removed["similarity"], + "title_path": (result.metadata or {}).get("title_path"), + "page_number": (result.metadata or {}).get("page_number"), + "section_index": (result.metadata or {}).get("section_index"), + "content_preview": self._content_preview(result.content), + }, + ) + return trace + + def _serialize_retrieval_trace( + self, + results: list[RetrievalResult], + stage: str, + ) -> list[dict]: + trace = [] + for rank, result in enumerate(results, 1): + trace.append( + { + "rank": rank, + "chunk_id": result.chunk_id, + "doc_id": result.doc_id, + "doc_name": result.doc_name, + "kb_id": result.kb_id, + "kb_name": result.kb_name, + "chunk_index": result.metadata.get("chunk_index", 0), + "score": result.score, + "dense_rank": result.metadata.get("dense_rank"), + "sparse_rank": result.metadata.get("sparse_rank"), + "dense_score": result.metadata.get("dense_score"), + "sparse_score": result.metadata.get("sparse_score"), + "rrf_score": result.metadata.get("rrf_score"), + "rerank_score": result.metadata.get("rerank_score"), + "title_path": result.metadata.get("title_path"), + "page_number": result.metadata.get("page_number"), + "section_index": result.metadata.get("section_index"), + "stage": stage, + "content_preview": self._content_preview(result.content), + }, + ) + return trace + + @staticmethod + def _safe_metadata(raw_metadata) -> dict: + if not raw_metadata: + return {} + if isinstance(raw_metadata, dict): + return raw_metadata + try: + return json.loads(raw_metadata) + except Exception: + return {} async def _dense_retrieve( self, @@ -209,7 +706,7 @@ async def _dense_retrieve( ): """稠密检索 (向量相似度) - 为每个知识库使用独立的向量数据库进行检索,然后合并结果。 + 
为每个知识库使用独立的向量数据库进行并行检索,然后合并结果。 Args: query: 查询文本 @@ -220,10 +717,11 @@ async def _dense_retrieve( List[Result]: 检索结果列表 """ - all_results: list[Result] = [] - for kb_id in kb_ids: + import asyncio + + async def _retrieve_one(kb_id: str) -> list[Result]: if kb_id not in kb_options: - continue + return [] try: vec_db: FaissVecDB = kb_options[kb_id]["vec_db"] dense_k = int(kb_options[kb_id]["top_k_dense"]) @@ -234,17 +732,31 @@ async def _dense_retrieve( rerank=False, # 稠密检索阶段不进行 rerank metadata_filters={"kb_id": kb_id}, ) - - all_results.extend(vec_results) + return vec_results except Exception as e: - logger.error(f"知识库 {kb_id} 稠密检索失败: {e}", exc_info=True) + logger.error( + f"知识库 {kb_id} 稠密检索失败: {e}", + exc_info=True, + ) if len(kb_ids) == 1: - raise RuntimeError(f"知识库 {kb_id} 稠密检索失败: {e}") from e + raise RuntimeError( + f"知识库 {kb_id} 稠密检索失败: {e}", + ) from e # multi-KB: skip the faulty KB and continue + return [] + + tasks = [_retrieve_one(kb_id) for kb_id in kb_ids] + results_per_kb = await asyncio.gather(*tasks, return_exceptions=True) + + all_results: list[Result] = [] + for result in results_per_kb: + if isinstance(result, Exception): + logger.error(f"稠密检索异常: {result}", exc_info=True) + continue + all_results.extend(result) - # 按相似度排序并返回 top_k + # 按相似度排序并返回 all_results.sort(key=lambda x: x.similarity, reverse=True) - # return all_results[: len(all_results) // len(kb_ids)] return all_results async def _rerank( @@ -283,6 +795,7 @@ async def _rerank( idx = rerank_result.index if idx < len(results): result = results[idx] + result.metadata["rerank_score"] = rerank_result.relevance_score result.score = rerank_result.relevance_score reranked_list.append(result) diff --git a/astrbot/core/knowledge_base/retrieval/rank_fusion.py b/astrbot/core/knowledge_base/retrieval/rank_fusion.py index 40afd97484..2dbb1a5bef 100644 --- a/astrbot/core/knowledge_base/retrieval/rank_fusion.py +++ b/astrbot/core/knowledge_base/retrieval/rank_fusion.py @@ -6,6 +6,7 @@ import json from 
dataclasses import dataclass +from astrbot.core import logger from astrbot.core.db.vec_db.base import Result from astrbot.core.knowledge_base.kb_db_sqlite import KBSQLiteDatabase from astrbot.core.knowledge_base.retrieval.sparse_retriever import SparseResult @@ -21,6 +22,12 @@ class FusedResult: kb_id: str content: str score: float + metadata: dict | None = None + dense_rank: int | None = None + sparse_rank: int | None = None + dense_score: float | None = None + sparse_score: float | None = None + rrf_score: float | None = None class RankFusion: @@ -62,28 +69,27 @@ async def fuse( List[FusedResult]: 融合后的结果列表 """ - # 1. 构建排名映射 + # 1. Build rank maps keyed by vector-storage chunk IDs. dense_ranks = { r.data["doc_id"]: (idx + 1) for idx, r in enumerate(dense_results) - } # 这里的 doc_id 实际上是 chunk_id + } sparse_ranks = {r.chunk_id: (idx + 1) for idx, r in enumerate(sparse_results)} - # 2. 收集所有唯一的 ID - # 需要统一为 chunk_id + # 2. Collect all unique chunk IDs. all_chunk_ids = set() - vec_doc_id_to_dense: dict[str, Result] = {} # vec_doc_id -> Result - chunk_id_to_sparse: dict[str, SparseResult] = {} # chunk_id -> SparseResult + chunk_id_to_dense: dict[str, Result] = {} + chunk_id_to_sparse: dict[str, SparseResult] = {} # 处理稀疏检索结果 for r in sparse_results: all_chunk_ids.add(r.chunk_id) chunk_id_to_sparse[r.chunk_id] = r - # 处理稠密检索结果 (需要转换 vec_doc_id 到 chunk_id) + # Dense results use Document.doc_id, which stores the chunk UUID. for r in dense_results: - vec_doc_id = r.data["doc_id"] - all_chunk_ids.add(vec_doc_id) - vec_doc_id_to_dense[vec_doc_id] = r + chunk_id = r.data["doc_id"] + all_chunk_ids.add(chunk_id) + chunk_id_to_dense[chunk_id] = r # 3. 
计算 RRF 分数 rrf_scores: dict[str, float] = {} @@ -108,6 +114,15 @@ async def fuse( reverse=True, )[:top_k] + if logger.isEnabledFor(10): # DEBUG + details = [] + for cid in sorted_ids[:5]: + d_rank = dense_ranks.get(cid, "-") + s_rank = sparse_ranks.get(cid, "-") + rrf = rrf_scores[cid] + details.append(f"{cid[:8]}(d={d_rank},s={s_rank},rrf={rrf:.4f})") + logger.debug(f"RRF top-5: {' | '.join(details)}") + # 5. 构建融合结果 fused_results = [] for identifier in sorted_ids: @@ -122,11 +137,21 @@ async def fuse( kb_id=sr.kb_id, content=sr.content, score=rrf_scores[identifier], + metadata=sr.metadata, + dense_rank=dense_ranks.get(identifier), + sparse_rank=sparse_ranks.get(identifier), + dense_score=( + chunk_id_to_dense[identifier].similarity + if identifier in chunk_id_to_dense + else None + ), + sparse_score=sr.score, + rrf_score=rrf_scores[identifier], ), ) - elif identifier in vec_doc_id_to_dense: + elif identifier in chunk_id_to_dense: # 从向量检索获取信息,需要从数据库获取块的详细信息 - vec_result = vec_doc_id_to_dense[identifier] + vec_result = chunk_id_to_dense[identifier] chunk_md = json.loads(vec_result.data["metadata"]) fused_results.append( FusedResult( @@ -136,6 +161,12 @@ async def fuse( kb_id=chunk_md["kb_id"], content=vec_result.data["text"], score=rrf_scores[identifier], + metadata=chunk_md, + dense_rank=dense_ranks.get(identifier), + sparse_rank=sparse_ranks.get(identifier), + dense_score=vec_result.similarity, + sparse_score=None, + rrf_score=rrf_scores[identifier], ), ) diff --git a/astrbot/core/knowledge_base/retrieval/sparse_retriever.py b/astrbot/core/knowledge_base/retrieval/sparse_retriever.py index f06eb50909..8790d0224c 100644 --- a/astrbot/core/knowledge_base/retrieval/sparse_retriever.py +++ b/astrbot/core/knowledge_base/retrieval/sparse_retriever.py @@ -10,6 +10,7 @@ from rank_bm25 import BM25Okapi +from astrbot.core import logger from astrbot.core.knowledge_base.kb_db_sqlite import KBSQLiteDatabase from astrbot.core.knowledge_base.retrieval.tokenizer import ( 
load_stopwords, @@ -22,7 +23,10 @@ @dataclass class SparseResult: - """稀疏检索结果""" + """稀疏检索结果 + + score 语义: 越低越相关 (0 = 最佳匹配), 统一按升序排列后送入 RRF 融合。 + """ chunk_index: int chunk_id: str @@ -30,25 +34,15 @@ class SparseResult: kb_id: str content: str score: float + metadata: dict | None = None class SparseRetriever: - """BM25 稀疏检索器 - - 职责: - - 基于关键词的文档检索 - - 使用 BM25 算法计算相关度 - """ + """BM25 稀疏检索器""" def __init__(self, kb_db: KBSQLiteDatabase) -> None: - """初始化稀疏检索器 - - Args: - kb_db: 知识库数据库实例 - - """ self.kb_db = kb_db - self._index_cache = {} # 缓存 BM25 索引 + self._index_cache = {} self.hit_stopwords = load_stopwords( os.path.join(os.path.dirname(__file__), "hit_stopwords.txt"), @@ -62,18 +56,13 @@ async def retrieve( ) -> list[SparseResult]: """执行稀疏检索 - Args: - query: 查询文本 - kb_ids: 知识库 ID 列表 - kb_options: 每个知识库的检索选项 - - Returns: - List[SparseResult]: 检索结果列表 - + 优先使用 FTS5 全文索引; 不可用时回退到内存 BM25。 + 结果按 score 升序排列 (lower-is-better), 直接喂给 RRF。 """ fts_results = [] fallback_kb_ids = [] query_tokens = tokenize_text(query, self.hit_stopwords) + for kb_id in kb_ids: vec_db: FaissVecDB | None = kb_options.get(kb_id, {}).get("vec_db") if not vec_db: @@ -89,6 +78,7 @@ async def retrieve( for doc in result: chunk_md = json.loads(doc["metadata"]) + # FTS5 bm25(): 0=最佳, 极短文档可能为负值 → clamp 到 0 fts_results.append( SparseResult( chunk_id=doc["doc_id"], @@ -96,7 +86,8 @@ async def retrieve( doc_id=chunk_md["kb_doc_id"], kb_id=kb_id, content=doc["text"], - score=-float(doc["score"]), + score=max(0.0, float(doc["score"])), + metadata=chunk_md, ), ) @@ -107,70 +98,106 @@ async def retrieve( kb_ids=fallback_kb_ids, kb_options=kb_options, ) + results = fts_results + fallback_results - results.sort(key=lambda x: x.score, reverse=True) + results.sort(key=lambda x: x.score) + + if logger.isEnabledFor(10): # DEBUG + fts_top = [f"{r.chunk_id[:8]}={r.score:.4f}" for r in fts_results[:5]] + bm_top = [f"{r.chunk_id[:8]}={r.score:.4f}" for r in fallback_results[:5]] + merged_top = 
[f"{r.chunk_id[:8]}={r.score:.4f}" for r in results[:5]] + logger.debug( + f"Sparse top-5 | FTS5({len(fts_results)}): [{', '.join(fts_top)}] | " + f"BM25({len(fallback_results)}): [{', '.join(bm_top)}] | " + f"Merged({len(results)}): [{', '.join(merged_top)}]", + ) + return results + # BM25 回退路径单次最多加载的文档数,防止 OOM + MAX_BM25_DOCS = 10_000 + async def _retrieve_with_bm25( self, query: str, kb_ids: list[str], kb_options: dict, ) -> list[SparseResult]: + """FTS5 不可用时的 BM25Okapi 回退路径。 + + BM25Okapi 原始分值 higher-is-better → 取反统一为 lower-is-better。 + 单 KB 最多加载 MAX_BM25_DOCS 条 chunk,超限时截断并打 warning。 + """ top_k_sparse = 0 - chunks = [] + all_kb_chunks: list[dict] = [] + for kb_id in kb_ids: vec_db: FaissVecDB | None = kb_options.get(kb_id, {}).get("vec_db") if not vec_db: continue + kb_top_k = kb_options.get(kb_id, {}).get("top_k_sparse", 50) + top_k_sparse = max(top_k_sparse, kb_top_k) + result = await vec_db.document_storage.get_documents( - metadata_filters={}, - limit=None, - offset=None, + metadata_filters={"kb_id": kb_id}, + limit=self.MAX_BM25_DOCS, + offset=0, ) + if len(result) >= self.MAX_BM25_DOCS: + logger.warning( + f"知识库 {kb_id} 的 BM25 回退检索已触及 {self.MAX_BM25_DOCS} " + f"条 chunk 上限,结果可能不完整。建议检查 FTS5 索引状态。", + ) chunk_mds = [json.loads(doc["metadata"]) for doc in result] - result = [ + kb_chunks = [ { "chunk_id": doc["doc_id"], "chunk_index": chunk_md["chunk_index"], "doc_id": chunk_md["kb_doc_id"], "kb_id": kb_id, "text": doc["text"], + "kb_top_k": kb_top_k, + "metadata": chunk_md, } for doc, chunk_md in zip(result, chunk_mds) ] - chunks.extend(result) - top_k_sparse += kb_options.get(kb_id, {}).get("top_k_sparse", 50) + all_kb_chunks.append(kb_chunks) - if not chunks: + if not any(all_kb_chunks): return [] - # 2. 准备文档和索引 - corpus = [chunk["text"] for chunk in chunks] - tokenized_corpus = [tokenize_text(doc, self.hit_stopwords) for doc in corpus] - - # 3. 构建 BM25 索引 - bm25 = BM25Okapi(tokenized_corpus) - - # 4. 
执行检索 - tokenized_query = tokenize_text(query, self.hit_stopwords) - scores = bm25.get_scores(tokenized_query) - - # 5. 排序并返回 Top-K - results = [] - for idx, score in enumerate(scores): - chunk = chunks[idx] - results.append( - SparseResult( - chunk_id=chunk["chunk_id"], - chunk_index=chunk["chunk_index"], - doc_id=chunk["doc_id"], - kb_id=chunk["kb_id"], - content=chunk["text"], - score=float(score), - ), - ) + # 每个知识库独立计算 BM25 分数并截断,再合并。 + merged_results: list[SparseResult] = [] + for kb_chunks in all_kb_chunks: + if not kb_chunks: + continue + kb_top_k = kb_chunks[0]["kb_top_k"] + + corpus = [chunk["text"] for chunk in kb_chunks] + tokenized_corpus = [ + tokenize_text(doc, self.hit_stopwords) for doc in corpus + ] + bm25 = BM25Okapi(tokenized_corpus) + + tokenized_query = tokenize_text(query, self.hit_stopwords) + scores = bm25.get_scores(tokenized_query) + + kb_results: list[SparseResult] = [] + for idx, score in enumerate(scores): + chunk = kb_chunks[idx] + kb_results.append( + SparseResult( + chunk_id=chunk["chunk_id"], + chunk_index=chunk["chunk_index"], + doc_id=chunk["doc_id"], + kb_id=chunk["kb_id"], + content=chunk["text"], + score=-float(score), + metadata=chunk["metadata"], + ), + ) + + merged_results.extend(sorted(kb_results, key=lambda x: x.score)[:kb_top_k]) - results.sort(key=lambda x: x.score, reverse=True) - # return results[: len(results) // len(kb_ids)] - return results[:top_k_sparse] + merged_results.sort(key=lambda x: x.score) + return merged_results[:top_k_sparse] diff --git a/astrbot/core/tools/knowledge_base_tools.py b/astrbot/core/tools/knowledge_base_tools.py index e082fd4253..da00c18f47 100644 --- a/astrbot/core/tools/knowledge_base_tools.py +++ b/astrbot/core/tools/knowledge_base_tools.py @@ -53,7 +53,7 @@ async def retrieve_knowledge_base( f"[知识库] 会话 {umo} 配置的以下知识库无效: {invalid_kb_ids}", ) if not kb_names: - return None + return "会话配置的知识库均不存在或未加载,请检查知识库设置。" logger.debug(f"[知识库] 使用会话级配置,知识库数量: {len(kb_names)}") else: kb_names = 
config.get("kb_names", []) diff --git a/astrbot/dashboard/routes/knowledge_base.py b/astrbot/dashboard/routes/knowledge_base.py index 1b6f7a435d..ca97f296ea 100644 --- a/astrbot/dashboard/routes/knowledge_base.py +++ b/astrbot/dashboard/routes/knowledge_base.py @@ -11,6 +11,29 @@ from astrbot.core import logger from astrbot.core.core_lifecycle import AstrBotCoreLifecycle +from astrbot.core.knowledge_base.capabilities import ( + ALLOWED_UPLOAD_EXTENSIONS, + DEFAULT_CHUNK_OVERLAP, + DEFAULT_CHUNK_PAGE_SIZE, + DEFAULT_CHUNK_SIZE, + DEFAULT_DOCUMENT_PAGE_SIZE, + DEFAULT_INDEX_TYPE, + DEFAULT_KB_PAGE_SIZE, + DEFAULT_TOP_K_DENSE, + DEFAULT_TOP_K_SPARSE, + DEFAULT_TOP_M_FINAL, + DEFAULT_UPLOAD_BATCH_SIZE, + DEFAULT_UPLOAD_MAX_RETRIES, + DEFAULT_UPLOAD_TASKS_LIMIT, + DOCUMENT_FILTER_SOURCE_TYPES, + DOCUMENT_FILTER_STATUSES, + MAX_BATCH_DELETE_DOCUMENTS, + MAX_BATCH_REBUILD_DOCUMENTS, + MAX_RETRIEVE_TOP_K, + MAX_UPLOAD_FILE_SIZE, + MAX_UPLOAD_FILES, + get_knowledge_base_capabilities, +) from astrbot.core.provider.provider import EmbeddingProvider, RerankProvider from astrbot.core.utils.astrbot_path import get_astrbot_temp_path @@ -41,12 +64,16 @@ def __init__( # 注册路由 self.routes = { # 知识库管理 + "/kb/capabilities": ("GET", self.get_capabilities), "/kb/list": ("GET", self.list_kbs), "/kb/create": ("POST", self.create_kb), "/kb/get": ("GET", self.get_kb), "/kb/update": ("POST", self.update_kb), "/kb/delete": ("POST", self.delete_kb), "/kb/stats": ("GET", self.get_kb_stats), + "/kb/consistency/check": ("GET", self.check_kb_consistency), + "/kb/consistency/repair": ("POST", self.repair_kb_consistency), + "/kb/rebuild": ("POST", self.rebuild_kb), # 文档管理 "/kb/document/list": ("GET", self.list_documents), "/kb/document/upload": ("POST", self.upload_document), @@ -54,9 +81,15 @@ def __init__( "/kb/document/upload/url": ("POST", self.upload_document_from_url), "/kb/document/upload/progress": ("GET", self.get_upload_progress), "/kb/document/get": ("GET", self.get_document), + 
"/kb/document/rebuild": ("POST", self.rebuild_document), + "/kb/document/batch-rebuild": ("POST", self.batch_rebuild_documents), "/kb/document/delete": ("POST", self.delete_document), + "/kb/document/batch-delete": ("POST", self.batch_delete_documents), + "/kb/task/get": ("GET", self.get_task), + "/kb/task/list": ("GET", self.list_tasks), # # 块管理 "/kb/chunk/list": ("GET", self.list_chunks), + "/kb/chunk/context": ("GET", self.get_chunk_context), "/kb/chunk/delete": ("POST", self.delete_chunk), # # 多媒体管理 # "/kb/media/list": ("GET", self.list_media), @@ -69,6 +102,77 @@ def __init__( def _get_kb_manager(self): return self.core_lifecycle.kb_manager + def _get_kb_db(self): + if not hasattr(self, "core_lifecycle"): + return None + kb_manager = self._get_kb_manager() + return getattr(kb_manager, "kb_db", None) + + @staticmethod + def _get_positive_query_int(name: str, default: int) -> int: + value = request.args.get(name, default, type=int) + return max(value if value is not None else default, 1) + + async def get_capabilities(self): + """Return knowledge base capabilities, defaults, and limits.""" + return Response().ok(get_knowledge_base_capabilities()).__dict__ + + async def _create_persistent_task( + self, + *, + task_id: str, + kb_id: str | None, + task_type: str, + status: str, + progress: dict | None = None, + ) -> None: + kb_db = self._get_kb_db() + if not kb_db or not kb_id: + return + try: + await kb_db.create_ingestion_task( + task_id=task_id, + kb_id=kb_id, + task_type=task_type, + status=status, + progress_stage=(progress or {}).get("stage"), + progress_current=(progress or {}).get("current", 0), + progress_total=(progress or {}).get("total", 100), + progress=progress, + ) + except Exception as e: + logger.warning(f"创建知识库持久任务记录失败 {task_id}: {e}") + + async def _update_persistent_task(self, task_id: str, **updates) -> None: + kb_db = self._get_kb_db() + if not kb_db: + return + try: + await kb_db.update_ingestion_task(task_id, **updates) + except Exception as 
e: + logger.warning(f"更新知识库持久任务记录失败 {task_id}: {e}") + + async def _get_persistent_task(self, task_id: str) -> dict | None: + kb_db = self._get_kb_db() + if not kb_db: + return None + try: + return await kb_db.get_ingestion_task(task_id) + except Exception as e: + logger.warning(f"读取知识库持久任务记录失败 {task_id}: {e}") + return None + + def _get_persistent_progress_updates(self, task_id: str) -> dict: + progress = self.upload_progress.get(task_id) + if not progress: + return {} + return { + "progress_stage": progress.get("stage"), + "progress_current": progress.get("current", 0), + "progress_total": progress.get("total", 100), + "progress": progress, + } + def _init_task(self, task_id: str, status: str = "pending") -> None: self.upload_tasks[task_id] = { "status": status, @@ -87,6 +191,21 @@ def _set_task_result( if task_id in self.upload_progress: self.upload_progress[task_id]["status"] = status + def _cleanup_task(self, task_id: str) -> None: + """清理已完成/失败的任务,释放内存。幂等操作。""" + self.upload_tasks.pop(task_id, None) + self.upload_progress.pop(task_id, None) + + async def _schedule_delayed_cleanup( + self, task_id: str, delay_seconds: int = 300 + ) -> None: + """延迟清理任务,作为客户端不轮询时的兜底机制。""" + try: + await asyncio.sleep(delay_seconds) + except asyncio.CancelledError: + return + self._cleanup_task(task_id) + def _update_progress( self, task_id: str, @@ -114,6 +233,16 @@ def _update_progress( if total is not None: p["total"] = total + async def _persist_progress(self, task_id: str) -> None: + progress = self.upload_progress.get(task_id) + if not progress: + return + await self._update_persistent_task( + task_id, + status=progress.get("status"), + **self._get_persistent_progress_updates(task_id), + ) + def _make_progress_callback(self, task_id: str, file_idx: int, file_name: str): async def _callback(stage: str, current: int, total: int) -> None: self._update_progress( @@ -125,16 +254,140 @@ async def _callback(stage: str, current: int, total: int) -> None: current=current, 
total=total, ) + await self._persist_progress(task_id) return _callback @staticmethod def _format_failed_doc_error(file_name: str, error: Exception) -> str: message = str(error).strip() or "上传失败:发生未知错误。" - if message.startswith(file_name): + if message.startswith(f"{file_name}:"): return message return f"{file_name}: {message}" + @staticmethod + def _resolve_batch_task_status(success_count: int, failed_count: int) -> str: + if failed_count == 0: + return "completed" + if success_count > 0: + return "partial_failed" + return "failed" + + @staticmethod + def _build_batch_failure_error( + failed_docs: list[dict], + success_count: int = 0, + action: str = "上传", + ) -> str | None: + if not failed_docs: + return None + if len(failed_docs) == 1: + return failed_docs[0].get("error") or "上传失败:发生未知错误。" + if success_count > 0: + return f"部分文档{action}失败,共 {len(failed_docs)} 个失败。" + return f"所有文档{action}失败,共 {len(failed_docs)} 个失败。" + + @staticmethod + def _format_size_limit(size_bytes: int) -> str: + size_mb = size_bytes / (1024 * 1024) + if size_mb.is_integer(): + return f"{int(size_mb)}MB" + return f"{size_mb:.2f}MB" + + @staticmethod + def _coerce_optional_int(value: Any, field_name: str) -> int | None: + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError) as e: + raise ValueError(f"{field_name} 必须是整数") from e + + @staticmethod + def _coerce_optional_bool(value: Any, field_name: str) -> bool: + if isinstance(value, bool): + return value + if value in (None, ""): + return False + if isinstance(value, str): + lowered = value.strip().lower() + if lowered in {"true", "1", "yes", "on"}: + return True + if lowered in {"false", "0", "no", "off"}: + return False + raise ValueError(f"{field_name} 必须是布尔值") + + @staticmethod + def _validate_chunk_options( + *, + chunk_size: int | None, + chunk_overlap: int | None, + ) -> None: + if chunk_size is not None and chunk_size <= 0: + raise ValueError("chunk_size 必须大于 0") + if chunk_overlap is not 
None and chunk_overlap < 0: + raise ValueError("chunk_overlap 不能为负数") + if ( + chunk_size is not None + and chunk_overlap is not None + and chunk_overlap >= chunk_size + ): + raise ValueError("chunk_overlap 必须小于 chunk_size") + + @staticmethod + def _validate_positive_int(value: int | None, field_name: str) -> None: + if value is not None and value <= 0: + raise ValueError(f"{field_name} 必须大于 0") + + @classmethod + def _validate_kb_options( + cls, + *, + chunk_size: int | None, + chunk_overlap: int | None, + top_k_dense: int | None, + top_k_sparse: int | None, + top_m_final: int | None, + index_type: str | None, + ) -> None: + cls._validate_chunk_options( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + cls._validate_positive_int(top_k_dense, "top_k_dense") + cls._validate_positive_int(top_k_sparse, "top_k_sparse") + cls._validate_positive_int(top_m_final, "top_m_final") + if index_type is not None and index_type not in {"flat", "hnsw"}: + raise ValueError("index_type 必须是 flat 或 hnsw") + + @classmethod + def _validate_upload_options( + cls, + *, + chunk_size: int, + chunk_overlap: int, + batch_size: int, + tasks_limit: int, + max_retries: int, + ) -> None: + cls._validate_chunk_options( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) + cls._validate_positive_int(batch_size, "batch_size") + cls._validate_positive_int(tasks_limit, "tasks_limit") + if max_retries < 0: + raise ValueError("max_retries 不能为负数") + + @staticmethod + def _validate_upload_file(file_name: str, file_size: int) -> None: + file_type = file_name.rsplit(".", 1)[-1].lower() if "." 
in file_name else "" + if file_type not in ALLOWED_UPLOAD_EXTENSIONS: + raise ValueError(f"不支持的文件类型: {file_name}") + if file_size > MAX_UPLOAD_FILE_SIZE: + limit = KnowledgeBaseRoute._format_size_limit(MAX_UPLOAD_FILE_SIZE) + raise ValueError(f"文件超过 {limit} 限制: {file_name}") + async def _background_upload_task( self, task_id: str, @@ -158,6 +411,7 @@ async def _background_upload_task( "current": 0, "total": 100, } + await self._persist_progress(task_id) uploaded_docs = [] failed_docs = [] @@ -174,6 +428,7 @@ async def _background_upload_task( current=0, total=100, ) + await self._persist_progress(task_id) # 创建进度回调函数 progress_callback = self._make_progress_callback( @@ -214,12 +469,42 @@ async def _background_upload_task( "failed_count": len(failed_docs), } - self._set_task_result(task_id, "completed", result=result) + task_status = self._resolve_batch_task_status( + len(uploaded_docs), + len(failed_docs), + ) + task_error = self._build_batch_failure_error( + failed_docs, + success_count=len(uploaded_docs), + action="上传", + ) + self._set_task_result( + task_id, + task_status, + result=result, + error=task_error, + ) + await self._update_persistent_task( + task_id, + status=task_status, + result=result, + error=task_error, + **self._get_persistent_progress_updates(task_id), + ) except Exception as e: logger.error(f"后台上传任务 {task_id} 失败: {e}") logger.error(traceback.format_exc()) self._set_task_result(task_id, "failed", error=str(e)) + await self._update_persistent_task( + task_id, + status="failed", + error=str(e), + **self._get_persistent_progress_updates(task_id), + ) + finally: + # 兜底清理:防止客户端不轮询 get_upload_progress 导致内存泄漏 + asyncio.create_task(self._schedule_delayed_cleanup(task_id)) async def _background_import_task( self, @@ -242,6 +527,7 @@ async def _background_import_task( "current": 0, "total": 100, } + await self._persist_progress(task_id) uploaded_docs = [] failed_docs = [] @@ -261,6 +547,7 @@ async def _background_import_task( current=0, total=100, ) + 
await self._persist_progress(task_id) # 创建进度回调函数 progress_callback = self._make_progress_callback( @@ -282,6 +569,8 @@ async def _background_import_task( max_retries=max_retries, progress_callback=progress_callback, pre_chunked_text=chunks, + source_type="import", + source_uri=file_name, ) uploaded_docs.append(doc.model_dump()) @@ -304,32 +593,327 @@ async def _background_import_task( "failed_count": len(failed_docs), } - self._set_task_result(task_id, "completed", result=result) + task_status = self._resolve_batch_task_status( + len(uploaded_docs), + len(failed_docs), + ) + task_error = self._build_batch_failure_error( + failed_docs, + success_count=len(uploaded_docs), + action="导入", + ) + self._set_task_result( + task_id, + task_status, + result=result, + error=task_error, + ) + await self._update_persistent_task( + task_id, + status=task_status, + result=result, + error=task_error, + **self._get_persistent_progress_updates(task_id), + ) except Exception as e: logger.error(f"后台导入任务 {task_id} 失败: {e}") logger.error(traceback.format_exc()) self._set_task_result(task_id, "failed", error=str(e)) + await self._update_persistent_task( + task_id, + status="failed", + error=str(e), + **self._get_persistent_progress_updates(task_id), + ) + finally: + asyncio.create_task(self._schedule_delayed_cleanup(task_id)) + + async def _background_rebuild_document_task( + self, + task_id: str, + kb_helper, + doc_id: str, + chunk_size: int | None, + chunk_overlap: int | None, + batch_size: int, + tasks_limit: int, + max_retries: int, + ) -> None: + """Run a single document rebuild in the background.""" + try: + self._init_task(task_id, status="processing") + self.upload_progress[task_id] = { + "status": "processing", + "file_index": 0, + "file_total": 1, + "file_name": doc_id, + "stage": "rebuilding", + "current": 0, + "total": 100, + } + await self._persist_progress(task_id) + + progress_callback = self._make_progress_callback(task_id, 0, doc_id) + doc = await 
kb_helper.rebuild_document( + doc_id, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, + ) + + result = { + "task_id": task_id, + "rebuilt": [doc.model_dump()], + "failed": [], + "total": 1, + "success_count": 1, + "failed_count": 0, + } + self._update_progress( + task_id, + status="completed", + file_index=0, + file_name=doc_id, + stage="completed", + current=100, + total=100, + ) + self._set_task_result(task_id, "completed", result=result) + await self._update_persistent_task( + task_id, + status="completed", + result=result, + error=None, + **self._get_persistent_progress_updates(task_id), + ) + + except Exception as e: + logger.error(f"后台重建文档任务 {task_id} 失败: {e}") + logger.error(traceback.format_exc()) + self._set_task_result(task_id, "failed", error=str(e)) + await self._update_persistent_task( + task_id, + status="failed", + error=str(e), + **self._get_persistent_progress_updates(task_id), + ) + finally: + asyncio.create_task(self._schedule_delayed_cleanup(task_id)) + + async def _background_rebuild_kb_task( + self, + task_id: str, + kb_helper, + chunk_size: int | None, + chunk_overlap: int | None, + batch_size: int, + tasks_limit: int, + max_retries: int, + ) -> None: + """Run a full knowledge base rebuild in the background.""" + kb_name = getattr(getattr(kb_helper, "kb", None), "kb_name", "knowledge base") + try: + self._init_task(task_id, status="processing") + self.upload_progress[task_id] = { + "status": "processing", + "file_index": 0, + "file_total": 1, + "file_name": kb_name, + "stage": "rebuilding", + "current": 0, + "total": 100, + } + await self._persist_progress(task_id) + + progress_callback = self._make_progress_callback( + task_id, + 0, + kb_name, + ) + result = await kb_helper.rebuild_all_documents( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + 
max_retries=max_retries, + progress_callback=progress_callback, + ) + result = { + "task_id": task_id, + **result, + } + task_status = self._resolve_batch_task_status( + int(result.get("success_count") or 0), + int(result.get("failed_count") or 0), + ) + task_error = self._build_batch_failure_error( + result.get("failed") or [], + success_count=int(result.get("success_count") or 0), + action="重建", + ) + completed_total = max(int(result.get("total") or 0), 1) + self._update_progress( + task_id, + status=task_status, + file_index=0, + file_name=kb_name, + stage="completed", + current=completed_total, + total=completed_total, + ) + self._set_task_result( + task_id, + task_status, + result=result, + error=task_error, + ) + await self._update_persistent_task( + task_id, + status=task_status, + result=result, + error=task_error, + **self._get_persistent_progress_updates(task_id), + ) + + except Exception as e: + logger.error(f"后台重建知识库任务 {task_id} 失败: {e}") + logger.error(traceback.format_exc()) + self._set_task_result(task_id, "failed", error=str(e)) + await self._update_persistent_task( + task_id, + status="failed", + error=str(e), + **self._get_persistent_progress_updates(task_id), + ) + finally: + asyncio.create_task(self._schedule_delayed_cleanup(task_id)) + + async def _background_rebuild_documents_task( + self, + task_id: str, + kb_helper, + doc_ids: list[str], + chunk_size: int | None, + chunk_overlap: int | None, + batch_size: int, + tasks_limit: int, + max_retries: int, + ) -> None: + """Run selected document rebuilds in the background.""" + total = max(len(doc_ids), 1) + task_name = f"{len(doc_ids)} selected documents" + try: + self._init_task(task_id, status="processing") + self.upload_progress[task_id] = { + "status": "processing", + "file_index": 0, + "file_total": total, + "file_name": task_name, + "stage": "rebuilding", + "current": 0, + "total": total, + } + await self._persist_progress(task_id) + + progress_callback = self._make_progress_callback( + 
task_id, + 0, + task_name, + ) + result = await kb_helper.rebuild_documents( + doc_ids, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + progress_callback=progress_callback, + ) + result = { + "task_id": task_id, + **result, + } + task_status = self._resolve_batch_task_status( + int(result.get("success_count") or 0), + int(result.get("failed_count") or 0), + ) + task_error = self._build_batch_failure_error( + result.get("failed") or [], + success_count=int(result.get("success_count") or 0), + action="重建", + ) + completed_total = max(int(result.get("total") or 0), 1) + self._update_progress( + task_id, + status=task_status, + file_index=0, + file_name=task_name, + stage="completed", + current=completed_total, + total=completed_total, + ) + self._set_task_result( + task_id, + task_status, + result=result, + error=task_error, + ) + await self._update_persistent_task( + task_id, + status=task_status, + result=result, + error=task_error, + **self._get_persistent_progress_updates(task_id), + ) + + except Exception as e: + logger.error(f"后台批量重建文档任务 {task_id} 失败: {e}") + logger.error(traceback.format_exc()) + self._set_task_result(task_id, "failed", error=str(e)) + await self._update_persistent_task( + task_id, + status="failed", + error=str(e), + **self._get_persistent_progress_updates(task_id), + ) + finally: + asyncio.create_task(self._schedule_delayed_cleanup(task_id)) async def list_kbs(self): """获取知识库列表 Query 参数: - page: 页码 (默认 1) - - page_size: 每页数量 (默认 20) + - page_size: 每页数量 - refresh_stats: 是否刷新统计信息 (默认 false,首次加载时可设为 true) """ try: kb_manager = self._get_kb_manager() - page = request.args.get("page", 1, type=int) - page_size = request.args.get("page_size", 20, type=int) + page = self._get_positive_query_int("page", 1) + page_size = self._get_positive_query_int( + "page_size", + DEFAULT_KB_PAGE_SIZE, + ) + refresh_stats = request.args.get("refresh_stats") == "true" + kb_db = 
self._get_kb_db() kbs = await kb_manager.list_kbs() + total = len(kbs) + start = (page - 1) * page_size + paged_kbs = kbs[start : start + page_size] # 转换为字典列表 kb_list = [] - for kb in kbs: + for kb in paged_kbs: kb_dict = kb.model_dump() + if refresh_stats and kb_db and hasattr(kb_db, "get_kb_stats"): + stats = await kb_db.get_kb_stats(kb.kb_id) + if stats: + kb_dict.update(stats) # include init_error from KBHelper if present kb_helper = await kb_manager.get_kb(kb.kb_id) if kb_helper and kb_helper.init_error: @@ -338,7 +922,14 @@ async def list_kbs(self): return ( Response() - .ok({"items": kb_list, "page": page, "page_size": page_size}) + .ok( + { + "items": kb_list, + "page": page, + "page_size": page_size, + "total": total, + }, + ) .__dict__ ) except ValueError as e: @@ -374,11 +965,40 @@ async def create_kb(self): emoji = data.get("emoji") embedding_provider_id = data.get("embedding_provider_id") rerank_provider_id = data.get("rerank_provider_id") - chunk_size = data.get("chunk_size") - chunk_overlap = data.get("chunk_overlap") - top_k_dense = data.get("top_k_dense") - top_k_sparse = data.get("top_k_sparse") - top_m_final = data.get("top_m_final") + chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size") + chunk_overlap = self._coerce_optional_int( + data.get("chunk_overlap"), + "chunk_overlap", + ) + top_k_dense = self._coerce_optional_int( + data.get("top_k_dense"), + "top_k_dense", + ) + top_k_sparse = self._coerce_optional_int( + data.get("top_k_sparse"), + "top_k_sparse", + ) + top_m_final = self._coerce_optional_int( + data.get("top_m_final"), + "top_m_final", + ) + index_type = data.get("index_type") + self._validate_kb_options( + chunk_size=chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE, + chunk_overlap=chunk_overlap + if chunk_overlap is not None + else DEFAULT_CHUNK_OVERLAP, + top_k_dense=top_k_dense + if top_k_dense is not None + else DEFAULT_TOP_K_DENSE, + top_k_sparse=top_k_sparse + if top_k_sparse is not None + 
else DEFAULT_TOP_K_SPARSE, + top_m_final=top_m_final + if top_m_final is not None + else DEFAULT_TOP_M_FINAL, + index_type=index_type if index_type is not None else DEFAULT_INDEX_TYPE, + ) # pre-check embedding dim if not embedding_provider_id: @@ -433,6 +1053,7 @@ async def create_kb(self): top_k_dense=top_k_dense, top_k_sparse=top_k_sparse, top_m_final=top_m_final, + index_type=index_type, ) kb = kb_helper.kb @@ -495,34 +1116,72 @@ async def update_kb(self): if not kb_id: return Response().error("缺少参数 kb_id").__dict__ + update_fields = [ + "kb_name", + "description", + "emoji", + "embedding_provider_id", + "rerank_provider_id", + "chunk_size", + "chunk_overlap", + "top_k_dense", + "top_k_sparse", + "top_m_final", + "index_type", + ] + if not any(field in data for field in update_fields): + return Response().error("至少需要提供一个更新字段").__dict__ + kb_name = data.get("kb_name") description = data.get("description") emoji = data.get("emoji") embedding_provider_id = data.get("embedding_provider_id") - rerank_provider_id = data.get("rerank_provider_id") - chunk_size = data.get("chunk_size") - chunk_overlap = data.get("chunk_overlap") - top_k_dense = data.get("top_k_dense") - top_k_sparse = data.get("top_k_sparse") - top_m_final = data.get("top_m_final") - - # 检查是否至少提供了一个更新字段 - if all( - v is None - for v in [ - kb_name, - description, - emoji, - embedding_provider_id, - rerank_provider_id, - chunk_size, - chunk_overlap, - top_k_dense, - top_k_sparse, - top_m_final, - ] - ): - return Response().error("至少需要提供一个更新字段").__dict__ + rerank_provider_provided = "rerank_provider_id" in data + rerank_provider_id = ( + data.get("rerank_provider_id") if rerank_provider_provided else None + ) + chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size") + chunk_overlap = self._coerce_optional_int( + data.get("chunk_overlap"), + "chunk_overlap", + ) + top_k_dense = self._coerce_optional_int( + data.get("top_k_dense"), + "top_k_dense", + ) + top_k_sparse = 
self._coerce_optional_int( + data.get("top_k_sparse"), + "top_k_sparse", + ) + top_m_final = self._coerce_optional_int( + data.get("top_m_final"), + "top_m_final", + ) + index_type = data.get("index_type") + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + current_kb = kb_helper.kb + self._validate_kb_options( + chunk_size=chunk_size + if chunk_size is not None + else current_kb.chunk_size, + chunk_overlap=chunk_overlap + if chunk_overlap is not None + else current_kb.chunk_overlap, + top_k_dense=top_k_dense + if top_k_dense is not None + else current_kb.top_k_dense, + top_k_sparse=top_k_sparse + if top_k_sparse is not None + else current_kb.top_k_sparse, + top_m_final=top_m_final + if top_m_final is not None + else current_kb.top_m_final, + index_type=index_type + if index_type is not None + else current_kb.index_type, + ) kb_helper = await kb_manager.update_kb( kb_id=kb_id, @@ -530,12 +1189,17 @@ async def update_kb(self): description=description, emoji=emoji, embedding_provider_id=embedding_provider_id, - rerank_provider_id=rerank_provider_id, + **( + {"rerank_provider_id": rerank_provider_id} + if rerank_provider_provided + else {} + ), chunk_size=chunk_size, chunk_overlap=chunk_overlap, top_k_dense=top_k_dense, top_k_sparse=top_k_sparse, top_m_final=top_m_final, + index_type=index_type, ) if not kb_helper: @@ -594,12 +1258,28 @@ async def get_kb_stats(self): if not kb_helper: return Response().error("知识库不存在").__dict__ kb = kb_helper.kb + kb_db = self._get_kb_db() + if kb_db and hasattr(kb_db, "get_kb_stats"): + stats = await kb_db.get_kb_stats(kb_id) + if stats is not None: + return Response().ok(stats).__dict__ stats = { "kb_id": kb.kb_id, "kb_name": kb.kb_name, "doc_count": kb.doc_count, "chunk_count": kb.chunk_count, + "document_count": kb.doc_count, + "ready_document_count": kb.doc_count, + "failed_document_count": 0, + "pending_document_count": 0, + "processing_document_count": 0, + 
"indexed_chunk_count": kb.chunk_count, + "document_chunk_count": kb.chunk_count, + "media_count": 0, + "source_file_count": 0, + "storage_bytes": 0, + "status_counts": {"ready": kb.doc_count}, "created_at": kb.created_at.isoformat(), "updated_at": kb.updated_at.isoformat(), } @@ -613,38 +1293,121 @@ async def get_kb_stats(self): logger.error(traceback.format_exc()) return Response().error(f"获取知识库统计失败: {e!s}").__dict__ - # ===== 文档管理 API ===== - - async def list_documents(self): - """获取文档列表 - - Query 参数: - - kb_id: 知识库 ID (必填) - - page: 页码 (默认 1) - - page_size: 每页数量 (默认 20) - """ + async def check_kb_consistency(self): + """Check consistency across metadata, source files, and indexed chunks.""" try: kb_manager = self._get_kb_manager() kb_id = request.args.get("kb_id") if not kb_id: return Response().error("缺少参数 kb_id").__dict__ + kb_helper = await kb_manager.get_kb(kb_id) if not kb_helper: return Response().error("知识库不存在").__dict__ - page = request.args.get("page", 1, type=int) - page_size = request.args.get("page_size", 100, type=int) + report = await kb_helper.check_consistency() + return Response().ok(report).__dict__ - offset = (page - 1) * page_size - limit = page_size + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"检查知识库一致性失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"检查知识库一致性失败: {e!s}").__dict__ - doc_list = await kb_helper.list_documents(offset=offset, limit=limit) + async def repair_kb_consistency(self): + """Repair low-risk consistency issues for a knowledge base.""" + try: + kb_manager = self._get_kb_manager() + data = await request.json + + kb_id = data.get("kb_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + + repair_types = data.get("repair_types") + if repair_types is not None and not isinstance(repair_types, list): + return Response().error("repair_types 格式错误").__dict__ + + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: 
+ return Response().error("知识库不存在").__dict__ + + report = await kb_helper.repair_consistency(repair_types=repair_types) + return Response().ok(report).__dict__ + + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"修复知识库一致性失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"修复知识库一致性失败: {e!s}").__dict__ + + # ===== 文档管理 API ===== + + async def list_documents(self): + """获取文档列表 + + Query 参数: + - kb_id: 知识库 ID (必填) + - page: 页码 (默认 1) + - page_size: 每页数量 + """ + try: + kb_manager = self._get_kb_manager() + kb_id = request.args.get("kb_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + + page = self._get_positive_query_int("page", 1) + page_size = self._get_positive_query_int( + "page_size", + DEFAULT_DOCUMENT_PAGE_SIZE, + ) + search = (request.args.get("search") or "").strip() or None + status = (request.args.get("status") or "").strip() or None + source_type = (request.args.get("source_type") or "").strip() or None + if status and status not in DOCUMENT_FILTER_STATUSES: + return Response().error("status 参数无效").__dict__ + if source_type and source_type not in DOCUMENT_FILTER_SOURCE_TYPES: + return Response().error("source_type 参数无效").__dict__ + + offset = (page - 1) * page_size + limit = page_size + + doc_list = await kb_helper.list_documents( + offset=offset, + limit=limit, + search=search, + status=status, + source_type=source_type, + ) + total = await kb_helper.count_documents( + search=search, + status=status, + source_type=source_type, + ) + document_count = total + if search is not None or status is not None or source_type is not None: + document_count = await kb_helper.count_documents() doc_list = [doc.model_dump() for doc in doc_list] return ( Response() - .ok({"items": doc_list, "page": page, "page_size": page_size}) + .ok( + { + "items": doc_list, 
+ "page": page, + "page_size": page_size, + "total": total, + "filtered_total": total, + "document_count": document_count, + }, + ) .__dict__ ) @@ -683,9 +1446,9 @@ async def upload_document(self): kb_id = None chunk_size = None chunk_overlap = None - batch_size = 32 - tasks_limit = 3 - max_retries = 3 + batch_size = None + tasks_limit = None + max_retries = None files_to_upload = [] # 存储待上传的文件信息列表 if content_type and "multipart/form-data" not in content_type: @@ -696,11 +1459,46 @@ async def upload_document(self): files = await request.files kb_id = form_data.get("kb_id") - chunk_size = int(form_data.get("chunk_size", 512)) - chunk_overlap = int(form_data.get("chunk_overlap", 50)) - batch_size = int(form_data.get("batch_size", 32)) - tasks_limit = int(form_data.get("tasks_limit", 3)) - max_retries = int(form_data.get("max_retries", 3)) + chunk_size = self._coerce_optional_int( + form_data.get("chunk_size"), + "chunk_size", + ) + chunk_overlap = self._coerce_optional_int( + form_data.get("chunk_overlap"), + "chunk_overlap", + ) + batch_size = self._coerce_optional_int( + form_data.get("batch_size"), + "batch_size", + ) + tasks_limit = self._coerce_optional_int( + form_data.get("tasks_limit"), + "tasks_limit", + ) + max_retries = self._coerce_optional_int( + form_data.get("max_retries"), + "max_retries", + ) + chunk_size = chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE + chunk_overlap = ( + chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP + ) + batch_size = ( + batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE + ) + tasks_limit = ( + tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT + ) + max_retries = ( + max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES + ) + self._validate_upload_options( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + ) if not kb_id: return Response().error("缺少参数 
kb_id").__dict__ @@ -716,8 +1514,10 @@ async def upload_document(self): return Response().error("缺少文件").__dict__ # 限制文件数量 - if len(file_list) > 10: - return Response().error("最多只能上传10个文件").__dict__ + if len(file_list) > MAX_UPLOAD_FILES: + return ( + Response().error(f"最多只能上传{MAX_UPLOAD_FILES}个文件").__dict__ + ) # 处理每个文件 for file in file_list: @@ -739,6 +1539,7 @@ async def upload_document(self): file_type = ( file_name.rsplit(".", 1)[-1].lower() if "." in file_name else "" ) + self._validate_upload_file(file_name, len(file_content)) files_to_upload.append( { @@ -762,6 +1563,20 @@ async def upload_document(self): # 初始化任务状态 self._init_task(task_id, status="pending") + await self._create_persistent_task( + task_id=task_id, + kb_id=kb_id, + task_type="upload", + status="pending", + progress={ + "status": "pending", + "file_index": 0, + "file_total": len(files_to_upload), + "stage": "waiting", + "current": 0, + "total": 100, + }, + ) # 启动后台任务 asyncio.create_task( @@ -815,9 +1630,20 @@ def _validate_import_request(self, data: dict): ): raise ValueError("chunks 必须是非空字符串列表") - batch_size = data.get("batch_size", 32) - tasks_limit = data.get("tasks_limit", 3) - max_retries = data.get("max_retries", 3) + batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size") + tasks_limit = self._coerce_optional_int(data.get("tasks_limit"), "tasks_limit") + max_retries = self._coerce_optional_int(data.get("max_retries"), "max_retries") + batch_size = batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE + tasks_limit = ( + tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT + ) + max_retries = ( + max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES + ) + self._validate_positive_int(batch_size, "batch_size") + self._validate_positive_int(tasks_limit, "tasks_limit") + if max_retries < 0: + raise ValueError("max_retries 不能为负数") return kb_id, documents, batch_size, tasks_limit, max_retries async def import_documents(self): 
@@ -851,6 +1677,20 @@ async def import_documents(self): # 初始化任务状态 self._init_task(task_id, status="pending") + await self._create_persistent_task( + task_id=task_id, + kb_id=kb_id, + task_type="import", + status="pending", + progress={ + "status": "pending", + "file_index": 0, + "file_total": len(documents), + "stage": "waiting", + "current": 0, + "total": 100, + }, + ) # 启动后台任务 asyncio.create_task( @@ -893,6 +1733,7 @@ async def get_upload_progress(self): - pending: 任务待处理 - processing: 任务处理中 - completed: 任务完成 + - partial_failed: 任务部分失败 - failed: 任务失败 """ try: @@ -902,7 +1743,27 @@ async def get_upload_progress(self): # 检查任务是否存在 if task_id not in self.upload_tasks: - return Response().error("找不到该任务").__dict__ + persistent_task = await self._get_persistent_task(task_id) + if persistent_task is None: + return Response().error("找不到该任务").__dict__ + response_data = { + "task_id": task_id, + "status": persistent_task["status"], + "progress_stage": persistent_task.get("progress_stage"), + "progress_current": persistent_task.get("progress_current", 0), + "progress_total": persistent_task.get("progress_total", 100), + } + if persistent_task.get("progress") is not None: + response_data["progress"] = persistent_task["progress"] + if persistent_task["status"] in ( + "completed", + "partial_failed", + "failed", + ): + response_data["result"] = persistent_task.get("result") + if persistent_task["status"] in ("partial_failed", "failed"): + response_data["error"] = persistent_task.get("error") + return Response().ok(response_data).__dict__ task_info = self.upload_tasks[task_id] status = task_info["status"] @@ -918,17 +1779,17 @@ async def get_upload_progress(self): response_data["progress"] = self.upload_progress[task_id] # 如果任务完成,返回结果 - if status == "completed": + if status in ("completed", "partial_failed", "failed"): response_data["result"] = task_info["result"] - # 清理已完成的任务 - # del self.upload_tasks[task_id] - # if task_id in self.upload_progress: - # del 
self.upload_progress[task_id] - # 如果任务失败,返回错误信息 - if status == "failed": + # 如果任务存在失败项,返回错误信息 + if status in ("partial_failed", "failed"): response_data["error"] = task_info["error"] + # 清理已结束的任务,释放内存 + if status in ("completed", "partial_failed", "failed"): + self._cleanup_task(task_id) + return Response().ok(response_data).__dict__ except Exception as e: @@ -936,6 +1797,69 @@ async def get_upload_progress(self): logger.error(traceback.format_exc()) return Response().error(f"获取上传进度失败: {e!s}").__dict__ + async def get_task(self): + """获取知识库持久任务详情""" + try: + task_id = request.args.get("task_id") + if not task_id: + return Response().error("缺少参数 task_id").__dict__ + + task = await self._get_persistent_task(task_id) + if not task: + return Response().error("任务不存在").__dict__ + return Response().ok(task).__dict__ + + except Exception as e: + logger.error(f"获取知识库任务失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"获取知识库任务失败: {e!s}").__dict__ + + async def list_tasks(self): + """列出知识库持久任务""" + try: + kb_db = self._get_kb_db() + if not kb_db: + return Response().error("知识库数据库未初始化").__dict__ + + page = self._get_positive_query_int("page", 1) + page_size = self._get_positive_query_int( + "page_size", + DEFAULT_DOCUMENT_PAGE_SIZE, + ) + kb_id = (request.args.get("kb_id") or "").strip() or None + status = (request.args.get("status") or "").strip() or None + task_type = (request.args.get("task_type") or "").strip() or None + + tasks = await kb_db.list_ingestion_tasks( + kb_id=kb_id, + status=status, + task_type=task_type, + offset=(page - 1) * page_size, + limit=page_size, + ) + total = await kb_db.count_ingestion_tasks( + kb_id=kb_id, + status=status, + task_type=task_type, + ) + return ( + Response() + .ok( + { + "items": tasks, + "total": total, + "page": page, + "page_size": page_size, + }, + ) + .__dict__ + ) + + except Exception as e: + logger.error(f"获取知识库任务列表失败: {e}") + logger.error(traceback.format_exc()) + return 
Response().error(f"获取知识库任务列表失败: {e!s}").__dict__ + async def get_document(self): """获取文档详情 @@ -999,6 +1923,425 @@ async def delete_document(self): logger.error(traceback.format_exc()) return Response().error(f"删除文档失败: {e!s}").__dict__ + async def rebuild_document(self): + """重建单个文档""" + try: + kb_manager = self._get_kb_manager() + data = await request.json + + kb_id = data.get("kb_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + doc_id = data.get("doc_id") + if not doc_id: + return Response().error("缺少参数 doc_id").__dict__ + + chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size") + chunk_overlap = self._coerce_optional_int( + data.get("chunk_overlap"), + "chunk_overlap", + ) + batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size") + tasks_limit = self._coerce_optional_int( + data.get("tasks_limit"), + "tasks_limit", + ) + max_retries = self._coerce_optional_int( + data.get("max_retries"), + "max_retries", + ) + effective_chunk_size = ( + chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE + ) + effective_chunk_overlap = ( + chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP + ) + effective_batch_size = ( + batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE + ) + effective_tasks_limit = ( + tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT + ) + effective_max_retries = ( + max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES + ) + self._validate_upload_options( + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ) + background = self._coerce_optional_bool( + data.get("background"), + "background", + ) + + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + + if background: + task_id = str(uuid.uuid4()) + self._init_task(task_id, 
status="pending") + await self._create_persistent_task( + task_id=task_id, + kb_id=kb_id, + task_type="document_rebuild", + status="pending", + progress={ + "status": "pending", + "file_index": 0, + "file_total": 1, + "file_name": doc_id, + "stage": "waiting", + "current": 0, + "total": 100, + }, + ) + asyncio.create_task( + self._background_rebuild_document_task( + task_id=task_id, + kb_helper=kb_helper, + doc_id=doc_id, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ), + ) + return ( + Response() + .ok( + { + "task_id": task_id, + "doc_id": doc_id, + "message": ( + "document rebuild task created, " + "processing in background" + ), + }, + ) + .__dict__ + ) + + doc = await kb_helper.rebuild_document( + doc_id, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ) + return Response().ok(doc.model_dump(), "重建文档成功").__dict__ + + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"重建文档失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"重建文档失败: {e!s}").__dict__ + + async def rebuild_kb(self): + """重建整个知识库""" + try: + kb_manager = self._get_kb_manager() + data = await request.json + + kb_id = data.get("kb_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + + chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size") + chunk_overlap = self._coerce_optional_int( + data.get("chunk_overlap"), + "chunk_overlap", + ) + batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size") + tasks_limit = self._coerce_optional_int( + data.get("tasks_limit"), + "tasks_limit", + ) + max_retries = self._coerce_optional_int( + data.get("max_retries"), + "max_retries", + ) + effective_chunk_size = ( + chunk_size if chunk_size 
is not None else DEFAULT_CHUNK_SIZE + ) + effective_chunk_overlap = ( + chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP + ) + effective_batch_size = ( + batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE + ) + effective_tasks_limit = ( + tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT + ) + effective_max_retries = ( + max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES + ) + self._validate_upload_options( + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ) + background = self._coerce_optional_bool( + data.get("background"), + "background", + ) + + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + + if background: + kb_name = getattr( + getattr(kb_helper, "kb", None), + "kb_name", + "knowledge base", + ) + task_id = str(uuid.uuid4()) + self._init_task(task_id, status="pending") + await self._create_persistent_task( + task_id=task_id, + kb_id=kb_id, + task_type="kb_rebuild", + status="pending", + progress={ + "status": "pending", + "file_index": 0, + "file_total": 1, + "file_name": kb_name, + "stage": "waiting", + "current": 0, + "total": 100, + }, + ) + asyncio.create_task( + self._background_rebuild_kb_task( + task_id=task_id, + kb_helper=kb_helper, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ), + ) + return ( + Response() + .ok( + { + "task_id": task_id, + "kb_id": kb_id, + "message": ( + "knowledge base rebuild task created, " + "processing in background" + ), + }, + ) + .__dict__ + ) + + result = await kb_helper.rebuild_all_documents( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + 
max_retries=effective_max_retries, + ) + return Response().ok(result, "重建知识库完成").__dict__ + + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"重建知识库失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"重建知识库失败: {e!s}").__dict__ + + async def batch_rebuild_documents(self): + """Start a background task to rebuild selected documents. + + Body: + - kb_id: knowledge base ID (required) + - doc_ids: document ID list (required) + """ + try: + kb_manager = self._get_kb_manager() + data = await request.json + + kb_id = data.get("kb_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + doc_ids = data.get("doc_ids") + if not doc_ids or not isinstance(doc_ids, list): + return Response().error("缺少参数 doc_ids 或格式错误").__dict__ + normalized_doc_ids = list( + dict.fromkeys( + doc_id.strip() + for doc_id in doc_ids + if isinstance(doc_id, str) and doc_id.strip() + ) + ) + if not normalized_doc_ids: + return Response().error("缺少参数 doc_ids 或格式错误").__dict__ + if len(normalized_doc_ids) > MAX_BATCH_REBUILD_DOCUMENTS: + return ( + Response() + .error(f"最多只能批量重建 {MAX_BATCH_REBUILD_DOCUMENTS} 个文档") + .__dict__ + ) + + chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size") + chunk_overlap = self._coerce_optional_int( + data.get("chunk_overlap"), + "chunk_overlap", + ) + batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size") + tasks_limit = self._coerce_optional_int( + data.get("tasks_limit"), + "tasks_limit", + ) + max_retries = self._coerce_optional_int( + data.get("max_retries"), + "max_retries", + ) + effective_chunk_size = ( + chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE + ) + effective_chunk_overlap = ( + chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP + ) + effective_batch_size = ( + batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE + ) + effective_tasks_limit = ( + tasks_limit if 
tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT + ) + effective_max_retries = ( + max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES + ) + self._validate_upload_options( + chunk_size=effective_chunk_size, + chunk_overlap=effective_chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ) + + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + + task_id = str(uuid.uuid4()) + self._init_task(task_id, status="pending") + await self._create_persistent_task( + task_id=task_id, + kb_id=kb_id, + task_type="document_batch_rebuild", + status="pending", + progress={ + "status": "pending", + "file_index": 0, + "file_total": len(normalized_doc_ids), + "file_name": f"{len(normalized_doc_ids)} selected documents", + "stage": "waiting", + "current": 0, + "total": len(normalized_doc_ids), + }, + ) + asyncio.create_task( + self._background_rebuild_documents_task( + task_id=task_id, + kb_helper=kb_helper, + doc_ids=normalized_doc_ids, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=effective_batch_size, + tasks_limit=effective_tasks_limit, + max_retries=effective_max_retries, + ), + ) + return ( + Response() + .ok( + { + "task_id": task_id, + "doc_ids": normalized_doc_ids, + "message": ( + "document batch rebuild task created, " + "processing in background" + ), + }, + ) + .__dict__ + ) + + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"批量重建文档失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"批量重建文档失败: {e!s}").__dict__ + + async def batch_delete_documents(self): + """批量删除文档 + + Body: + - kb_id: 知识库 ID (必填) + - doc_ids: 文档 ID 列表 (必填, 最多 100 个) + """ + try: + kb_manager = self._get_kb_manager() + data = await request.json + + kb_id = data.get("kb_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + doc_ids 
= data.get("doc_ids") + if not doc_ids or not isinstance(doc_ids, list): + return Response().error("缺少参数 doc_ids 或格式错误").__dict__ + if len(doc_ids) > MAX_BATCH_DELETE_DOCUMENTS: + return ( + Response() + .error(f"最多只能批量删除 {MAX_BATCH_DELETE_DOCUMENTS} 个文档") + .__dict__ + ) + + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + + results = await kb_helper.delete_documents(doc_ids) + + success_count = sum(1 for v in results.values() if v) + failed_count = len(doc_ids) - success_count + + return ( + Response() + .ok( + { + "results": results, + "total": len(doc_ids), + "success_count": success_count, + "failed_count": failed_count, + }, + "批量删除完成", + ) + .__dict__ + ) + + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"批量删除文档失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"批量删除文档失败: {e!s}").__dict__ + async def delete_chunk(self): """删除文本块 @@ -1040,14 +2383,18 @@ async def list_chunks(self): Query 参数: - kb_id: 知识库 ID (必填) - page: 页码 (默认 1) - - page_size: 每页数量 (默认 20) + - page_size: 每页数量 """ try: kb_manager = self._get_kb_manager() kb_id = request.args.get("kb_id") doc_id = request.args.get("doc_id") - page = request.args.get("page", 1, type=int) - page_size = request.args.get("page_size", 100, type=int) + page = self._get_positive_query_int("page", 1) + page_size = self._get_positive_query_int( + "page_size", + DEFAULT_CHUNK_PAGE_SIZE, + ) + search = (request.args.get("search") or "").strip() or None if not kb_id: return Response().error("缺少参数 kb_id").__dict__ if not doc_id: @@ -1057,11 +2404,15 @@ async def list_chunks(self): limit = page_size if not kb_helper: return Response().error("知识库不存在").__dict__ - chunk_list = await kb_helper.get_chunks_by_doc_id( + chunk_list, total = await kb_helper.search_chunks_by_doc_id( doc_id=doc_id, + search=search, offset=offset, limit=limit, ) + document_chunk_count = total + if search is 
not None: + document_chunk_count = await kb_helper.get_chunk_count_by_doc_id(doc_id) return ( Response() .ok( @@ -1069,7 +2420,9 @@ async def list_chunks(self): "items": chunk_list, "page": page, "page_size": page_size, - "total": await kb_helper.get_chunk_count_by_doc_id(doc_id), + "total": total, + "filtered_total": total, + "document_chunk_count": document_chunk_count, }, ) .__dict__ @@ -1081,6 +2434,41 @@ async def list_chunks(self): logger.error(traceback.format_exc()) return Response().error(f"获取块列表失败: {e!s}").__dict__ + async def get_chunk_context(self): + """获取文本块和相邻上下文块 + + Query 参数: + - kb_id: 知识库 ID (必填) + - doc_id: 文档 ID (必填) + - chunk_id: 文本块 ID (必填) + """ + try: + kb_manager = self._get_kb_manager() + kb_id = request.args.get("kb_id") + doc_id = request.args.get("doc_id") + chunk_id = request.args.get("chunk_id") + if not kb_id: + return Response().error("缺少参数 kb_id").__dict__ + if not doc_id: + return Response().error("缺少参数 doc_id").__dict__ + if not chunk_id: + return Response().error("缺少参数 chunk_id").__dict__ + + kb_helper = await kb_manager.get_kb(kb_id) + if not kb_helper: + return Response().error("知识库不存在").__dict__ + context = await kb_helper.get_chunk_context( + chunk_id=chunk_id, + doc_id=doc_id, + ) + return Response().ok(data=context).__dict__ + except ValueError as e: + return Response().error(str(e)).__dict__ + except Exception as e: + logger.error(f"获取文本块上下文失败: {e}") + logger.error(traceback.format_exc()) + return Response().error(f"获取文本块上下文失败: {e!s}").__dict__ + # ===== 检索 API ===== async def retrieve(self): @@ -1097,20 +2485,35 @@ async def retrieve(self): data = await request.json query = data.get("query") + kb_ids = data.get("kb_ids") kb_names = data.get("kb_names") - debug = data.get("debug", False) + debug = self._coerce_optional_bool(data.get("debug", False), "debug") + trace = self._coerce_optional_bool(data.get("trace", False), "trace") if not query: return Response().error("缺少参数 query").__dict__ - if not kb_names or not 
isinstance(kb_names, list): - return Response().error("缺少参数 kb_names 或格式错误").__dict__ - - top_k = data.get("top_k", 5) + if kb_ids is not None and not isinstance(kb_ids, list): + return Response().error("参数 kb_ids 格式错误").__dict__ + if kb_names is not None and not isinstance(kb_names, list): + return Response().error("参数 kb_names 格式错误").__dict__ + if not kb_ids and not kb_names: + return Response().error("缺少参数 kb_ids 或 kb_names").__dict__ + + top_k = self._coerce_optional_int( + data.get("top_k", DEFAULT_TOP_M_FINAL), + "top_k", + ) + top_k = top_k if top_k is not None else DEFAULT_TOP_M_FINAL + self._validate_positive_int(top_k, "top_k") + if top_k > MAX_RETRIEVE_TOP_K: + return Response().error(f"top_k 不能大于 {MAX_RETRIEVE_TOP_K}").__dict__ results = await kb_manager.retrieve( query=query, kb_names=kb_names, + kb_ids=kb_ids, top_m_final=top_k, + include_trace=trace or debug, ) result_list = [] if results: @@ -1121,13 +2524,21 @@ async def retrieve(self): "total": len(result_list), "query": query, } + if results and "trace" in results: + response_data["trace"] = results["trace"] # Debug 模式:生成 t-SNE 可视化 if debug: try: + visualization_kb_names = kb_names + if not visualization_kb_names and kb_ids: + visualization_kb_names = [] + for kb_id in kb_ids: + if kb_helper := await kb_manager.get_kb(kb_id): + visualization_kb_names.append(kb_helper.kb.kb_name) img_base64 = await generate_tsne_visualization( query, - kb_names, + visualization_kb_names or [], kb_manager, ) if img_base64: @@ -1173,11 +2584,40 @@ async def upload_document_from_url(self): if not url: return Response().error("缺少参数 url").__dict__ - chunk_size = data.get("chunk_size", 512) - chunk_overlap = data.get("chunk_overlap", 50) - batch_size = data.get("batch_size", 32) - tasks_limit = data.get("tasks_limit", 3) - max_retries = data.get("max_retries", 3) + chunk_size = self._coerce_optional_int(data.get("chunk_size"), "chunk_size") + chunk_overlap = self._coerce_optional_int( + data.get("chunk_overlap"), + 
"chunk_overlap", + ) + batch_size = self._coerce_optional_int(data.get("batch_size"), "batch_size") + tasks_limit = self._coerce_optional_int( + data.get("tasks_limit"), + "tasks_limit", + ) + max_retries = self._coerce_optional_int( + data.get("max_retries"), + "max_retries", + ) + chunk_size = chunk_size if chunk_size is not None else DEFAULT_CHUNK_SIZE + chunk_overlap = ( + chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP + ) + batch_size = ( + batch_size if batch_size is not None else DEFAULT_UPLOAD_BATCH_SIZE + ) + tasks_limit = ( + tasks_limit if tasks_limit is not None else DEFAULT_UPLOAD_TASKS_LIMIT + ) + max_retries = ( + max_retries if max_retries is not None else DEFAULT_UPLOAD_MAX_RETRIES + ) + self._validate_upload_options( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + batch_size=batch_size, + tasks_limit=tasks_limit, + max_retries=max_retries, + ) enable_cleaning = data.get("enable_cleaning", False) cleaning_provider_id = data.get("cleaning_provider_id") @@ -1191,6 +2631,21 @@ async def upload_document_from_url(self): # 初始化任务状态 self._init_task(task_id, status="pending") + await self._create_persistent_task( + task_id=task_id, + kb_id=kb_id, + task_type="url", + status="pending", + progress={ + "status": "pending", + "file_index": 0, + "file_total": 1, + "file_name": f"URL: {url}", + "stage": "waiting", + "current": 0, + "total": 100, + }, + ) # 启动后台任务 asyncio.create_task( @@ -1253,6 +2708,7 @@ async def _background_upload_from_url_task( "current": 0, "total": 100, } + await self._persist_progress(task_id) # 创建进度回调函数 progress_callback = self._make_progress_callback(task_id, 0, f"URL: {url}") @@ -1281,8 +2737,23 @@ async def _background_upload_from_url_task( } self._set_task_result(task_id, "completed", result=result) + await self._update_persistent_task( + task_id, + status="completed", + result=result, + error=None, + **self._get_persistent_progress_updates(task_id), + ) except Exception as e: logger.error(f"后台上传URL任务 
{task_id} 失败: {e}") logger.error(traceback.format_exc()) self._set_task_result(task_id, "failed", error=str(e)) + await self._update_persistent_task( + task_id, + status="failed", + error=str(e), + **self._get_persistent_progress_updates(task_id), + ) + finally: + asyncio.create_task(self._schedule_delayed_cleanup(task_id)) diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json index 78a00669e3..d8df20cc68 100644 --- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json +++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/detail.json @@ -21,36 +21,183 @@ "stats": "Statistics", "docCount": "Documents", "chunkCount": "Chunks", + "readyDocCount": "Ready Documents", + "failedDocCount": "Failed Documents", + "sourceFiles": "Source Files", + "storageUsed": "Storage Used", "embeddingModel": "Embedding Model", "rerankModel": "Rerank Model", "notSet": "Not Set" }, + "consistency": { + "title": "Index Consistency", + "run": "Run Check", + "repair": "Repair Fixable Issues", + "notRun": "No consistency check has been run yet. Run a check to compare document metadata, source files, and indexed chunks.", + "notRunHint": "A full check reads index metadata and lists fixable issues.", + "notRunChunkMismatch": "Current snapshot has {metadata} document chunks but {indexed} indexed chunks. Run a check.", + "notRunFailedDocs": "{count} documents are failed. 
Review the document list or run a consistency check.", + "healthy": "No consistency issues found", + "unhealthy": "{count} consistency issues found", + "checkedAt": "Checked at: {time}", + "sqliteDocuments": "Metadata Documents", + "indexedChunks": "Indexed Chunks", + "documentChunks": "Document Chunks", + "sourceFiles": "Source Files", + "expectedChunks": "{count} expected chunks", + "actualChunks": "{count} actual chunks", + "checkSuccessHealthy": "Consistency check completed with no issues", + "checkSuccessUnhealthy": "Consistency check completed with {count} issues", + "checkFailed": "Consistency check failed", + "repairSuccess": "Consistency repair completed: {repaired} repaired, {skipped} skipped", + "repairPartialSuccess": "Consistency repair partially completed: {repaired} repaired, {skipped} skipped, {failed} failed", + "repairFailed": "Consistency repair failed", + "issues": { + "missingVectors": "Documents Missing Indexed Chunks", + "orphanVectors": "Orphan Indexed Chunks", + "missingSourceFiles": "Missing Source Files", + "chunkCountMismatches": "Chunk Count Mismatches", + "invalidVectorMetadata": "Invalid Index Metadata", + "unsafeSourcePaths": "Unsafe Source Paths" + }, + "reasons": { + "empty_file_path": "Source file path is empty", + "outside_kb_files_dir": "Source file path is outside the knowledge base directory", + "not_found": "Source file does not exist" + } + }, + "maintenance": { + "rebuild": "Rebuild Index", + "rebuildStarted": "Knowledge base rebuild started", + "rebuildSuccess": "Knowledge base rebuild completed", + "rebuildFailed": "Failed to rebuild knowledge base", + "rebuildFailedWithReason": "Failed to rebuild knowledge base: {reason}", + "rebuildPartialSuccess": "Knowledge base rebuild partially completed: {success} succeeded, {failed} failed", + "unknownError": "Unknown error", + "stages": { + "waiting": "Waiting...", + "rebuilding": "Rebuilding knowledge base...", + "parsing": "Parsing document...", + "chunking": "Chunking 
text...", + "embedding": "Generating embeddings...", + "completed": "Completed" + } + }, + "tasks": { + "title": "Recent Tasks", + "refresh": "Refresh tasks", + "empty": "No task records yet", + "loadFailed": "Failed to load recent tasks", + "recentFailures": "Recent failures", + "noErrorMessage": "No error message", + "resultSummary": "{total} total, {success} succeeded, {failed} failed", + "progressDetail": "Progress {progress}", + "types": { + "upload": "Document Upload", + "import": "Document Import", + "url": "URL Import", + "document_rebuild": "Document Rebuild", + "document_batch_rebuild": "Batch Document Rebuild", + "kb_rebuild": "Knowledge Base Rebuild" + }, + "statuses": { + "pending": "Pending", + "processing": "Processing", + "completed": "Completed", + "partial_failed": "Partially failed", + "failed": "Failed" + } + }, "documents": { "title": "Documents", "upload": "Upload Document", "empty": "No documents", + "searchPlaceholder": "Search documents...", + "statusFilter": "Status", + "sourceFilter": "Source", + "allStatuses": "All Statuses", + "allSources": "All Sources", + "filteredCount": "Showing {filtered} / {total} documents", "name": "Name", "type": "Type", + "status": "Status", "size": "Size", "chunks": "Chunks", "createdAt": "Uploaded At", "actions": "Actions", "view": "View", + "copyFailure": "Copy Failure Diagnostics", + "rebuild": "Retry Rebuild", "delete": "Delete", + "rebuildTitle": "Rebuild Document Index", + "rebuildConfirm": "Rebuild the index for document '{name}'?", + "rebuildWarning": "Rebuild will parse and write the index again. The previous index may still be used until the task finishes.", + "batchRebuild": "Rebuild Selected ({count})", + "batchRebuildTitle": "Rebuild Selected Documents", + "batchRebuildConfirm": "Rebuild the index for the {count} selected documents?", + "batchRebuildMore": "{count} more", + "batchRebuildWarning": "Batch rebuild will parse and write indexes for the selected documents again. 
Previous indexes may still be used until the task finishes.", + "batchDelete": "Delete Selected ({count})", + "batchDeleteTitle": "Delete Selected Documents", + "batchDeleteConfirm": "Delete the {count} selected documents?", + "batchDeleteMore": "{count} more", + "cancel": "Cancel", "deleteConfirm": "Are you sure you want to delete document '{name}'?", "deleteWarning": "This will delete the document and all its chunks. This action cannot be undone.", "uploading": "Uploading...", "uploadSuccess": "Document uploaded successfully", "uploadFailed": "Failed to upload document", + "loadFailed": "Failed to load documents", "deleteSuccess": "Document deleted successfully", - "deleteFailed": "Failed to delete document" + "deleteFailed": "Failed to delete document", + "batchDeleteSuccess": "{count} documents deleted", + "batchDeletePartialSuccess": "Batch delete partially completed: {success} succeeded, {failed} failed", + "batchDeleteFailed": "Failed to batch delete documents", + "batchDeleteLimitExceeded": "You can delete up to {limit} documents at once", + "batchRebuildStarted": "Started rebuilding {count} documents", + "batchRebuildFailed": "Failed to batch rebuild documents", + "batchRebuildLimitExceeded": "You can rebuild up to {limit} documents at once", + "failureDocument": "Document", + "failureDocumentId": "Document ID", + "failureStage": "Failure Stage", + "failureMessage": "Error Message", + "unknownFailureStage": "Unknown Stage", + "noFailureMessage": "No error message", + "copyFailureSuccess": "Failure diagnostics copied", + "copyFailureFailed": "Failed to copy failure diagnostics", + "rebuildStarted": "Document rebuild started", + "rebuildSuccess": "Document rebuilt successfully", + "rebuildFailed": "Failed to rebuild document", + "rebuildFailedWithReason": "Failed to rebuild document: {reason}", + "rebuildPartialSuccess": "Document rebuild partially completed: {success} succeeded, {failed} failed", + "statuses": { + "pending": "Pending", + "parsing": 
"Parsing", + "chunking": "Chunking", + "embedding": "Indexing", + "ready": "Ready", + "failed": "Failed" + }, + "sourceTypes": { + "file": "File", + "url": "URL", + "import": "Import" + } }, "upload": { "title": "Upload Document", "selectFile": "Select File", "dropzone": "Drop files here or click to select", - "supportedFormats": "Supported formats: .txt, .md, .markdown, .rst, .adoc, .pdf, .docx, .epub, .xls, .xlsx", - "maxSize": "Max file size: 128MB", + "supportedFormats": "Supported formats: {formats}", + "maxSize": "Max file size: {size}", + "maxFiles": "Upload up to {count} files", + "maxFilesWarning": "You can select up to {count} files", + "selectedFiles": "{count} files selected", + "clear": "Clear", + "someFilesRejected": "Some files were not added", + "unsupportedFile": "{name}: unsupported file type", + "fileTooLarge": "{name}: file exceeds {size}", + "invalidSettings": "Please check the upload settings", "chunkSettings": "Chunk Settings", "batchSettings": "Batch Settings", "cleaningSettings": "Cleaning Settings", @@ -58,15 +205,15 @@ "cleaningProvider": "Cleaning Service Provider", "cleaningProviderHint": "Select an LLM provider to clean and summarize the extracted web page content", "chunkSize": "Chunk Size", - "chunkSizeHint": "Number of characters per chunk (default: 512)", + "chunkSizeHint": "Number of characters per chunk (default: {value})", "chunkOverlap": "Chunk Overlap", - "chunkOverlapHint": "Overlapping characters between chunks (default: 50)", + "chunkOverlapHint": "Overlapping characters between chunks (default: {value})", "batchSize": "Batch Size", - "batchSizeHint": "Number of chunks to process in each batch (default: 32)", + "batchSizeHint": "Number of chunks to process in each batch (default: {value})", "tasksLimit": "Concurrent Tasks Limit", - "tasksLimitHint": "Maximum number of concurrent upload tasks (default: 3)", + "tasksLimitHint": "Maximum number of concurrent upload tasks (default: {value})", "maxRetries": "Max Retries", - 
"maxRetriesHint": "Number of times to retry a failed upload task (default: 3)", + "maxRetriesHint": "Number of times to retry a failed upload task (default: {value})", "cancel": "Cancel", "submit": "Upload", "fileRequired": "Please select a file to upload", @@ -75,6 +222,27 @@ "urlPlaceholder": "Enter the URL of the web page to extract content from", "urlRequired": "Please enter a URL", "urlHint": "The main content will be automatically extracted from the target URL as a document. Currently supports {supported} pages. Before use, please ensure that the target web page allows crawler access.", + "unsupportedUrlImport": "URL import is not enabled by the backend", + "tavilyCheckFailed": "Failed to check web search configuration", + "tavilyRequired": "Tavily Key is required for this feature", + "configure": "Configure", + "tavilyConfigured": "Tavily API Key configured", + "backgroundUploading": "Uploading {count} files in the background...", + "backgroundUrlUploading": "Extracting URL content in the background...", + "successCount": "Successfully uploaded {count} documents", + "partialSuccess": "Upload finished: {success} succeeded, {failed} failed", + "failedWithReason": "Upload failed: {reason}", + "unknownError": "Unknown error", + "stages": { + "waiting": "Waiting...", + "extracting": "Extracting content...", + "cleaning": "Cleaning content...", + "parsing": "Parsing document...", + "chunking": "Chunking text...", + "embedding": "Generating embeddings...", + "rebuilding": "Rebuilding document...", + "completed": "Completed" + }, "beta": "Beta" }, "retrieval": { @@ -88,6 +256,14 @@ "noResults": "No results found", "tryDifferentQuery": "Try a different query", "settings": "Retrieval Settings", + "debugMode": "Debug Mode", + "debugModeTsne": "Debug Mode (t-SNE)", + "traceMode": "Retrieval Trace", + "cancel": "Cancel", + "caseNotesPlaceholder": "Example: sparse retrieval ranked too low", + "caseTags": "Tags", + "caseTagsPlaceholder": "Example: manual, retrieval-ui, 
bad-case", + "tsneVisualization": "t-SNE Visualization", "topK": "Number of Results", "topKHint": "Maximum number of results to return", "enableRerank": "Enable Rerank", @@ -97,9 +273,40 @@ "chunk": "Chunk #{index}", "content": "Content", "charCount": "{count} characters", + "traceTitle": "Retrieval Trace", + "traceStageCount": "{count} stages", + "traceHits": "{count} hits", + "traceDenseRank": "Dense rank #{rank}", + "traceSparseRank": "Sparse rank #{rank}", + "traceDenseScore": "Dense score", + "traceSparseScore": "Sparse score", + "traceRrfScore": "RRF score", + "traceRerankScore": "Rerank score", + "traceDuplicateOf": "Duplicate of {chunk}", + "traceDedupSimilarity": "Duplicate similarity {value}", + "sourcePage": "Page {page}", + "sourceSection": "Section {index}", + "sourceParentChunk": "Parent chunk {id}", + "tracePreviewEmpty": "No content preview", + "traceEmpty": "No candidates in this stage", + "unknownDocument": "Unknown document", + "traceStages": { + "dense": "Dense Recall", + "sparse": "Sparse Recall", + "fusion": "RRF Fusion", + "dedup": "Near-Duplicate Removal", + "dedup_removed": "Removed Duplicates", + "rerank": "Rerank", + "final": "Final Context" + }, "searchSuccess": "Search completed, found {count} results", "searchFailed": "Search failed", - "queryRequired": "Please enter a query" + "queryRequired": "Please enter a query", + "latestRunResults": "Latest Results", + "metricRecall": "Recall", + "metricNdcg": "nDCG", + "metricPrecision": "Precision", + "metricFirstHit": "First Hit" }, "settings": { "title": "Knowledge Base Settings", @@ -113,9 +320,43 @@ "enableRerank": "Enable Rerank", "embeddingProvider": "Embedding Provider", "rerankProvider": "Rerank Provider", + "embeddingProviderHint": "The embedding model is bound to the current vector index. 
Create a new knowledge base to change it.", + "indexType": "Index Type", + "indexTypeHint": "Flat is exact; HNSW is better for larger knowledge bases.", + "indexTypes": { + "flat": "Flat exact index", + "hnsw": "HNSW approximate index" + }, "save": "Save Settings", "saveSuccess": "Settings saved successfully", "saveFailed": "Failed to save settings", + "providersLoadFailed": "Failed to load model providers", "tips": "Tip: Modifying retrieval settings will affect subsequent knowledge base queries." + }, + "validation": { + "integer": "Enter an integer", + "positiveInteger": "Enter an integer greater than 0", + "nonNegativeInteger": "Enter an integer no less than 0", + "overlapLessThanSize": "Chunk overlap must be less than chunk size", + "topKRange": "Number of results must be an integer from 1 to {max}" + }, + "actions": { + "retry": "Retry" + }, + "messages": { + "loadFailed": "Failed to load knowledge base details" + }, + "tavily": { + "title": "Configure Tavily API Key", + "description": "A Tavily API Key is required to use web-based knowledge base features. You can get one from", + "officialSite": "Tavily", + "apiKeyLabel": "Tavily API Key", + "apiKeyPlaceholder": "tvly-...", + "cancel": "Cancel", + "save": "Save", + "keyRequired": "API Key is required", + "loadConfigFailed": "Failed to load current configuration", + "saveFailed": "Failed to save. 
Please check the key.", + "unknownSaveFailed": "Failed to save due to an unknown error" } } diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json index d3a3b65c9a..dbdca2bf67 100644 --- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json +++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/document.json @@ -9,12 +9,48 @@ "chunkCount": "Chunk Count", "createdAt": "Uploaded At" }, + "processing": { + "title": "Processing Information", + "status": "Status", + "sourceType": "Source Type", + "sourceUri": "Source URI", + "contentHash": "Content Hash", + "parser": "Parser", + "chunker": "Chunker", + "version": "Version", + "parentDocId": "Parent Document ID", + "indexedAt": "Indexed At", + "unknownStage": "Unknown Stage", + "noErrorMessage": "No error message", + "statuses": { + "pending": "Pending", + "parsing": "Parsing", + "chunking": "Chunking", + "embedding": "Indexing", + "ready": "Ready", + "failed": "Failed" + }, + "sourceTypes": { + "file": "File", + "url": "URL", + "import": "Import", + "api": "API" + } + }, "chunks": { "title": "Chunks", + "total": "{count} chunks", + "filteredTotal": "{filtered} / {total} matching chunks", "empty": "No chunks", "index": "Index", "content": "Content", + "titlePath": "Title Path", "charCount": "Characters", + "charCountValue": "{count} characters", + "tokenEstimate": "Estimated Tokens", + "tokenEstimateValue": "About {count} tokens", + "offset": "Offset", + "contentHash": "Content Hash", "actions": "Actions", "view": "View", "edit": "Edit", @@ -23,6 +59,7 @@ "search": "Search Chunks", "searchPlaceholder": "Enter keywords to search chunks...", "showing": "Showing", + "showingRange": "Showing {start} - {end} / {total} chunks", "deleteConfirm": "Are you sure you want to delete this chunk?", "deleteSuccess": "Chunk deleted successfully", "deleteFailed": "Failed to delete chunk" @@ -49,7 +86,39 @@ 
"index": "Index", "content": "Content", "charCount": "Characters", + "tokenEstimate": "Estimated Tokens", + "titlePath": "Title Path", + "section": "Section", + "pageNumber": "Page", + "offset": "Offset", + "contentHash": "Content Hash", + "adjacentChunks": "Adjacent Chunks", + "previousChunk": "Previous: {id}", + "nextChunk": "Next: {id}", + "parentChunk": "Parent Chunk", "vecDocId": "Vector ID", + "context": "Adjacent Context", + "previous": "Previous", + "current": "Current", + "next": "Next", + "contextMissing": "No adjacent chunk", "close": "Close" + }, + "actions": { + "retry": "Retry", + "retryRebuild": "Retry Rebuild", + "retryRebuildConfirm": "Rebuild the index for this document?" + }, + "messages": { + "loadDocumentFailed": "Failed to load document details", + "loadChunksFailed": "Failed to load chunks", + "loadChunkContextFailed": "Failed to load adjacent context", + "rebuildStarted": "Document rebuild started", + "rebuildCompleted": "Document rebuild completed", + "rebuildFailed": "Failed to rebuild document", + "rebuildFailedWithReason": "Failed to rebuild document: {reason}", + "focusChunkLoaded": "Opened the retrieved chunk", + "focusChunkFailed": "Failed to open the retrieved chunk", + "focusChunkNotFound": "Retrieved chunk not found" } } diff --git a/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json b/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json index 67bb4d5717..960edf067c 100644 --- a/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json +++ b/dashboard/src/i18n/locales/en-US/features/knowledge-base/index.json @@ -11,7 +11,9 @@ "documents": "Documents", "chunks": "Chunks", "sessionConfig": "Session Config", - "initError": "Initialization Failed" + "initError": "Initialization Failed", + "noDescription": "No description", + "switchToLegacy": "Switch to legacy knowledge base" }, "card": { "edit": "Edit", @@ -31,9 +33,12 @@ "rerankModelLabel": "Rerank Model (Optional)", "providerInfo": 
"Provider: {id} | Dimensions: {dimensions}", "rerankProviderInfo": "Provider: {id}", + "nameHint": "If you rename this knowledge base later, update any configuration that still references names.", + "embeddingModelHint": "The embedding model cannot be changed after creation. Create a new knowledge base to use another model.", "cancel": "Cancel", "submit": "Create", - "nameRequired": "Please enter knowledge base name" + "nameRequired": "Please enter knowledge base name", + "embeddingRequired": "Please select an embedding model" }, "edit": { "title": "Edit Knowledge Base", @@ -63,6 +68,7 @@ "updateFailed": "Failed to update", "deleteSuccess": "Knowledge base deleted successfully", "deleteFailed": "Failed to delete", - "loadError": "Failed to load knowledge base list" + "loadError": "Failed to load knowledge base list", + "providersLoadError": "Failed to load model providers" } } diff --git a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json index 5145d5c285..d42d2a3034 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json +++ b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/detail.json @@ -1,121 +1,362 @@ -{ - "title": "Детали базы знаний", - "backToList": "К списку", - "breadcrumb": { - "list": "Базы знаний" +{ + "title": "Детали базы знаний", + "backToList": "К списку", + "breadcrumb": { + "list": "Базы знаний" + }, + "tabs": { + "overview": "Обзор", + "documents": "Документы", + "retrieval": "Поиск", + "sessions": "Сессии", + "settings": "Настройки" + }, + "overview": { + "title": "Информация", + "name": "Название", + "description": "Описание", + "emoji": "Иконка", + "createdAt": "Создана", + "updatedAt": "Обновлена", + "stats": "Статистика", + "docCount": "Количество документов", + "chunkCount": "Количество фрагментов", + "readyDocCount": "Готовые документы", + "failedDocCount": "Ошибки документов", + "sourceFiles": "Исходные файлы", + 
"storageUsed": "Занято места", + "embeddingModel": "Embedding модель", + "rerankModel": "Rerank модель", + "notSet": "не выбрано" + }, + "consistency": { + "title": "Согласованность индекса", + "run": "Проверить", + "repair": "Исправить доступное", + "notRun": "Проверка еще не запускалась. Запустите ее, чтобы сравнить метаданные документов, исходные файлы и индексированные фрагменты.", + "notRunHint": "Полная проверка читает метаданные индекса и показывает проблемы, которые можно исправить.", + "notRunChunkMismatch": "В текущем снимке {metadata} фрагментов документов, а в индексе {indexed} фрагментов. Запустите проверку.", + "notRunFailedDocs": "Документов с ошибками: {count}. Проверьте список документов или запустите проверку согласованности.", + "healthy": "Проблем согласованности не найдено", + "unhealthy": "Найдено проблем: {count}", + "checkedAt": "Проверено: {time}", + "sqliteDocuments": "Документы в метаданных", + "indexedChunks": "Фрагменты в индексе", + "documentChunks": "Фрагменты документов", + "sourceFiles": "Исходные файлы", + "expectedChunks": "Ожидалось фрагментов: {count}", + "actualChunks": "Фактически фрагментов: {count}", + "checkSuccessHealthy": "Проверка завершена, проблем не найдено", + "checkSuccessUnhealthy": "Проверка завершена, найдено проблем: {count}", + "checkFailed": "Не удалось выполнить проверку", + "repairSuccess": "Исправление завершено: исправлено {repaired}, пропущено {skipped}", + "repairPartialSuccess": "Исправление частично завершено: исправлено {repaired}, пропущено {skipped}, ошибок {failed}", + "repairFailed": "Не удалось исправить согласованность", + "issues": { + "missingVectors": "У документов нет фрагментов в индексе", + "orphanVectors": "Фрагменты без документа", + "missingSourceFiles": "Нет исходных файлов", + "chunkCountMismatches": "Не совпадает число фрагментов", + "invalidVectorMetadata": "Ошибки метаданных индекса", + "unsafeSourcePaths": "Некорректные пути исходных файлов" }, - "tabs": { - "overview": "Обзор", - 
"documents": "Документы", - "retrieval": "Поиск", - "sessions": "Сессии", - "settings": "Настройки" + "reasons": { + "empty_file_path": "Путь к исходному файлу пуст", + "outside_kb_files_dir": "Путь к исходному файлу вне каталога базы знаний", + "not_found": "Исходный файл не найден" + } + }, + "maintenance": { + "rebuild": "Переиндексировать", + "rebuildStarted": "Переиндексация базы знаний запущена", + "rebuildSuccess": "Переиндексация базы знаний завершена", + "rebuildFailed": "Не удалось переиндексировать базу знаний", + "rebuildFailedWithReason": "Не удалось переиндексировать базу знаний: {reason}", + "rebuildPartialSuccess": "Переиндексация частично завершена: успешно {success}, ошибок {failed}", + "unknownError": "Неизвестная ошибка", + "stages": { + "waiting": "Ожидание...", + "rebuilding": "Переиндексация базы знаний...", + "parsing": "Разбор документа...", + "chunking": "Разбиение текста...", + "embedding": "Генерация векторов...", + "completed": "Завершено" + } + }, + "tasks": { + "title": "Последние задачи", + "refresh": "Обновить задачи", + "empty": "Задач пока нет", + "loadFailed": "Не удалось загрузить последние задачи", + "recentFailures": "Последние ошибки", + "noErrorMessage": "Нет сообщения об ошибке", + "resultSummary": "Всего {total}, успешно {success}, ошибок {failed}", + "progressDetail": "Прогресс {progress}", + "types": { + "upload": "Загрузка документа", + "import": "Импорт документа", + "url": "Импорт URL", + "document_rebuild": "Переиндексация документа", + "document_batch_rebuild": "Пакетная переиндексация документов", + "kb_rebuild": "Переиндексация базы знаний" }, - "overview": { - "title": "Информация", - "name": "Название", - "description": "Описание", - "emoji": "Иконка", - "createdAt": "Создана", - "updatedAt": "Обновлена", - "stats": "Статистика", - "docCount": "Количество документов", - "chunkCount": "Количество фрагментов", - "embeddingModel": "Embedding модель", - "rerankModel": "Rerank модель", - "notSet": "не выбрано" + 
"statuses": { + "pending": "Ожидание", + "processing": "В обработке", + "completed": "Завершено", + "partial_failed": "Частичная ошибка", + "failed": "Ошибка" + } + }, + "documents": { + "title": "Список документов", + "upload": "Загрузить", + "empty": "Документов нет", + "searchPlaceholder": "Поиск документов...", + "statusFilter": "Статус", + "sourceFilter": "Источник", + "allStatuses": "Все статусы", + "allSources": "Все источники", + "filteredCount": "Показано {filtered} / {total} документов", + "name": "Имя файла", + "type": "Тип", + "status": "Статус", + "size": "Размер", + "chunks": "Фрагменты", + "createdAt": "Дата загрузки", + "actions": "Действия", + "view": "Смотреть", + "copyFailure": "Копировать диагностику", + "rebuild": "Повторить индексацию", + "delete": "Удалить", + "rebuildTitle": "Переиндексировать документ", + "rebuildConfirm": "Переиндексировать документ «{name}»?", + "rebuildWarning": "Переиндексация повторно разберет документ и запишет индекс. До завершения задачи может использоваться прежний индекс.", + "batchRebuild": "Переиндексировать выбранные ({count})", + "batchRebuildTitle": "Переиндексировать выбранные документы", + "batchRebuildConfirm": "Переиндексировать выбранные документы: {count}?", + "batchRebuildMore": "Еще {count}", + "batchRebuildWarning": "Пакетная переиндексация повторно разберет выбранные документы и запишет индексы. 
До завершения задачи могут использоваться прежние индексы.", + "batchDelete": "Удалить выбранные ({count})", + "batchDeleteTitle": "Удалить выбранные документы", + "batchDeleteConfirm": "Удалить выбранные документы: {count}?", + "batchDeleteMore": "Еще {count}", + "cancel": "Отмена", + "deleteConfirm": "Вы уверены, что хотите удалить «{name}»?", + "deleteWarning": "Это удалит файл и все его фрагменты из индекса.", + "uploading": "Загрузка...", + "uploadSuccess": "Файл успешно загружен", + "uploadFailed": "Ошибка загрузки", + "loadFailed": "Не удалось загрузить документы", + "deleteSuccess": "Файл удален", + "deleteFailed": "Ошибка удаления", + "batchDeleteSuccess": "Удалено документов: {count}", + "batchDeletePartialSuccess": "Пакетное удаление частично завершено: успешно {success}, ошибок {failed}", + "batchDeleteFailed": "Не удалось удалить документы пакетом", + "batchDeleteLimitExceeded": "За один раз можно удалить не более {limit} документов", + "batchRebuildStarted": "Запущена переиндексация документов: {count}", + "batchRebuildFailed": "Не удалось переиндексировать документы пакетом", + "batchRebuildLimitExceeded": "За один раз можно переиндексировать не более {limit} документов", + "failureDocument": "Документ", + "failureDocumentId": "ID документа", + "failureStage": "Этап ошибки", + "failureMessage": "Сообщение ошибки", + "unknownFailureStage": "Неизвестный этап", + "noFailureMessage": "Нет сообщения об ошибке", + "copyFailureSuccess": "Диагностика ошибки скопирована", + "copyFailureFailed": "Не удалось скопировать диагностику ошибки", + "rebuildStarted": "Переиндексация документа запущена", + "rebuildSuccess": "Документ переиндексирован", + "rebuildFailed": "Не удалось переиндексировать документ", + "rebuildFailedWithReason": "Не удалось переиндексировать документ: {reason}", + "rebuildPartialSuccess": "Переиндексация частично завершена: успешно {success}, ошибок {failed}", + "statuses": { + "pending": "Ожидание", + "parsing": "Разбор", + "chunking": 
"Фрагментация", + "embedding": "Индексация", + "ready": "Готово", + "failed": "Ошибка" }, - "documents": { - "title": "Список документов", - "upload": "Загрузить", - "empty": "Документов нет", - "name": "Имя файла", - "type": "Тип", - "size": "Размер", - "chunks": "Фрагменты", - "createdAt": "Дата загрузки", - "actions": "Действия", - "view": "Смотреть", - "delete": "Удалить", - "deleteConfirm": "Вы уверены, что хотите удалить «{name}»?", - "deleteWarning": "Это удалит файл и все его фрагменты из индекса.", - "uploading": "Загрузка...", - "uploadSuccess": "Файл успешно загружен", - "uploadFailed": "Ошибка загрузки", - "deleteSuccess": "Файл удален", - "deleteFailed": "Ошибка удаления" + "sourceTypes": { + "file": "Файл", + "url": "URL", + "import": "Импорт" + } + }, + "upload": { + "title": "Добавление контента", + "selectFile": "Файл", + "dropzone": "Нажмите или перетащите файл сюда", + "supportedFormats": "Форматы: {formats}", + "maxSize": "Максимум: {size}", + "maxFiles": "Можно загрузить до {count} файлов", + "maxFilesWarning": "Можно выбрать не более {count} файлов", + "selectedFiles": "Выбрано файлов: {count}", + "clear": "Очистить", + "someFilesRejected": "Некоторые файлы не добавлены", + "unsupportedFile": "{name}: неподдерживаемый тип файла", + "fileTooLarge": "{name}: файл больше {size}", + "invalidSettings": "Проверьте параметры загрузки", + "chunkSettings": "Фрагментация", + "batchSettings": "Пакетная обработка", + "cleaningSettings": "Очистка данных", + "enableCleaning": "Включить очистку контента", + "cleaningProvider": "Сервис для очистки", + "cleaningProviderHint": "LLM провайдер для суммаризации и извлечения смыслов из веб-страниц", + "chunkSize": "Размер чанка", + "chunkSizeHint": "Символов в блоке (по умолчанию: {value})", + "chunkOverlap": "Перекрытие", + "chunkOverlapHint": "Перекрытие между блоками (по умолчанию: {value})", + "batchSize": "Размер пакета", + "batchSizeHint": "Блоков за один запрос (по умолчанию: {value})", + "tasksLimit": 
"Лимит задач", + "tasksLimitHint": "Макс. параллельных потоков (по умолчанию: {value})", + "maxRetries": "Попытки", + "maxRetriesHint": "Повторов при сбое (по умолчанию: {value})", + "cancel": "Отмена", + "submit": "Загрузить", + "fileRequired": "Пожалуйста, выберите файл", + "fileUpload": "Загрузка файла", + "fromUrl": "Из URL", + "urlPlaceholder": "Ссылка на веб-страницу", + "urlRequired": "Введите URL", + "urlHint": "Контент будет автоматически извлечен со страницы. Убедитесь, что сайт разрешает доступ роботам.", + "unsupportedUrlImport": "Импорт из URL не включен на сервере", + "tavilyCheckFailed": "Не удалось проверить настройки веб-поиска", + "tavilyRequired": "Для этой функции нужен Tavily Key", + "configure": "Настроить", + "tavilyConfigured": "Tavily API Key сохранен", + "backgroundUploading": "Фоновая загрузка файлов: {count}...", + "backgroundUrlUploading": "Фоновое извлечение контента из URL...", + "successCount": "Успешно загружено документов: {count}", + "partialSuccess": "Загрузка завершена: успешно {success}, ошибок {failed}", + "failedWithReason": "Ошибка загрузки: {reason}", + "unknownError": "Неизвестная ошибка", + "stages": { + "waiting": "Ожидание...", + "extracting": "Извлечение контента...", + "cleaning": "Очистка контента...", + "parsing": "Разбор документа...", + "chunking": "Разбиение текста...", + "embedding": "Генерация векторов...", + "rebuilding": "Переиндексация документа...", + "completed": "Завершено" }, - "upload": { - "title": "Добавление контента", - "selectFile": "Файл", - "dropzone": "Нажмите или перетащите файл сюда", - "supportedFormats": "Форматы: .txt, .md, .markdown, .rst, .adoc, .pdf, .docx, .epub, .xls, .xlsx", - "maxSize": "Максимум: 128MB", - "chunkSettings": "Фрагментация", - "batchSettings": "Пакетная обработка", - "cleaningSettings": "Очистка данных", - "enableCleaning": "Включить очистку контента", - "cleaningProvider": "Сервис для очистки", - "cleaningProviderHint": "LLM провайдер для суммаризации и извлечения 
смыслов из веб-страниц", - "chunkSize": "Размер чанка", - "chunkSizeHint": "Символов в блоке (по умолчанию: 512)", - "chunkOverlap": "Перекрытие", - "chunkOverlapHint": "Перекрытие между блоками (по умолчанию: 50)", - "batchSize": "Размер пакета", - "batchSizeHint": "Блоков за один запрос (по умолчанию: 32)", - "tasksLimit": "Лимит задач", - "tasksLimitHint": "Макс. параллельных потоков (по умолчанию: 3)", - "maxRetries": "Попытки", - "maxRetriesHint": "Повторов при сбое (по умолчанию: 3)", - "cancel": "Отмена", - "submit": "Загрузить", - "fileRequired": "Пожалуйста, выберите файл", - "fileUpload": "Загрузка файла", - "fromUrl": "Из URL", - "urlPlaceholder": "Ссылка на веб-страницу", - "urlRequired": "Введите URL", - "urlHint": "Контент будет автоматически извлечен со страницы. Убедитесь, что сайт разрешает доступ роботам.", - "beta": "Бета-версия" + "beta": "Бета-версия" + }, + "retrieval": { + "title": "Поиск и проверка", + "subtitle": "Проверьте качество поиска (Dense & Sparse) по вашей базе знаний", + "query": "Тестовый запрос", + "queryPlaceholder": "Что вы хотите найти?", + "search": "Найти", + "searching": "Ищем...", + "results": "Результаты поиска", + "noResults": "Релевантный контент не найден", + "tryDifferentQuery": "Попробуйте изменить формулировку запроса", + "settings": "Параметры поиска", + "debugMode": "Режим отладки", + "debugModeTsne": "Режим отладки (t-SNE)", + "traceMode": "Трассировка поиска", + "cancel": "Отмена", + "caseNotesPlaceholder": "Например: Sparse поиск дал низкий ранг", + "caseTags": "Теги", + "caseTagsPlaceholder": "Например: manual, retrieval-ui, bad-case", + "tsneVisualization": "t-SNE визуализация", + "topK": "Количество результатов", + "topKHint": "Сколько фрагментов возвращать", + "enableRerank": "Включить Rerank", + "enableRerankHint": "Применить переранжирование для повышения точности", + "score": "Вес (Score)", + "document": "Документ", + "chunk": "Фрагмент #{index}", + "content": "Текст", + "charCount": "{count} симв.", + 
"traceTitle": "Трассировка поиска", + "traceStageCount": "Этапов: {count}", + "traceHits": "Найдено: {count}", + "traceDenseRank": "Dense ранг #{rank}", + "traceSparseRank": "Sparse ранг #{rank}", + "traceDenseScore": "Оценка dense", + "traceSparseScore": "Оценка sparse", + "traceRrfScore": "Оценка RRF", + "traceRerankScore": "Оценка rerank", + "traceDuplicateOf": "Дубликат {chunk}", + "traceDedupSimilarity": "Сходство дубля {value}", + "sourcePage": "Стр. {page}", + "sourceSection": "Раздел {index}", + "sourceParentChunk": "Родительский фрагмент {id}", + "tracePreviewEmpty": "Нет предпросмотра", + "traceEmpty": "На этом этапе нет кандидатов", + "unknownDocument": "Неизвестный документ", + "traceStages": { + "dense": "Dense поиск", + "sparse": "Sparse поиск", + "fusion": "RRF объединение", + "dedup": "Удаление дублей", + "dedup_removed": "Удаленные дубли", + "rerank": "Rerank", + "final": "Итоговый контекст" }, - "retrieval": { - "title": "Поиск и проверка", - "subtitle": "Проверьте качество поиска (Dense & Sparse) по вашей базе знаний", - "query": "Тестовый запрос", - "queryPlaceholder": "Что вы хотите найти?", - "search": "Найти", - "searching": "Ищем...", - "results": "Результаты поиска", - "noResults": "Релевантный контент не найден", - "tryDifferentQuery": "Попробуйте изменить формулировку запроса", - "settings": "Параметры поиска", - "topK": "Количество результатов", - "topKHint": "Сколько фрагментов возвращать", - "enableRerank": "Включить Rerank", - "enableRerankHint": "Применить переранжирование для повышения точности", - "score": "Вес (Score)", - "document": "Документ", - "chunk": "Фрагмент #{index}", - "content": "Текст", - "charCount": "{count} симв.", - "searchSuccess": "Поиск завершен, найдено: {count}", - "searchFailed": "Ошибка выполнения поиска", - "queryRequired": "Введите поисковый запрос" + "searchSuccess": "Поиск завершен, найдено: {count}", + "searchFailed": "Ошибка выполнения поиска", + "queryRequired": "Введите поисковый запрос", + 
"latestRunResults": "Последние результаты", + "metricRecall": "Recall", + "metricNdcg": "nDCG", + "metricPrecision": "Precision", + "metricFirstHit": "Первое попадание" + }, + "settings": { + "title": "Общие настройки базы", + "basic": "Основные", + "retrieval": "Поиск", + "chunkSize": "Размер чанка", + "chunkOverlap": "Перекрытие", + "topKDense": "Вернуть (Dense)", + "topKSparse": "Вернуть (Sparse)", + "topMFinal": "Итоговый результат", + "enableRerank": "Включить Rerank", + "embeddingProvider": "Провайдер Embedding", + "rerankProvider": "Провайдер Rerank", + "embeddingProviderHint": "Embedding модель связана с текущим векторным индексом. Для смены создайте новую базу знаний.", + "indexType": "Тип индекса", + "indexTypeHint": "Flat точнее, HNSW лучше для больших баз знаний.", + "indexTypes": { + "flat": "Flat точный индекс", + "hnsw": "HNSW приближенный индекс" }, - "settings": { - "title": "Общие настройки базы", - "basic": "Основные", - "retrieval": "Поиск", - "chunkSize": "Размер чанка", - "chunkOverlap": "Перекрытие", - "topKDense": "Вернуть (Dense)", - "topKSparse": "Вернуть (Sparse)", - "topMFinal": "Итоговый результат", - "enableRerank": "Включить Rerank", - "embeddingProvider": "Провайдер Embedding", - "rerankProvider": "Провайдер Rerank", - "save": "Сохранить", - "saveSuccess": "Настройки сохранены", - "saveFailed": "Ошибка сохранения", - "tips": "Внимание! Изменение этих параметров повлияет на будущую выдачу базы знаний." - } + "save": "Сохранить", + "saveSuccess": "Настройки сохранены", + "saveFailed": "Ошибка сохранения", + "providersLoadFailed": "Не удалось загрузить провайдеры моделей", + "tips": "Внимание! Изменение этих параметров повлияет на будущую выдачу базы знаний." 
+ }, + "validation": { + "integer": "Введите целое число", + "positiveInteger": "Введите целое число больше 0", + "nonNegativeInteger": "Введите целое число не меньше 0", + "overlapLessThanSize": "Перекрытие должно быть меньше размера чанка", + "topKRange": "Количество результатов должно быть целым числом от 1 до {max}" + }, + "actions": { + "retry": "Повторить" + }, + "messages": { + "loadFailed": "Не удалось загрузить детали базы знаний" + }, + "tavily": { + "title": "Настройка Tavily API Key", + "description": "Для веб-функций базы знаний нужен Tavily API Key. Получить его можно на", + "officialSite": "сайте Tavily", + "apiKeyLabel": "Tavily API Key", + "apiKeyPlaceholder": "tvly-...", + "cancel": "Отмена", + "save": "Сохранить", + "keyRequired": "API Key обязателен", + "loadConfigFailed": "Не удалось загрузить текущую конфигурацию", + "saveFailed": "Не удалось сохранить. Проверьте ключ.", + "unknownSaveFailed": "Не удалось сохранить из-за неизвестной ошибки" + } } diff --git a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json index 7fcb30ee9f..4f391e4e93 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json +++ b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/document.json @@ -1,55 +1,124 @@ { - "title": "Просмотр документа", - "backToKB": "К базе знаний", - "info": { - "title": "Информация о документе", - "name": "Имя файла", - "type": "Формат", - "size": "Размер", - "chunkCount": "Количество фрагментов", - "createdAt": "Загружен" + "title": "Просмотр документа", + "backToKB": "К базе знаний", + "info": { + "title": "Информация о документе", + "name": "Имя файла", + "type": "Формат", + "size": "Размер", + "chunkCount": "Количество фрагментов", + "createdAt": "Загружен" + }, + "processing": { + "title": "Информация обработки", + "status": "Статус", + "sourceType": "Тип источника", + "sourceUri": "Источник", + "contentHash": "Хэш 
контента", + "parser": "Парсер", + "chunker": "Разбиение", + "version": "Версия", + "parentDocId": "ID родительского документа", + "indexedAt": "Индексирован", + "unknownStage": "Неизвестный этап", + "noErrorMessage": "Нет сообщения об ошибке", + "statuses": { + "pending": "Ожидание", + "parsing": "Разбор", + "chunking": "Фрагментация", + "embedding": "Индексация", + "ready": "Готово", + "failed": "Ошибка" }, - "chunks": { - "title": "Фрагменты текста", - "empty": "Фрагменты не найдены", - "index": "Индекс", - "content": "Текст", - "charCount": "Символов", - "actions": "Действия", - "view": "Детали", - "edit": "Изменить", - "delete": "Удалить", - "preview": "Обзор", - "search": "Поиск по документу", - "searchPlaceholder": "Найти во фрагментах...", - "showing": "Показано", - "deleteConfirm": "Удалить этот фрагмент?", - "deleteSuccess": "Фрагмент удален", - "deleteFailed": "Ошибка удаления" - }, - "edit": { - "title": "Редактирование фрагмента", - "content": "Текст", - "cancel": "Отмена", - "save": "Сохранить", - "saveSuccess": "Фрагмент обновлен", - "saveFailed": "Ошибка сохранения" - }, - "delete": { - "title": "Удаление", - "confirmText": "Вы уверены?", - "warning": "Удаление фрагмента может ухудшить качество ответов AI по этой теме.", - "cancel": "Отмена", - "confirm": "Удалить", - "deleteSuccess": "Удаление выполнено", - "deleteFailed": "Ошибка удаления" - }, - "view": { - "title": "Детальный просмотр", - "index": "Индекс", - "content": "Текст", - "charCount": "Символов", - "vecDocId": "ID вектора", - "close": "Закрыть" + "sourceTypes": { + "file": "Файл", + "url": "URL", + "import": "Импорт", + "api": "API" } -} \ No newline at end of file + }, + "chunks": { + "title": "Фрагменты текста", + "total": "Фрагментов: {count}", + "filteredTotal": "Найдено {filtered} / {total} фрагм.", + "empty": "Фрагменты не найдены", + "index": "Индекс", + "content": "Текст", + "titlePath": "Путь заголовков", + "charCount": "Символов", + "charCountValue": "{count} симв.", + 
"tokenEstimate": "Оценка токенов", + "tokenEstimateValue": "Около {count} ток.", + "offset": "Позиция", + "contentHash": "Хэш контента", + "actions": "Действия", + "view": "Детали", + "edit": "Изменить", + "delete": "Удалить", + "preview": "Обзор", + "search": "Поиск по документу", + "searchPlaceholder": "Найти во фрагментах...", + "showing": "Показано", + "showingRange": "Показано {start} - {end} / {total} фрагм.", + "deleteConfirm": "Удалить этот фрагмент?", + "deleteSuccess": "Фрагмент удален", + "deleteFailed": "Ошибка удаления" + }, + "edit": { + "title": "Редактирование фрагмента", + "content": "Текст", + "cancel": "Отмена", + "save": "Сохранить", + "saveSuccess": "Фрагмент обновлен", + "saveFailed": "Ошибка сохранения" + }, + "delete": { + "title": "Удаление", + "confirmText": "Вы уверены?", + "warning": "Удаление фрагмента может ухудшить качество ответов AI по этой теме.", + "cancel": "Отмена", + "confirm": "Удалить", + "deleteSuccess": "Удаление выполнено", + "deleteFailed": "Ошибка удаления" + }, + "view": { + "title": "Детальный просмотр", + "index": "Индекс", + "content": "Текст", + "charCount": "Символов", + "tokenEstimate": "Оценка токенов", + "titlePath": "Путь заголовков", + "section": "Раздел", + "pageNumber": "Страница", + "offset": "Позиция", + "contentHash": "Хэш контента", + "adjacentChunks": "Соседние фрагменты", + "previousChunk": "Предыдущий: {id}", + "nextChunk": "Следующий: {id}", + "parentChunk": "Родительский фрагмент", + "vecDocId": "ID вектора", + "context": "Соседний контекст", + "previous": "Предыдущий", + "current": "Текущий", + "next": "Следующий", + "contextMissing": "Соседний фрагмент отсутствует", + "close": "Закрыть" + }, + "actions": { + "retry": "Повторить", + "retryRebuild": "Повторить индексацию", + "retryRebuildConfirm": "Переиндексировать этот документ?" 
+ }, + "messages": { + "loadDocumentFailed": "Не удалось загрузить документ", + "loadChunksFailed": "Не удалось загрузить фрагменты", + "loadChunkContextFailed": "Не удалось загрузить соседний контекст", + "rebuildStarted": "Переиндексация документа запущена", + "rebuildCompleted": "Переиндексация документа завершена", + "rebuildFailed": "Не удалось переиндексировать документ", + "rebuildFailedWithReason": "Не удалось переиндексировать документ: {reason}", + "focusChunkLoaded": "Открыт найденный фрагмент", + "focusChunkFailed": "Не удалось открыть найденный фрагмент", + "focusChunkNotFound": "Найденный фрагмент не найден" + } +} diff --git a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json index 4eb99d5f06..ca7f5e26ed 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json +++ b/dashboard/src/i18n/locales/ru-RU/features/knowledge-base/index.json @@ -1,68 +1,74 @@ { - "title": "Управление базами знаний", - "subtitle": "Централизованное управление всеми знаниями AstrBot", - "list": { - "title": "Базы знаний", - "subtitle": "Все доступные коллекции знаний", - "create": "Создать базу", - "refresh": "Обновить", - "empty": "Баз знаний пока нет", - "loading": "Загрузка...", - "documents": "док.", - "chunks": "фрагм.", - "sessionConfig": "Профиль", - "initError": "Ошибка инициализации" - }, - "card": { - "edit": "Изменить", - "delete": "Удалить", - "open": "Открыть", - "docCount": "Документов: {count}", - "chunkCount": "Фрагментов: {count}" - }, - "create": { - "title": "Создание базы знаний", - "nameLabel": "Название", - "namePlaceholder": "Придумайте имя для базы", - "descriptionLabel": "Описание", - "descriptionPlaceholder": "Для чего нужна эта база?", - "emojiLabel": "Иконка", - "embeddingModelLabel": "Embedding модель", - "rerankModelLabel": "Rerank модель (опционально)", - "providerInfo": "Провайдер: {id} | Размерность: {dimensions}", - 
"rerankProviderInfo": "Провайдер: {id}", - "cancel": "Отмена", - "submit": "Создать", - "nameRequired": "Введите название базы знаний" - }, - "edit": { - "title": "Редактирование", - "submit": "Сохранить" - }, - "delete": { - "title": "Удаление", - "confirmText": "Вы уверены, что хотите удалить базу знаний «{name}»?", - "warning": "Это действие необратимо. Все документы, фрагменты и настройки будут навсегда удалены.", - "cancel": "Отмена", - "confirm": "Удалить" - }, - "emoji": { - "title": "Выберите иконку", - "close": "Закрыть", - "categories": { - "books": "Книги и документы", - "emotions": "Эмоции", - "objects": "Вещи", - "symbols": "Символы" - } - }, - "messages": { - "createSuccess": "База знаний создана", - "createFailed": "Ошибка создания", - "updateSuccess": "Обновлено успешно", - "updateFailed": "Ошибка обновления", - "deleteSuccess": "Удалено успешно", - "deleteFailed": "Ошибка удаления", - "loadError": "Не удалось загрузить список" + "title": "Управление базами знаний", + "subtitle": "Централизованное управление всеми знаниями AstrBot", + "list": { + "title": "Базы знаний", + "subtitle": "Все доступные коллекции знаний", + "create": "Создать базу", + "refresh": "Обновить", + "empty": "Баз знаний пока нет", + "loading": "Загрузка...", + "documents": "док.", + "chunks": "фрагм.", + "sessionConfig": "Профиль", + "initError": "Ошибка инициализации", + "noDescription": "Нет описания", + "switchToLegacy": "Перейти к старой базе знаний" + }, + "card": { + "edit": "Изменить", + "delete": "Удалить", + "open": "Открыть", + "docCount": "Документов: {count}", + "chunkCount": "Фрагментов: {count}" + }, + "create": { + "title": "Создание базы знаний", + "nameLabel": "Название", + "namePlaceholder": "Придумайте имя для базы", + "descriptionLabel": "Описание", + "descriptionPlaceholder": "Для чего нужна эта база?", + "emojiLabel": "Иконка", + "embeddingModelLabel": "Embedding модель", + "rerankModelLabel": "Rerank модель (опционально)", + "providerInfo": "Провайдер: 
{id} | Размерность: {dimensions}", + "rerankProviderInfo": "Провайдер: {id}", + "nameHint": "Если позже переименуете базу, обновите конфигурации, где она указана по имени.", + "embeddingModelHint": "Embedding модель нельзя изменить после создания. Для другой модели создайте новую базу.", + "cancel": "Отмена", + "submit": "Создать", + "nameRequired": "Введите название базы знаний", + "embeddingRequired": "Выберите embedding модель" + }, + "edit": { + "title": "Редактирование", + "submit": "Сохранить" + }, + "delete": { + "title": "Удаление", + "confirmText": "Вы уверены, что хотите удалить базу знаний «{name}»?", + "warning": "Это действие необратимо. Все документы, фрагменты и настройки будут навсегда удалены.", + "cancel": "Отмена", + "confirm": "Удалить" + }, + "emoji": { + "title": "Выберите иконку", + "close": "Закрыть", + "categories": { + "books": "Книги и документы", + "emotions": "Эмоции", + "objects": "Вещи", + "symbols": "Символы" } + }, + "messages": { + "createSuccess": "База знаний создана", + "createFailed": "Ошибка создания", + "updateSuccess": "Обновлено успешно", + "updateFailed": "Ошибка обновления", + "deleteSuccess": "Удалено успешно", + "deleteFailed": "Ошибка удаления", + "loadError": "Не удалось загрузить список", + "providersLoadError": "Не удалось загрузить провайдеры моделей" + } } diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json index 54bc60b7a7..987e91fa18 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json +++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/detail.json @@ -21,36 +21,183 @@ "stats": "统计信息", "docCount": "文档数量", "chunkCount": "分块数量", + "readyDocCount": "已索引文档", + "failedDocCount": "失败文档", + "sourceFiles": "源文件", + "storageUsed": "存储占用", "embeddingModel": "嵌入模型", "rerankModel": "重排序模型", "notSet": "未设置" }, + "consistency": { + "title": "索引一致性", + "run": "运行检查", + "repair": "修复可修复项", 
+ "notRun": "尚未运行一致性检查。点击运行检查可诊断文档元数据、源文件和索引文本块是否一致。", + "notRunHint": "完整检查会读取索引元数据,并列出可修复项。", + "notRunChunkMismatch": "当前快照显示文档记录有 {metadata} 个分块,索引中有 {indexed} 个分块,建议运行检查。", + "notRunFailedDocs": "当前有 {count} 个失败文档,建议查看文档列表或运行一致性检查。", + "healthy": "未发现一致性问题", + "unhealthy": "发现 {count} 个一致性问题", + "checkedAt": "检查时间: {time}", + "sqliteDocuments": "元数据文档", + "indexedChunks": "索引分块", + "documentChunks": "文档分块", + "sourceFiles": "源文件", + "expectedChunks": "预期 {count} 个分块", + "actualChunks": "实际 {count} 个分块", + "checkSuccessHealthy": "一致性检查完成,未发现问题", + "checkSuccessUnhealthy": "一致性检查完成,发现 {count} 个问题", + "checkFailed": "一致性检查失败", + "repairSuccess": "一致性修复完成: 修复 {repaired} 项, 跳过 {skipped} 项", + "repairPartialSuccess": "一致性修复部分完成: 修复 {repaired} 项, 跳过 {skipped} 项, 失败 {failed} 项", + "repairFailed": "一致性修复失败", + "issues": { + "missingVectors": "文档缺失索引分块", + "orphanVectors": "孤儿索引分块", + "missingSourceFiles": "源文件缺失", + "chunkCountMismatches": "分块数量不一致", + "invalidVectorMetadata": "索引元数据异常", + "unsafeSourcePaths": "源文件路径异常" + }, + "reasons": { + "empty_file_path": "源文件路径为空", + "outside_kb_files_dir": "源文件路径不在知识库目录内", + "not_found": "源文件不存在" + } + }, + "maintenance": { + "rebuild": "重建索引", + "rebuildStarted": "知识库重建任务已开始", + "rebuildSuccess": "知识库重建完成", + "rebuildFailed": "知识库重建失败", + "rebuildFailedWithReason": "知识库重建失败: {reason}", + "rebuildPartialSuccess": "知识库重建部分完成: 成功 {success} 个, 失败 {failed} 个", + "unknownError": "未知错误", + "stages": { + "waiting": "等待中...", + "rebuilding": "重建知识库...", + "parsing": "解析文档...", + "chunking": "文本分块...", + "embedding": "生成向量...", + "completed": "已完成" + } + }, + "tasks": { + "title": "最近任务", + "refresh": "刷新任务", + "empty": "暂无任务记录", + "loadFailed": "加载最近任务失败", + "recentFailures": "最近失败", + "noErrorMessage": "暂无错误信息", + "resultSummary": "共 {total} 个,成功 {success} 个,失败 {failed} 个", + "progressDetail": "进度 {progress}", + "types": { + "upload": "上传文档", + "import": "导入文档", + "url": "URL 导入", + "document_rebuild": "文档重建", + 
"document_batch_rebuild": "批量文档重建", + "kb_rebuild": "知识库重建" + }, + "statuses": { + "pending": "等待中", + "processing": "处理中", + "completed": "已完成", + "partial_failed": "部分失败", + "failed": "失败" + } + }, "documents": { "title": "文档列表", "upload": "上传文档", "empty": "暂无文档", + "searchPlaceholder": "搜索文档...", + "statusFilter": "状态", + "sourceFilter": "来源", + "allStatuses": "全部状态", + "allSources": "全部来源", + "filteredCount": "显示 {filtered} / {total} 个文档", "name": "文档名称", "type": "类型", + "status": "状态", "size": "大小", "chunks": "分块数", "createdAt": "上传时间", "actions": "操作", "view": "查看", + "copyFailure": "复制失败诊断", + "rebuild": "重试重建", "delete": "删除", + "rebuildTitle": "重建文档索引", + "rebuildConfirm": "确定要重新构建文档「{name}」的索引吗?", + "rebuildWarning": "重建会重新解析并写入索引。任务完成前,旧索引仍可能被检索到。", + "batchRebuild": "批量重建 ({count})", + "batchRebuildTitle": "批量重建文档索引", + "batchRebuildConfirm": "确定要重新构建选中的 {count} 个文档索引吗?", + "batchRebuildMore": "还有 {count} 个", + "batchRebuildWarning": "批量重建会为选中文档重新解析并写入索引。任务完成前,旧索引仍可能被检索到。", + "batchDelete": "批量删除 ({count})", + "batchDeleteTitle": "批量删除文档", + "batchDeleteConfirm": "确定要删除选中的 {count} 个文档吗?", + "batchDeleteMore": "还有 {count} 个", + "cancel": "取消", "deleteConfirm": "确定要删除文档「{name}」吗?", "deleteWarning": "此操作将删除文档及其所有分块,不可恢复。", "uploading": "正在上传...", "uploadSuccess": "文档上传成功", "uploadFailed": "文档上传失败", + "loadFailed": "加载文档列表失败", "deleteSuccess": "文档删除成功", - "deleteFailed": "文档删除失败" + "deleteFailed": "文档删除失败", + "batchDeleteSuccess": "已删除 {count} 个文档", + "batchDeletePartialSuccess": "批量删除部分完成: 成功 {success} 个, 失败 {failed} 个", + "batchDeleteFailed": "批量删除文档失败", + "batchDeleteLimitExceeded": "单次最多只能删除 {limit} 个文档", + "batchRebuildStarted": "已开始重建 {count} 个文档", + "batchRebuildFailed": "批量重建文档失败", + "batchRebuildLimitExceeded": "单次最多只能重建 {limit} 个文档", + "failureDocument": "文档", + "failureDocumentId": "文档 ID", + "failureStage": "失败阶段", + "failureMessage": "错误信息", + "unknownFailureStage": "未知阶段", + "noFailureMessage": "暂无错误信息", + "copyFailureSuccess": "已复制失败诊断信息", + 
"copyFailureFailed": "复制失败诊断信息失败", + "rebuildStarted": "文档重建任务已开始", + "rebuildSuccess": "文档重建成功", + "rebuildFailed": "文档重建失败", + "rebuildFailedWithReason": "文档重建失败: {reason}", + "rebuildPartialSuccess": "文档重建部分成功: 成功 {success} 个, 失败 {failed} 个", + "statuses": { + "pending": "等待中", + "parsing": "解析中", + "chunking": "分块中", + "embedding": "索引中", + "ready": "已索引", + "failed": "失败" + }, + "sourceTypes": { + "file": "文件", + "url": "URL", + "import": "导入" + } }, "upload": { "title": "上传文档", "selectFile": "选择文件", "dropzone": "拖放文件到这里或点击选择", - "supportedFormats": "支持的格式: .txt, .md, .markdown, .rst, .adoc, .pdf, .docx, .epub, .xls, .xlsx", - "maxSize": "最大文件大小: 128MB", + "supportedFormats": "支持的格式: {formats}", + "maxSize": "最大文件大小: {size}", + "maxFiles": "最多可上传 {count} 个文件", + "maxFilesWarning": "最多只能选择 {count} 个文件", + "selectedFiles": "已选择 {count} 个文件", + "clear": "清空", + "someFilesRejected": "部分文件未加入上传队列", + "unsupportedFile": "{name}: 不支持的文件类型", + "fileTooLarge": "{name}: 文件超过 {size}", + "invalidSettings": "请检查上传参数", "chunkSettings": "分块设置", "batchSettings": "批处理设置", "cleaningSettings": "清洗设置", @@ -58,15 +205,15 @@ "cleaningProvider": "清洗服务提供商", "cleaningProviderHint": "选择一个 LLM 服务商来对提取的网页内容进行清洗和总结", "chunkSize": "分块大小", - "chunkSizeHint": "每个文本块的字符数 (默认: 512)", + "chunkSizeHint": "每个文本块的字符数 (默认: {value})", "chunkOverlap": "分块重叠", - "chunkOverlapHint": "相邻文本块之间的重叠字符数 (默认: 50)", + "chunkOverlapHint": "相邻文本块之间的重叠字符数 (默认: {value})", "batchSize": "批处理大小", - "batchSizeHint": "每批处理的文本块数量 (默认: 32)", + "batchSizeHint": "每批处理的文本块数量 (默认: {value})", "tasksLimit": "并发任务限制", - "tasksLimitHint": "最大并发上传任务数 (默认: 3)", + "tasksLimitHint": "最大并发上传任务数 (默认: {value})", "maxRetries": "最大重试次数", - "maxRetriesHint": "上传失败任务的重试次数 (默认: 3)", + "maxRetriesHint": "上传失败任务的重试次数 (默认: {value})", "cancel": "取消", "submit": "上传", "fileRequired": "请选择要上传的文件", @@ -75,6 +222,27 @@ "urlPlaceholder": "请输入要提取内容的网页 URL", "urlRequired": "请输入 URL", "urlHint": "将自动从目标 URL 提取主要内容作为文档。目前支持 {supported} 
页面,请确保目标网页允许爬虫访问。", + "unsupportedUrlImport": "当前后端未启用 URL 导入功能", + "tavilyCheckFailed": "检查网页搜索配置失败", + "tavilyRequired": "使用此功能需要配置 Tavily Key", + "configure": "配置", + "tavilyConfigured": "Tavily API Key 配置成功", + "backgroundUploading": "正在后台上传 {count} 个文件...", + "backgroundUrlUploading": "正在从 URL 后台提取内容...", + "successCount": "成功上传 {count} 个文档", + "partialSuccess": "上传完成: {success} 个成功, {failed} 个失败", + "failedWithReason": "上传失败: {reason}", + "unknownError": "未知错误", + "stages": { + "waiting": "等待中...", + "extracting": "提取内容...", + "cleaning": "清洗内容...", + "parsing": "解析文档...", + "chunking": "文本分块...", + "embedding": "生成向量...", + "rebuilding": "重建文档...", + "completed": "已完成" + }, "beta": "测试版" }, "retrieval": { @@ -88,6 +256,14 @@ "noResults": "没有找到相关内容", "tryDifferentQuery": "尝试使用不同的查询词", "settings": "检索设置", + "debugMode": "调试模式", + "debugModeTsne": "调试模式 (t-SNE)", + "traceMode": "检索链路追踪", + "cancel": "取消", + "caseNotesPlaceholder": "例如:稀疏检索排名偏低", + "caseTags": "标签", + "caseTagsPlaceholder": "例如:manual, retrieval-ui, bad-case", + "tsneVisualization": "t-SNE 可视化", "topK": "返回结果数量", "topKHint": "最多返回多少条检索结果", "enableRerank": "启用重排序", @@ -97,9 +273,40 @@ "chunk": "文本块 #{index}", "content": "内容", "charCount": "{count} 字符", + "traceTitle": "检索链路", + "traceStageCount": "{count} 个阶段", + "traceHits": "{count} 条", + "traceDenseRank": "稠密排名 #{rank}", + "traceSparseRank": "稀疏排名 #{rank}", + "traceDenseScore": "稠密分", + "traceSparseScore": "稀疏分", + "traceRrfScore": "RRF 分", + "traceRerankScore": "重排分", + "traceDuplicateOf": "重复于 {chunk}", + "traceDedupSimilarity": "重复相似度 {value}", + "sourcePage": "第 {page} 页", + "sourceSection": "章节 {index}", + "sourceParentChunk": "父文本块 {id}", + "tracePreviewEmpty": "暂无内容预览", + "traceEmpty": "该阶段没有候选结果", + "unknownDocument": "未知文档", + "traceStages": { + "dense": "稠密召回", + "sparse": "稀疏召回", + "fusion": "RRF 融合", + "dedup": "近重复去除", + "dedup_removed": "已移除重复项", + "rerank": "重排序", + "final": "最终上下文" + }, "searchSuccess": "检索完成,找到 {count} 条结果", 
"searchFailed": "检索失败", - "queryRequired": "请输入检索查询" + "queryRequired": "请输入检索查询", + "latestRunResults": "最近结果", + "metricRecall": "召回率", + "metricNdcg": "归一化折损累计增益 (nDCG)", + "metricPrecision": "精确率", + "metricFirstHit": "首个命中" }, "settings": { "title": "知识库设置", @@ -113,9 +320,43 @@ "enableRerank": "启用重排序", "embeddingProvider": "嵌入模型提供商", "rerankProvider": "重排序模型提供商", + "embeddingProviderHint": "嵌入模型与现有向量索引绑定,如需更换请创建新的知识库。", + "indexType": "索引类型", + "indexTypeHint": "Flat 更精确,HNSW 更适合大规模知识库。", + "indexTypes": { + "flat": "Flat 精确索引", + "hnsw": "HNSW 近似索引" + }, "save": "保存设置", "saveSuccess": "设置保存成功", "saveFailed": "设置保存失败", + "providersLoadFailed": "加载模型提供商失败", "tips": "提示: 修改检索设置后,将影响后续的知识库查询效果。" + }, + "validation": { + "integer": "请输入整数", + "positiveInteger": "请输入大于 0 的整数", + "nonNegativeInteger": "请输入不小于 0 的整数", + "overlapLessThanSize": "分块重叠必须小于分块大小", + "topKRange": "返回结果数量必须是 1 到 {max} 的整数" + }, + "actions": { + "retry": "重试" + }, + "messages": { + "loadFailed": "加载知识库详情失败" + }, + "tavily": { + "title": "配置 Tavily API Key", + "description": "为了使用基于网页的知识库功能,需要提供 Tavily API Key。您可以从", + "officialSite": "Tavily 官网", + "apiKeyLabel": "Tavily API Key", + "apiKeyPlaceholder": "tvly-...", + "cancel": "取消", + "save": "保存", + "keyRequired": "API Key 不能为空", + "loadConfigFailed": "获取当前配置失败", + "saveFailed": "保存失败,请检查 Key 是否正确", + "unknownSaveFailed": "保存失败,发生未知错误" } } diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json index c90c29cc29..6127213d92 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json +++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/document.json @@ -9,12 +9,48 @@ "chunkCount": "分块数量", "createdAt": "上传时间" }, + "processing": { + "title": "处理信息", + "status": "状态", + "sourceType": "来源类型", + "sourceUri": "来源地址", + "contentHash": "内容哈希", + "parser": "解析器", + "chunker": "分块器", + "version": "版本", + "parentDocId": 
"父文档 ID", + "indexedAt": "索引时间", + "unknownStage": "未知阶段", + "noErrorMessage": "暂无错误信息", + "statuses": { + "pending": "等待中", + "parsing": "解析中", + "chunking": "分块中", + "embedding": "索引中", + "ready": "已索引", + "failed": "失败" + }, + "sourceTypes": { + "file": "文件", + "url": "URL", + "import": "导入", + "api": "API" + } + }, "chunks": { "title": "分块列表", + "total": "{count} 个分块", + "filteredTotal": "匹配 {filtered} / {total} 个分块", "empty": "暂无分块", "index": "序号", "content": "内容", + "titlePath": "标题路径", "charCount": "字符数", + "charCountValue": "{count} 字符", + "tokenEstimate": "估算 Token", + "tokenEstimateValue": "约 {count} token", + "offset": "位置", + "contentHash": "内容哈希", "actions": "操作", "view": "查看", "edit": "编辑", @@ -23,6 +59,7 @@ "search": "搜索分块", "searchPlaceholder": "输入关键词搜索分块内容...", "showing": "显示", + "showingRange": "显示 {start} - {end} / {total} 个分块", "deleteConfirm": "确定要删除该文本块吗?", "deleteSuccess": "文本块删除成功", "deleteFailed": "文本块删除失败" @@ -49,7 +86,39 @@ "index": "序号", "content": "内容", "charCount": "字符数", + "tokenEstimate": "估算 Token", + "titlePath": "标题路径", + "section": "章节", + "pageNumber": "页码", + "offset": "位置", + "contentHash": "内容哈希", + "adjacentChunks": "相邻分块", + "previousChunk": "上一块: {id}", + "nextChunk": "下一块: {id}", + "parentChunk": "父分块", "vecDocId": "向量ID", + "context": "相邻上下文", + "previous": "上一块", + "current": "当前块", + "next": "下一块", + "contextMissing": "暂无相邻分块", "close": "关闭" + }, + "actions": { + "retry": "重试", + "retryRebuild": "重试重建", + "retryRebuildConfirm": "确定要重新构建该文档索引吗?" 
+ }, + "messages": { + "loadDocumentFailed": "加载文档详情失败", + "loadChunksFailed": "加载分块列表失败", + "loadChunkContextFailed": "加载相邻上下文失败", + "rebuildStarted": "文档重建任务已开始", + "rebuildCompleted": "文档重建完成", + "rebuildFailed": "文档重建失败", + "rebuildFailedWithReason": "文档重建失败: {reason}", + "focusChunkLoaded": "已打开检索命中的分块", + "focusChunkFailed": "打开检索命中的分块失败", + "focusChunkNotFound": "未找到检索命中的分块" } } diff --git a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json index cac88bacd1..87d74926db 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json +++ b/dashboard/src/i18n/locales/zh-CN/features/knowledge-base/index.json @@ -11,7 +11,9 @@ "documents": "文档", "chunks": "分块", "sessionConfig": "会话配置", - "initError": "初始化失败" + "initError": "初始化失败", + "noDescription": "暂无描述", + "switchToLegacy": "切换到旧版知识库" }, "card": { "edit": "编辑", @@ -27,13 +29,16 @@ "descriptionLabel": "描述", "descriptionPlaceholder": "简单描述这个知识库的用途...", "emojiLabel": "图标", - "embeddingModelLabel": "嵌入模型 (Embedding Model)", - "rerankModelLabel": "重排序模型 (Rerank Model, 可选)", + "embeddingModelLabel": "嵌入模型", + "rerankModelLabel": "重排序模型(可选)", "providerInfo": "提供商: {id} | 维度: {dimensions}", "rerankProviderInfo": "提供商: {id}", + "nameHint": "如果后续修改知识库名称,请同步更新仍按名称引用的配置。", + "embeddingModelHint": "嵌入模型选择后无法修改,如需更换请创建新的知识库。", "cancel": "取消", "submit": "创建", - "nameRequired": "请输入知识库名称" + "nameRequired": "请输入知识库名称", + "embeddingRequired": "请选择嵌入模型" }, "edit": { "title": "编辑知识库", @@ -63,6 +68,7 @@ "updateFailed": "更新失败", "deleteSuccess": "知识库删除成功", "deleteFailed": "删除失败", - "loadError": "加载知识库列表失败" + "loadError": "加载知识库列表失败", + "providersLoadError": "加载模型提供商失败" } } diff --git a/dashboard/src/main.ts b/dashboard/src/main.ts index ce5514207c..eb2f15c205 100644 --- a/dashboard/src/main.ts +++ b/dashboard/src/main.ts @@ -2,7 +2,7 @@ import { createApp } from 'vue'; import { createPinia } from 'pinia'; import App from 
'./App.vue'; import { router } from './router'; -import vuetify from './plugins/vuetify'; +import vuetify, { getVuetifyLocale } from './plugins/vuetify'; import confirmPlugin from './plugins/confirmPlugin'; import { setupI18n } from './i18n/composables'; import '@/scss/style.scss'; @@ -47,12 +47,18 @@ import { waitForRouterReadyInBackground } from './utils/routerReadiness.mjs'; }, }; +const syncVuetifyLocale = (event: Event) => { + const locale = (event as CustomEvent<{ locale?: string }>).detail?.locale; + vuetify.locale.current.value = getVuetifyLocale(locale); +}; + // 初始化新的i18n系统,等待完成后再挂载应用 setupI18n().then(async () => { console.log('🌍 新i18n系统初始化完成'); - + const app = createApp(App); const pinia = createPinia(); + window.addEventListener('astrbot-locale-changed', syncVuetifyLocale); app.use(pinia); app.use(router); app.use(print); @@ -86,6 +92,7 @@ setupI18n().then(async () => { // 即使i18n初始化失败,也要挂载应用(使用回退机制) const app = createApp(App); const pinia = createPinia(); + window.addEventListener('astrbot-locale-changed', syncVuetifyLocale); app.use(pinia); app.use(router); app.use(print); diff --git a/dashboard/src/plugins/vuetify.ts b/dashboard/src/plugins/vuetify.ts index e38fd388e6..474f1ca02c 100644 --- a/dashboard/src/plugins/vuetify.ts +++ b/dashboard/src/plugins/vuetify.ts @@ -1,32 +1,91 @@ import { createVuetify } from 'vuetify'; +import { en, ru, zhHans } from 'vuetify/locale'; import '@/assets/mdi-subset/materialdesignicons-subset.css'; import * as components from 'vuetify/components'; import * as directives from 'vuetify/directives'; import { PurpleTheme } from '@/theme/LightTheme'; -import { PurpleThemeDark } from "@/theme/DarkTheme"; +import { PurpleThemeDark } from '@/theme/DarkTheme'; + +const zhHansMessages = { + ...zhHans, + open: '打开', + dismiss: '关闭', + dataFooter: { + ...zhHans.dataFooter, + itemsPerPageText: '每页条数:', + firstPage: '第一页', + lastPage: '最后一页', + }, + input: { + ...zhHans.input, + clear: '清空 {0}', + prependAction: '{0} 前置操作', + 
appendAction: '{0} 后置操作', + otp: '请输入第 {0} 位验证码', + }, + pagination: { + ...zhHans.pagination, + ariaLabel: { + ...zhHans.pagination.ariaLabel, + first: '第一页', + last: '最后一页', + }, + }, + stepper: { + next: '下一步', + prev: '上一步', + }, + loading: '加载中...', +}; + +const vuetifyLocaleMap: Record<string, string> = { + 'zh-CN': 'zhHans', + 'en-US': 'en', + 'ru-RU': 'ru', +}; + +export const getVuetifyLocale = (locale?: string | null) => { + if (!locale) { + return 'zhHans'; + } + return vuetifyLocaleMap[locale] || 'zhHans'; +}; export default createVuetify({ components, directives, + locale: { + locale: getVuetifyLocale( + typeof localStorage === 'undefined' + ? null + : localStorage.getItem('astrbot-locale'), + ), + fallback: 'en', + messages: { + en, + ru, + zhHans: zhHansMessages, + }, + }, theme: { defaultTheme: 'PurpleTheme', themes: { PurpleTheme, - PurpleThemeDark - } + PurpleThemeDark, + }, }, defaults: { VBtn: {}, VCard: { - rounded: 'lg' + rounded: 'lg', }, VTextField: { - rounded: 'lg' + rounded: 'lg', }, VTooltip: { // set v-tooltip default location to top - location: 'top' - } - } + location: 'top', + }, + }, }); diff --git a/dashboard/src/views/knowledge-base/DocumentDetail.vue b/dashboard/src/views/knowledge-base/DocumentDetail.vue index 921315e627..212c848e09 100644 --- a/dashboard/src/views/knowledge-base/DocumentDetail.vue +++ b/dashboard/src/views/knowledge-base/DocumentDetail.vue @@ -9,7 +9,9 @@ />

{{ document.doc_name }}

-

{{ t('title') }}

+

+ {{ t("title") }} +

@@ -18,18 +20,29 @@ + +
+ {{ loadError }} + + {{ t("actions.retry") }} + +
+
+
- {{ t('info.title') }} + {{ t("info.title") }}
mdi-label
-
{{ t('info.name') }}
+
+ {{ t("info.name") }} +
{{ document.doc_name }}
@@ -40,8 +53,10 @@ {{ getFileIcon(document.file_type) }}
-
{{ t('info.type') }}
-
{{ document.file_type || '-' }}
+
+ {{ t("info.type") }} +
+
{{ document.file_type || "-" }}
@@ -49,8 +64,12 @@
mdi-file-chart
-
{{ t('info.size') }}
-
{{ formatFileSize(document.file_size) }}
+
+ {{ t("info.size") }} +
+
+ {{ formatFileSize(document.file_size) }} +
@@ -58,7 +77,9 @@
mdi-text-box
-
{{ t('info.chunkCount') }}
+
+ {{ t("info.chunkCount") }} +
{{ document.chunk_count || 0 }}
@@ -67,8 +88,12 @@
mdi-calendar
-
{{ t('info.createdAt') }}
-
{{ formatDate(document.created_at) }}
+
+ {{ t("info.createdAt") }} +
+
+ {{ formatDate(document.created_at) }} +
@@ -76,88 +101,325 @@ + + {{ t("processing.title") }} + + + +
+ + {{ getDocumentStatusIcon(document.status) }} + +
+
+ {{ t("processing.status") }} +
+ + {{ getDocumentStatusText(document.status) }} + +
+
+
+ +
+ mdi-source-branch +
+
+ {{ t("processing.sourceType") }} +
+
+ {{ getSourceTypeText(document.source_type) }} +
+
+
+
+ +
+ mdi-counter +
+
+ {{ t("processing.version") }} +
+
+ {{ document.version || 1 }} +
+
+
+
+ +
+ mdi-calendar-check +
+
+ {{ t("processing.indexedAt") }} +
+
+ {{ formatDate(document.indexed_at) }} +
+
+
+
+ +
+ mdi-link-variant + +
+
+ +
+ mdi-fingerprint + +
+
+ +
+ mdi-file-cog-outline + +
+
+ +
+ mdi-text-box-check-outline + +
+
+ +
+ mdi-file-replace-outline + +
+
+
+ +
+ + + {{ t("actions.retryRebuild") }} + +
+
+
+
+ - - {{ t('chunks.title') }} - - {{ totalChunks }} {{ t('chunks.title') }} - - - + /> - - - - +
+ + - + - + - - - + + + + + + + + + + + + +
-
+
- {{ t('chunks.showing') }} {{ (page - 1) * pageSize + 1 }} - {{ Math.min(page * pageSize, totalChunks) }} / {{ totalChunks }} + {{ + t("chunks.showingRange", { + start: (page - 1) * pageSize + 1, + end: Math.min(page * pageSize, totalChunks), + total: totalChunks, + }) + }}
-
+
- - - - {{ t('view.title') }} + + + + {{ t("view.title") }} - + - - - - - {{ t('view.index') }} - #{{ (selectedChunk?.chunk_index || 0) + 1 }} - - - - - {{ t('view.charCount') }} - {{ selectedChunk?.char_count || 0 }} 字符 - - - - - {{ t('view.vecDocId') }} - {{ selectedChunk?.chunk_id || '-' }} - - + +
+
+ + {{ field.icon }} + +
+
{{ field.label }}
+
+ {{ field.value }} +
+
+
+
-
{{ t('view.content') }}
+
+ {{ t("view.content") }} +
{{ selectedChunk?.content }}
+ +
+
+ {{ t("view.context") }} +
+ + +
+
+
+
+ + {{ slot.label }} + + + {{ formatContextMeta(slot.chunk) }} + +
+
+ {{ slot.chunk?.content || t("view.contextMissing") }} +
+
+
- {{ t('view.close') }} + {{ t("view.close") }}
@@ -233,197 +539,736 @@ diff --git a/dashboard/src/views/knowledge-base/KBDetail.vue b/dashboard/src/views/knowledge-base/KBDetail.vue index b1570bad48..2bd1834b84 100644 --- a/dashboard/src/views/knowledge-base/KBDetail.vue +++ b/dashboard/src/views/knowledge-base/KBDetail.vue @@ -5,129 +5,528 @@
+ +
+ {{ loadError }} + + {{ t("actions.retry") }} + +
+
+
mdi-information-outline - {{ t('tabs.overview') }} + {{ t("tabs.overview") }} mdi-file-document-multiple - {{ t('tabs.documents') }} - {{ kb.doc_count || 0 }} + {{ t("tabs.documents") }} + {{ + documentCount + }} mdi-magnify - {{ t('tabs.retrieval') }} + {{ t("tabs.retrieval") }} mdi-cog - {{ t('tabs.settings') }} + {{ t("tabs.settings") }} - + - - - - {{ t('overview.title') }} + + + + {{ t("overview.title") }} - {{ t('overview.name') }} - {{ kb.kb_name }} + {{ + t("overview.name") + }} + {{ + kb.kb_name + }} - {{ t('overview.description') }} - {{ kb.description }} + {{ + t("overview.description") + }} + {{ + kb.description + }} - {{ t('overview.emoji') }} - {{ kb.emoji || '📚' }} + {{ + t("overview.emoji") + }} + {{ + kb.emoji || "📚" + }} - {{ t('overview.createdAt') }} - {{ formatDate(kb.created_at) }} + {{ + t("overview.createdAt") + }} + {{ + formatDate(kb.created_at) + }} - {{ t('overview.updatedAt') }} - {{ formatDate(kb.updated_at) }} + {{ + t("overview.updatedAt") + }} + {{ + formatDate(kb.updated_at) + }} + + + + + {{ + t("overview.embeddingModel") + }} + {{ + kb.embedding_provider_id || t("overview.notSet") + }} + + + + + {{ + t("overview.rerankModel") + }} + {{ + kb.rerank_provider_id || t("overview.notSet") + }} - - - {{ t('overview.stats') }} + + + {{ t("overview.stats") }} - - + + +
+ mdi-file-document +
{{ documentCount }}
+
+ {{ t("overview.docCount") }} +
+
+
+
- mdi-file-document -
{{ kb.doc_count || 0 }}
-
{{ t('overview.docCount') }}
+ mdi-text-box +
{{ indexedChunkCount }}
+
+ {{ t("overview.chunkCount") }} +
- +
- mdi-text-box -
{{ kb.chunk_count || 0 }}
-
{{ t('overview.chunkCount') }}
+ mdi-check-circle-outline +
{{ readyDocumentCount }}
+
+ {{ t("overview.readyDocCount") }} +
+
+
+ +
+ mdi-alert-circle-outline +
{{ failedDocumentCount }}
+
+ {{ t("overview.failedDocCount") }} +
+
+
+ +
+ mdi-folder +
{{ sourceFileCount }}
+
+ {{ t("overview.sourceFiles") }} +
+
+
+ +
+ mdi-database +
+ {{ formatFileSize(storageBytes) }} +
+
+ {{ t("overview.storageUsed") }} +
+
- - {{ t('overview.embeddingModel') }} + + + + {{ t("consistency.title") }} +
+ + {{ t("consistency.repair") }} + + + {{ t("maintenance.rebuild") }} + + + {{ t("consistency.run") }} + +
+
- - - - {{ t('overview.embeddingModel') }} - {{ kb.embedding_provider_id || t('overview.notSet') }} - + +
+ {{ + getMaintenanceStageText(kbRebuildProgress.stage) + }} + + {{ kbRebuildProgress.current }} / + {{ kbRebuildProgress.total }} + +
+ +
- - - {{ t('overview.rerankModel') }} - {{ kb.rerank_provider_id || t('overview.notSet') }} - -
+ +
+ + {{ + consistencyReport.summary.healthy + ? t("consistency.healthy") + : t("consistency.unhealthy", { + count: consistencyIssueCount, + }) + }} + + + {{ + t("consistency.checkedAt", { + time: formatDate(consistencyReport.checked_at), + }) + }} + +
+
+ + +
+ {{ consistencyPrecheckMessage }} + + {{ t("consistency.notRunHint") }} + +
+
+ + + +
+
+ {{ consistencyReport.summary.sqlite_document_count }} +
+
+ {{ t("consistency.sqliteDocuments") }} +
+
+
+ +
+
+ {{ consistencyReport.summary.indexed_chunk_count }} +
+
+ {{ t("consistency.indexedChunks") }} +
+
+
+ +
+
+ {{ consistencyReport.summary.document_chunk_count }} +
+
+ {{ t("consistency.documentChunks") }} +
+
+
+ +
+
+ {{ consistencyReport.summary.source_file_count }} +
+
+ {{ t("consistency.sourceFiles") }} +
+
+
+
+ + + + +
+ + mdi-alert-circle-outline + + {{ t(issueType.labelKey) }} + + {{ consistencyReport.summary[issueType.key] || 0 }} + +
+
+ + + + + + {{ formatConsistencyIssueTitle(issue) }} + + + {{ formatConsistencyIssueDetail(issue) }} + + + + +
+
+ + +
+ + + {{ t("tasks.title") }} + + + + + + {{ recentTasksLoadError }} + + + {{ t("tasks.empty") }} + + + + + +
+
@@ -138,7 +537,7 @@ - + @@ -152,86 +551,657 @@ {{ snackbar.text }} +
diff --git a/dashboard/src/views/knowledge-base/KBList.vue b/dashboard/src/views/knowledge-base/KBList.vue index c8abb57cab..1462ff05f4 100644 --- a/dashboard/src/views/knowledge-base/KBList.vue +++ b/dashboard/src/views/knowledge-base/KBList.vue @@ -2,7 +2,7 @@
-

{{ t('list.loading') }}

+

{{ t("list.loading") }}

@@ -14,7 +14,7 @@ @click="navigateToDetail(kb.kb_id)" > -
- {{ kb.description || '暂无描述' }} +
+ {{ kb.description || t("list.noDescription") }}
-
- mdi-close-circle - {{ t('list.initError') }} -
-
{{ kb.init_error }}
+
+ mdi-close-circle + {{ t("list.initError") }} +
+
+ {{ kb.init_error }} +
-
- mdi-file-document - {{ kb.doc_count || 0 }} {{ t('list.documents') }} -
-
- mdi-text-box - {{ kb.chunk_count || 0 }} {{ t('list.chunks') }} -
+
+ mdi-file-document + + {{ getListStats(kb).documentCount }} {{ t("list.documents") }} + +
+
+ mdi-text-box + + {{ getListStats(kb).chunkCount }} {{ t("list.chunks") }} + +
diff --git a/dashboard/src/views/knowledge-base/components/SettingsTab.vue b/dashboard/src/views/knowledge-base/components/SettingsTab.vue index 7d18c305a9..d26496da6b 100644 --- a/dashboard/src/views/knowledge-base/components/SettingsTab.vue +++ b/dashboard/src/views/knowledge-base/components/SettingsTab.vue @@ -1,12 +1,12 @@ diff --git a/dashboard/src/views/knowledge-base/components/TavilyKeyDialog.vue b/dashboard/src/views/knowledge-base/components/TavilyKeyDialog.vue index 37cf9df8c9..f9ff255d43 100644 --- a/dashboard/src/views/knowledge-base/components/TavilyKeyDialog.vue +++ b/dashboard/src/views/knowledge-base/components/TavilyKeyDialog.vue @@ -2,30 +2,38 @@ - 配置 Tavily API Key + {{ t("tavily.title") }}

- 为了使用基于网页的知识库功能,需要提供 Tavily API Key。您可以从 Tavily 官网 获取。 + {{ t("tavily.description") }} + {{ + t("tavily.officialSite") + }}

- 取消 + {{ t("tavily.cancel") }} - - 保存 + + {{ t("tavily.save") }}
@@ -33,77 +41,86 @@ \ No newline at end of file +}; + diff --git a/dashboard/src/views/knowledge-base/index.vue b/dashboard/src/views/knowledge-base/index.vue index 13df70e6fb..bada54f21d 100644 --- a/dashboard/src/views/knowledge-base/index.vue +++ b/dashboard/src/views/knowledge-base/index.vue @@ -56,7 +56,7 @@ const goToList = () => {