diff --git a/.env.example b/.env.example index 03cb4b24..e0c8e8d7 100644 --- a/.env.example +++ b/.env.example @@ -112,7 +112,13 @@ EMBEDDING_TIMEOUT=30 # EMBEDDING_BASE_URL=http://localhost:8000/v1 # ======================================== -# Milvus Configuration +# Vector Backend Configuration +# ======================================== +# Supported values: milvus, pgvector +VECTOR_BACKEND=milvus + +# ======================================== +# Milvus Configuration (VECTOR_BACKEND=milvus) # ======================================== # Milvus host MILVUS_HOST=localhost @@ -128,6 +134,16 @@ MILVUS_PORT=19530 MILVUS_ENTRIES_COLLECTION=entries MILVUS_PREFS_COLLECTION=user_preferences +# ======================================== +# pgvector Configuration (VECTOR_BACKEND=pgvector) +# ======================================== +# Optional override. Falls back to DATABASE_URL when empty. +# Note: pgvector uses its own async pool even when it points at DATABASE_URL. +PGVECTOR_DATABASE_URL= +PGVECTOR_ENTRIES_TABLE=entry_embeddings +PGVECTOR_PREFS_TABLE=user_preference_vectors +PGVECTOR_METADATA_TABLE=vector_store_metadata + # ======================================== # Preference Configuration # ======================================== diff --git a/.github/workflows/ci-backend.yml b/.github/workflows/ci-backend.yml index 8ef43535..a1608b3d 100644 --- a/.github/workflows/ci-backend.yml +++ b/.github/workflows/ci-backend.yml @@ -48,7 +48,7 @@ jobs: runs-on: ubuntu-latest services: postgres: - image: postgres:16-alpine + image: pgvector/pgvector:pg16 env: POSTGRES_DB: glean_test POSTGRES_USER: glean diff --git a/CLAUDE.md b/CLAUDE.md index 12cfcc0e..f5ebfcc3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,6 +38,9 @@ The project includes multiple Docker Compose configurations for different use ca # Basic deployment (without admin dashboard) docker compose up -d +# pgvector backend deployment +docker compose -f docker-compose.pgvector.yml up -d + # Full deployment with admin dashboard 
docker compose --profile admin up -d @@ -55,11 +58,15 @@ IMAGE_TAG=v0.3.0-alpha.1 docker compose up -d # Start development infrastructure (PostgreSQL, Redis, Milvus) docker compose -f docker-compose.dev.yml up -d +# Start development infrastructure (PostgreSQL with pgvector, Redis) +docker compose -f docker-compose.dev.pgvector.yml up -d + # View logs docker compose -f docker-compose.dev.yml logs -f # Stop services docker compose -f docker-compose.dev.yml down +docker compose -f docker-compose.dev.pgvector.yml down ``` ### Local Development with Override diff --git a/DEPLOY.md b/DEPLOY.md index 4f8b2afd..8b35a148 100644 --- a/DEPLOY.md +++ b/DEPLOY.md @@ -19,13 +19,13 @@ This guide provides comprehensive instructions for deploying Glean in production ## Quick Deployment -### Full Deployment (Recommended) +### Using pgvector (Recommended) -Includes Milvus for Phase 3 features (smart recommendations, preference learning): +Uses PostgreSQL with the pgvector extension for vector storage. No additional infrastructure required: ```bash -# Download docker-compose.yml -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml +# Download pgvector compose +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.pgvector.yml -o docker-compose.yml # (Optional) Create .env file to customize admin credentials cat > .env << EOF @@ -50,13 +50,18 @@ docker compose up -d - Password: `Admin123!` - ⚠️ **Change this password in production!** -### Lite Deployment (Without Milvus) +**Next steps**: +1. Log in to the admin dashboard at http://localhost:3001 +2. Change the default password +3. 
Configure additional environment variables for production (see [Environment Configuration](#environment-configuration)) + +### Using Milvus -For lighter deployments if you don't need Phase 3 features: +Uses a dedicated Milvus vector database for vector storage: ```bash -# Download lite version -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.lite.yml -o docker-compose.yml +# Download docker-compose.yml +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml # (Optional) Create .env file to customize admin credentials cat > .env << EOF @@ -65,24 +70,22 @@ ADMIN_PASSWORD=$(openssl rand -base64 24) SECRET_KEY=$(openssl rand -base64 32) EOF -# ⚠️ IMPORTANT: Save the generated passwords +# ⚠️ IMPORTANT: Save the generated passwords before proceeding! cat .env -# Start services +# Start all services docker compose up -d + +# Access: +# - Web App: http://localhost +# - Admin Dashboard: http://localhost:3001 (default: admin / Admin123!) ``` **Default admin account**: If you don't create a `.env` file, the default credentials are: - Username: `admin` - Password: `Admin123!` -- Dashboard: http://localhost:3001 - ⚠️ **Change this password in production!** -**Next steps**: -1. Log in to admin dashboard at http://localhost:3001 -2. Change the default password -3. Configure additional environment variables for production (see [Environment Configuration](#environment-configuration)) - ### Testing Pre-release Versions Pre-release versions (alpha/beta/rc) are available for testing upcoming features: @@ -233,9 +236,43 @@ docker compose logs backend | grep "Admin Account Created" ## Service Architecture -### Full Deployment +### Using pgvector (Recommended) + +Uses PostgreSQL with the pgvector extension for vector storage (6 services total). Use `docker-compose.pgvector.yml` for this configuration. 
+ +**Services:** + +| Service | Container Name | Description | Dependencies | +| ---------- | -------------- | ----------------------------------- | ------------------ | +| postgres | glean-postgres | PostgreSQL 16 with pgvector | - | +| redis | glean-redis | Redis 8 for task queue | - | +| backend | glean-backend | FastAPI REST API server | postgres, redis | +| worker | glean-worker | arq background worker (feed sync) | postgres, redis | +| web | glean-web | React web frontend (nginx) | backend | +| admin | glean-admin | Admin dashboard (nginx) | backend | + +**Data persistence:** +- `postgres_data` - PostgreSQL database files (including vector data) +- `redis_data` - Redis persistence (AOF) +- `glean_logs` - Application logs (backend + worker) + +**Networking:** +- All services communicate via `glean-network` bridge network +- Only `web` (port 80) and `admin` (port 3001) are exposed to host + +**Recommended pgvector index:** -Glean consists of 9 services orchestrated by Docker Compose: +Run this after migrations if you expect non-trivial similarity-search volume. Adjust the table name if you customized `PGVECTOR_ENTRIES_TABLE`. + +```sql +CREATE INDEX IF NOT EXISTS idx_entry_embeddings_embedding_hnsw +ON entry_embeddings USING hnsw (embedding vector_cosine_ops) +WITH (m = 16, ef_construction = 64); +``` + +### Using Milvus + +Uses a dedicated Milvus vector database (9 services total). Use `docker-compose.yml` for this configuration. **Core services:** @@ -248,7 +285,7 @@ Glean consists of 9 services orchestrated by Docker Compose: | web | glean-web | React web frontend (nginx) | backend | | admin | glean-admin | Admin dashboard (nginx) | backend | -**Milvus services (Phase 3 features):** +**Milvus services:** | Service | Container Name | Description | Dependencies | | ------------- | ------------------- | ------------------------------ | ------------------ | @@ -263,10 +300,6 @@ Glean consists of 9 services orchestrated by Docker Compose: 4. 
`web` and `admin` start after backend is ready 5. `milvus-etcd` and `milvus-minio` start in parallel, then `milvus` -### Lite Deployment - -Excludes Milvus services (6 services total). Use `docker-compose.lite.yml` for this configuration. - **Data persistence:** - `postgres_data` - PostgreSQL database files - `redis_data` - Redis persistence (AOF) @@ -317,16 +350,20 @@ Excludes Milvus services (6 services total). Use `docker-compose.lite.yml` for t | `LOG_RETENTION` | `30 days` | Log retention period | | `LOG_COMPRESSION` | `gz` | Log compression format | -### Milvus Configuration (Phase 3 Features) +### Vector Backend Configuration -Milvus is optional and provides vector database capabilities for smart recommendations and preference learning. +Two vector backends are supported: -**Enable Milvus:** -```bash -docker compose --profile milvus up -d -``` +- `pgvector` (in `docker-compose.pgvector.yml`, **recommended** — no extra infrastructure needed) +- `milvus` (in `docker-compose.yml` — for users who prefer a dedicated vector database) + +**Backend selector:** + +| Variable | Default | Description | +| ---------------- | -------- | --------------------------------------------- | +| `VECTOR_BACKEND` | `milvus` | Vector backend (`pgvector` or `milvus`) | -**Milvus connection settings:** +**Milvus connection settings (VECTOR_BACKEND=milvus):** | Variable | Default | Description | | ------------------------- | ----------- | --------------------------------- | @@ -337,9 +374,20 @@ docker compose --profile milvus up -d | `MILVUS_ENTRIES_COLLECTION` | `entries` | Collection name for entry vectors | | `MILVUS_PREFS_COLLECTION` | `user_preferences` | Collection name for user preferences | -### Embedding Configuration (Phase 3 Features) +**pgvector settings (VECTOR_BACKEND=pgvector):** + +| Variable | Default | Description | +| --------------------------- | -------------------------- | -------------------------------------------------------- | +| `PGVECTOR_DATABASE_URL` | - | 
Optional override, falls back to `DATABASE_URL` | +| `PGVECTOR_ENTRIES_TABLE` | `entry_embeddings` | Table name for entry vectors | +| `PGVECTOR_PREFS_TABLE` | `user_preference_vectors` | Table name for user preference vectors | +| `PGVECTOR_METADATA_TABLE` | `vector_store_metadata` | Table name for model signature metadata | + +When `PGVECTOR_DATABASE_URL` is left empty, Glean connects pgvector to `DATABASE_URL` using its own async pool. That is fine for most installs, but count it separately when tuning PostgreSQL connection limits. + +### Embedding Configuration -Required when using Milvus for smart recommendations: +Required for preference learning and smart recommendations: | Variable | Default | Description | | ---------------------- | ------------------------ | ------------------------------------------------ | diff --git a/DEPLOY.zh-CN.md b/DEPLOY.zh-CN.md index a6180ef4..f392a946 100644 --- a/DEPLOY.zh-CN.md +++ b/DEPLOY.zh-CN.md @@ -19,17 +19,16 @@ ## 快速部署 -### 完整部署(推荐) +### 使用 pgvector(推荐) -包含 Milvus,支持 Phase 3 功能(智能推荐、偏好学习): +使用 PostgreSQL 内置的 pgvector 扩展进行向量存储,无需额外的基础设施: ```bash -# 下载 docker-compose.yml -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml +# 下载 pgvector 版本 +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.pgvector.yml -o docker-compose.yml # 创建 .env 文件并配置管理员凭据(可选但推荐) cat > .env << EOF -CREATE_ADMIN=true ADMIN_USERNAME=admin ADMIN_PASSWORD=$(openssl rand -base64 24) SECRET_KEY=$(openssl rand -base64 32) @@ -43,38 +42,49 @@ docker compose up -d # 访问: # - Web 应用: http://localhost -# - 管理后台: http://localhost:3001 +# - 管理后台: http://localhost:3001(默认:admin / Admin123!) ``` -**重要提示**:请在继续之前安全地保存您的管理员凭据。它们仅在启动时显示一次。 +**默认管理员账号**:如果未创建 `.env` 文件,默认凭据为: +- 用户名:`admin` +- 密码:`Admin123!` +- ⚠️ **生产环境请修改此密码!** + +**后续步骤:** +1. 访问管理后台:http://localhost:3001 +2. 修改默认密码 +3. 
配置生产环境的其他环境变量(参见[环境配置](#环境配置)) -### 精简部署(不含 Milvus) +### 使用 Milvus -如果不需要 Phase 3 功能,可以使用精简版: +使用独立的 Milvus 向量数据库进行向量存储: ```bash -# 下载精简版 -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.lite.yml -o docker-compose.yml +# 下载 docker-compose.yml +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml -# (可选)创建 .env 文件并配置管理员凭据 +# 创建 .env 文件并配置管理员凭据(可选但推荐) cat > .env << EOF -CREATE_ADMIN=true ADMIN_USERNAME=admin ADMIN_PASSWORD=$(openssl rand -base64 24) SECRET_KEY=$(openssl rand -base64 32) EOF -# ⚠️ 重要:保存生成的密码 +# ⚠️ 重要:在继续之前保存生成的密码! cat .env -# 启动服务 +# 启动所有服务 docker compose up -d + +# 访问: +# - Web 应用: http://localhost +# - 管理后台: http://localhost:3001(默认:admin / Admin123!) ``` -**后续步骤:** -1. 如果未使用自动创建,请手动创建管理员账号:`docker exec -it glean-backend /app/scripts/create-admin-docker.sh` -2. 访问管理后台:http://localhost:3001 -3. 配置生产环境的其他环境变量(参见[环境配置](#环境配置)) +**默认管理员账号**:如果未创建 `.env` 文件,默认凭据为: +- 用户名:`admin` +- 密码:`Admin123!` +- ⚠️ **生产环境请修改此密码!** ### 测试预发布版本 @@ -227,9 +237,33 @@ docker compose logs backend | grep "Admin Account Created" ## 服务架构 -### 完整部署 +### 使用 pgvector(推荐) + +使用带 pgvector 扩展的 PostgreSQL 进行向量存储(共 6 个服务)。使用 `docker-compose.pgvector.yml` 配置。 + +**服务:** + +| 服务 | 容器名称 | 说明 | 依赖关系 | +| ---------- | -------------- | ------------------------------ | --------------- | +| postgres | glean-postgres | PostgreSQL 16(含 pgvector) | - | +| redis | glean-redis | Redis 8 任务队列 | - | +| backend | glean-backend | FastAPI REST API 服务器 | postgres, redis | +| worker | glean-worker | arq 后台工作进程(订阅源同步) | postgres, redis | +| web | glean-web | React Web 前端(nginx) | backend | +| admin | glean-admin | 管理后台(nginx) | backend | + +**数据持久化:** +- `postgres_data` - PostgreSQL 数据库文件(包含向量数据) +- `redis_data` - Redis 持久化(AOF) +- `glean_logs` - 应用日志(backend + worker) + +**网络:** +- 所有服务通过 `glean-network` 桥接网络通信 +- 仅 `web`(端口 80)和 `admin`(端口 3001)暴露到宿主机 + +### 使用 Milvus -Glean 由 Docker Compose 编排的 9 个服务组成: +使用独立的 
Milvus 向量数据库(共 9 个服务)。使用 `docker-compose.yml` 配置。 **核心服务:** @@ -242,7 +276,7 @@ Glean 由 Docker Compose 编排的 9 个服务组成: | web | glean-web | React Web 前端(nginx) | backend | | admin | glean-admin | 管理后台(nginx) | backend | -**Milvus 服务(Phase 3 功能):** +**Milvus 服务:** | 服务 | 容器名称 | 说明 | 依赖关系 | | ------------- | ------------------- | -------------------------- | ------------------ | @@ -257,10 +291,6 @@ Glean 由 Docker Compose 编排的 9 个服务组成: 4. `web` 和 `admin` 在后端就绪后启动 5. `milvus-etcd` 和 `milvus-minio` 并行启动,然后是 `milvus` -### 精简部署 - -不包含 Milvus 服务(共 6 个服务)。使用 `docker-compose.lite.yml` 配置。 - **数据持久化:** - `postgres_data` - PostgreSQL 数据库文件 - `redis_data` - Redis 持久化(AOF) @@ -311,16 +341,29 @@ Glean 由 Docker Compose 编排的 9 个服务组成: | `LOG_RETENTION` | `30 days` | 日志保留期限 | | `LOG_COMPRESSION` | `gz` | 日志压缩格式 | -### Milvus 配置(Phase 3 功能) +### 向量后端配置 -Milvus 是可选的,提供向量数据库功能用于智能推荐和偏好学习。 +支持两种向量后端: -**启用 Milvus:** -```bash -docker compose --profile milvus up -d -``` +- `pgvector`(`docker-compose.pgvector.yml`,**推荐** — 无需额外基础设施) +- `milvus`(`docker-compose.yml` — 适合偏好独立向量数据库的用户) + +**后端选择器:** + +| 变量 | 默认值 | 说明 | +| ---------------- | -------- | --------------------------------- | +| `VECTOR_BACKEND` | `milvus` | 向量后端(`pgvector` 或 `milvus`)| + +**pgvector 配置(VECTOR_BACKEND=pgvector):** + +| 变量 | 默认值 | 说明 | +| --------------------------- | ------------------------ | ----------------------------------- | +| `PGVECTOR_DATABASE_URL` | - | 可选覆盖,默认回退到 `DATABASE_URL` | +| `PGVECTOR_ENTRIES_TABLE` | `entry_embeddings` | 文章向量表名 | +| `PGVECTOR_PREFS_TABLE` | `user_preference_vectors`| 用户偏好向量表名 | +| `PGVECTOR_METADATA_TABLE` | `vector_store_metadata` | 模型签名元数据表名 | -**Milvus 连接设置:** +**Milvus 连接设置(VECTOR_BACKEND=milvus):** | 变量 | 默认值 | 说明 | | --------------------------- | --------------- | ------------------------------ | @@ -331,9 +374,9 @@ docker compose --profile milvus up -d | `MILVUS_ENTRIES_COLLECTION` | `entries` | 文章向量集合名称 | | `MILVUS_PREFS_COLLECTION` | `user_preferences` | 用户偏好集合名称 | -### 
Embedding 配置(Phase 3 功能) +### Embedding 配置 -使用 Milvus 进行智能推荐时需要配置: +偏好学习和智能推荐功能所需配置: | 变量 | 默认值 | 说明 | | ---------------------- | ----------------------- | ------------------------------------------------- | diff --git a/README.md b/README.md index e8a01327..dc681976 100644 --- a/README.md +++ b/README.md @@ -36,13 +36,15 @@ A self-hosted RSS reader and personal knowledge management tool. ## Quick Start -### One-Command Deployment +### Using pgvector (Recommended) + +Uses PostgreSQL with the pgvector extension — no extra infrastructure required: ```bash -# Download docker-compose.yml -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml +# Download pgvector compose +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.pgvector.yml -o docker-compose.yml -# Start Glean (full deployment with Milvus) +# Start Glean docker compose up -d # Access: @@ -55,16 +57,20 @@ docker compose up -d - Password: `Admin123!` - ⚠️ **Change this password in production!** -**Lite Deployment** (without Milvus, if you don't need Phase 3 features): +### Using Milvus + +For deployments that prefer a dedicated Milvus vector database: ```bash -# Download lite version -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.lite.yml -o docker-compose.yml +# Download docker-compose.yml +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml # Start Glean docker compose up -d -# Admin Dashboard: http://localhost:3001 (default: admin/Admin123!) +# Access: +# - Web App: http://localhost +# - Admin Dashboard: http://localhost:3001 (default: admin/Admin123!) 
``` ### Customize Admin Account (Optional) @@ -156,16 +162,15 @@ See available pre-release versions on the [Releases page](https://github.com/Les ## Deployment -The default deployment includes all services (full version): +Both deployments include the same core services: - **Web App** (port 80) - Main user interface - **Admin Dashboard** (port 3001) - User management and system monitoring - **Backend API** - FastAPI server - **Worker** - Background task processor (feed fetching, cleanup) - **PostgreSQL** - Database - **Redis** - Task queue -- **Milvus** - Vector database for smart recommendations and preference learning (Phase 3) -**Lite deployment** (without Milvus) is also available using `docker-compose.lite.yml`. +The difference is the vector backend: `docker-compose.pgvector.yml` uses PostgreSQL's pgvector extension (recommended), while `docker-compose.yml` uses a dedicated Milvus vector database. For detailed deployment instructions and configuration, see [DEPLOY.md](DEPLOY.md). @@ -235,17 +240,16 @@ Hooks automatically run on commit and check: | ------------------------- | --------- | -------------------------------------------------------------- | | **Phase 1: MVP** | ✅ Done | User system, RSS subscription, reader, admin dashboard | | **Phase 2: Organization** | ✅ Done | Bookmarks, folders, tags, read later | -| **Phase 3: Preferences** | 🚧 WIP | Embedding pipeline, preference learning, smart recommendations | +| **Phase 3: Preferences** | ✅ Done | Embedding pipeline, preference learning, smart recommendations | | **Phase 4: Rules** | 📋 Planned | Rule engine, Jinja2 conditions, automated actions | | **Phase 5: AI** | 📋 Planned | AI summaries, auto-tagging, keyword extraction, BYOK support | | **Phase 6: Extensions** | 📋 Planned | Chrome extension, PWA, web snapshots | -See **[Product Requirements](./docs/glean-prd-v1.2.md)** for detailed feature specifications. 
## Documentation - **[Development Guide](./DEVELOPMENT.md)** - Set up your development environment -- **[Deployment Guide](./deploy/README.md)** - Production deployment details +- **[Deployment Guide](./DEPLOY.md)** - Production deployment details ## Contributing diff --git a/README.zh-CN.md b/README.zh-CN.md index 14c46066..c3e0f654 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -36,13 +36,15 @@ ## 快速开始 -### 一键部署 +### 使用 pgvector(推荐) + +使用 PostgreSQL 内置的 pgvector 扩展,无需额外基础设施: ```bash -# 下载 docker-compose.yml -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml +# 下载 pgvector 版本 +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.pgvector.yml -o docker-compose.yml -# 启动 Glean(完整部署,包含 Milvus) +# 启动 Glean docker compose up -d # 访问: @@ -55,16 +57,20 @@ docker compose up -d - 密码:`Admin123!` - ⚠️ **生产环境请立即修改此密码!** -**精简部署**(不包含 Milvus,如果不需要 Phase 3 功能): +### 使用 Milvus + +适合偏好独立 Milvus 向量数据库的用户: ```bash -# 下载精简版 -curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.lite.yml -o docker-compose.yml +# 下载 docker-compose.yml +curl -fsSL https://raw.githubusercontent.com/LeslieLeung/glean/main/docker-compose.yml -o docker-compose.yml # 启动 Glean docker compose up -d -# 管理后台: http://localhost:3001(默认:admin/Admin123!) +# 访问: +# - Web 应用: http://localhost +# - 管理后台: http://localhost:3001(默认:admin/Admin123!) 
``` ### 自定义管理员账号(可选) @@ -156,16 +162,15 @@ IMAGE_TAG=v0.3.0-alpha.1 docker compose up -d ## 部署 -默认部署包含所有服务(完整版): +两种部署方案包含相同的核心服务: - **Web 应用**(端口 80)- 主用户界面 - **管理后台**(端口 3001)- 用户管理和系统监控 - **后端 API** - FastAPI 服务器 - **Worker** - 后台任务处理器(订阅源抓取、清理) - **PostgreSQL** - 数据库 - **Redis** - 任务队列 -- **Milvus** - 向量数据库,用于智能推荐和偏好学习(Phase 3) -**精简部署**(不包含 Milvus)也可使用 `docker-compose.lite.yml`。 +区别在于向量存储:`docker-compose.pgvector.yml` 使用 PostgreSQL 的 pgvector 扩展(推荐),`docker-compose.yml` 使用独立的 Milvus 向量数据库。 详细的部署说明和配置请参见 [DEPLOY.zh-CN.md](DEPLOY.zh-CN.md)。 @@ -212,12 +217,11 @@ make dev-all | --------------------- | -------- | ---------------------------------------- | | **Phase 1: MVP** | ✅ 完成 | 用户系统、RSS 订阅、阅读器、管理后台 | | **Phase 2: 内容组织** | ✅ 完成 | 收藏、文件夹、标签、稍后阅读 | -| **Phase 3: 偏好系统** | 🚧 进行中 | Embedding 管线、偏好学习、智能推荐 | +| **Phase 3: 偏好系统** | ✅ 完成 | Embedding 管线、偏好学习、智能推荐 | | **Phase 4: 规则引擎** | 📋 计划中 | 规则引擎、Jinja2 条件、自动化动作 | | **Phase 5: AI 功能** | 📋 计划中 | AI 摘要、自动打标、关键词提取、BYOK 支持 | | **Phase 6: 扩展功能** | 📋 计划中 | Chrome 扩展、PWA、网页快照 | -详细功能规格请参阅 **[产品需求文档](./docs/glean-prd-v1.2.md)**。 ## 文档 diff --git a/backend/apps/api/glean_api/dependencies.py b/backend/apps/api/glean_api/dependencies.py index aa095d6e..f069ebf3 100644 --- a/backend/apps/api/glean_api/dependencies.py +++ b/backend/apps/api/glean_api/dependencies.py @@ -213,13 +213,14 @@ async def get_preference_service( async def get_score_service( + request: Request, session: Annotated[AsyncSession, Depends(get_session)], ) -> object | None: """ Get score service instance for real-time preference scoring. 
Returns: - - ScoreService if vectorization is enabled and Milvus is available + - ScoreService if vectorization is enabled and vector backend is available - SimpleScoreService if vectorization is disabled - None if there's an error """ @@ -239,16 +240,16 @@ async def get_score_service( # Vectorization enabled - try to use vector scoring try: - from glean_vector.clients.milvus_client import MilvusClient from glean_vector.services.score_service import ScoreService - milvus_client = MilvusClient() - milvus_client.connect() - await milvus_client.ensure_collections(config.dimension, config.provider, config.model) + vector_client = getattr(request.app.state, "vector_client", None) + if vector_client is None: + return SimpleScoreService(session) - return ScoreService(db_session=session, milvus_client=milvus_client) + await vector_client.ensure_collections(config.dimension, config.provider, config.model) + return ScoreService(db_session=session, vector_client=vector_client) except Exception: - # Milvus not available, fall back to simple scoring + # Vector backend not available, fall back to simple scoring return SimpleScoreService(session) diff --git a/backend/apps/api/glean_api/main.py b/backend/apps/api/glean_api/main.py index 9d165b7d..5cdeb027 100644 --- a/backend/apps/api/glean_api/main.py +++ b/backend/apps/api/glean_api/main.py @@ -20,6 +20,8 @@ from fastapi.middleware.cors import CORSMiddleware from glean_core import get_logger, init_logging +from glean_vector.clients import create_vector_store_client +from glean_vector.config import vector_backend_config from .config import settings from .mcp import create_mcp_server @@ -108,7 +110,9 @@ def create_app( @asynccontextmanager async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: - from glean_database.session import init_database + from glean_core.schemas.config import EmbeddingConfig, VectorizationStatus + from glean_core.services import TypedConfigService + from glean_database.session import 
get_session_context, init_database logger.info(f"Starting Glean API v{settings.version}") init_database(settings.database_url) @@ -118,6 +122,38 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: _app.state.redis_pool = await create_pool(redis_settings) logger.info("Redis pool initialized") + try: + vector_client = create_vector_store_client() + vector_client.connect() + + async with get_session_context() as session: + config_service = TypedConfigService(session) + config = await config_service.get(EmbeddingConfig) + + if config.enabled and config.status in ( + VectorizationStatus.IDLE, + VectorizationStatus.REBUILDING, + ): + await vector_client.ensure_collections( + config.dimension, + config.provider, + config.model, + ) + + _app.state.vector_client = vector_client + _app.state.vector_client_error = None + logger.info( + "Vector client initialized", + extra={"backend": vector_backend_config.backend}, + ) + except Exception as e: + _app.state.vector_client = None + _app.state.vector_client_error = str(e) + logger.warning( + "Vector client unavailable for API scoring", + extra={"backend": vector_backend_config.backend, "error": str(e)}, + ) + # Run extra startup hook if extra_startup: await extra_startup() @@ -132,6 +168,14 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: if extra_shutdown: await extra_shutdown() finally: + vector_client = getattr(_app.state, "vector_client", None) + if vector_client: + vector_client.disconnect() + _app.state.vector_client = None + logger.info( + "Vector client disconnected", + extra={"backend": vector_backend_config.backend}, + ) redis_pool = getattr(_app.state, "redis_pool", None) if redis_pool: await redis_pool.close() diff --git a/backend/apps/api/glean_api/routers/admin.py b/backend/apps/api/glean_api/routers/admin.py index dcc0bc91..e8bccdd4 100644 --- a/backend/apps/api/glean_api/routers/admin.py +++ b/backend/apps/api/glean_api/routers/admin.py @@ -757,8 +757,15 @@ async def 
update_embedding_config( if updated.enabled and config_changed: # Generate new version and trigger rebuild await config_service.update_embedding_version() - await config_service.update(EmbeddingConfig, status=VectorizationStatus.VALIDATING) + await config_service.update( + EmbeddingConfig, + status=VectorizationStatus.VALIDATING, + last_error=None, + error_count=0, + ) await redis_pool.enqueue_job("validate_and_rebuild_embeddings") + # Re-read for response + updated = await config_service.get(EmbeddingConfig) return TypedEmbeddingConfigResponse.from_config(updated) @@ -772,7 +779,7 @@ async def enable_embedding( """ Enable vectorization. - Validates provider and Milvus connection, then triggers rebuild. + Validates provider and configured vector backend, then triggers rebuild. """ from glean_core.schemas.config import ( EmbeddingConfig, @@ -789,11 +796,13 @@ async def enable_embedding( if config.enabled: return TypedEmbeddingConfigResponse.from_config(config) - # Enable and set to validating + # Enable and set to validating, clear stale error fields updated = await config_service.update( EmbeddingConfig, enabled=True, status=VectorizationStatus.VALIDATING, + last_error=None, + error_count=0, ) # Trigger validation and rebuild in background @@ -839,7 +848,7 @@ async def validate_embedding_config( session: Annotated[AsyncSession, Depends(get_session)], ) -> dict[str, Any]: """ - Test provider and Milvus connection without saving. + Test provider and configured vector backend connection without saving. Returns validation results. 
""" @@ -889,10 +898,15 @@ async def trigger_embedding_rebuild( detail="Rebuild already in progress", ) - # Generate new version and trigger rebuild + # Generate new version and trigger rebuild, clear stale error fields await config_service.update_embedding_version() - await config_service.update(EmbeddingConfig, status=VectorizationStatus.VALIDATING) - await redis_pool.enqueue_job("validate_and_rebuild_embeddings") + await config_service.update( + EmbeddingConfig, + status=VectorizationStatus.VALIDATING, + last_error=None, + error_count=0, + ) + await redis_pool.enqueue_job("validate_and_rebuild_embeddings", True) return {"message": "Rebuild triggered", "status": "validating"} @@ -949,15 +963,15 @@ async def get_embedding_status( # Get progress from entry counts progress = await admin_service.get_embedding_progress() - # Auto-complete rebuild if all entries are processed + # Auto-complete rebuild if all entries reached terminal state current_status = config.status if config.status == VectorizationStatus.REBUILDING: - total = progress.get("total", 0) + pending = progress.get("pending", 0) + processing = progress.get("processing", 0) done = progress.get("done", 0) failed = progress.get("failed", 0) - # If all entries are processed (done + failed == total), mark as complete - if total > 0 and (done + failed) >= total: + if pending == 0 and processing == 0 and (done + failed) > 0: await config_service.complete_rebuild() current_status = VectorizationStatus.IDLE diff --git a/backend/apps/api/glean_api/routers/feeds.py b/backend/apps/api/glean_api/routers/feeds.py index b6493b97..b546a7d1 100644 --- a/backend/apps/api/glean_api/routers/feeds.py +++ b/backend/apps/api/glean_api/routers/feeds.py @@ -248,7 +248,7 @@ async def delete_subscription( subscription_id, current_user.id ) - # Queue Milvus embedding cleanup if feed was orphaned + # Queue vector embedding cleanup if feed was orphaned if orphaned_feed_id and entry_ids: await 
redis.enqueue_job("cleanup_orphan_embeddings", orphaned_feed_id, entry_ids) except ValueError as e: @@ -280,7 +280,7 @@ async def batch_delete_subscriptions( data.subscription_ids, current_user.id ) - # Queue Milvus embedding cleanup for each orphaned feed + # Queue vector embedding cleanup for each orphaned feed for feed_id, entry_ids in orphaned_feeds.items(): if entry_ids: await redis.enqueue_job("cleanup_orphan_embeddings", feed_id, entry_ids) diff --git a/backend/apps/api/glean_api/routers/system.py b/backend/apps/api/glean_api/routers/system.py index 4ecdc4a5..364fe1a0 100644 --- a/backend/apps/api/glean_api/routers/system.py +++ b/backend/apps/api/glean_api/routers/system.py @@ -54,40 +54,28 @@ async def get_vectorization_status( current_status = config.status if config.status == VectorizationStatus.REBUILDING: - # Get entry counts to check progress - total_result = await session.execute(select(func.count()).select_from(Entry)) - total = total_result.scalar_one() - - done_result = await session.execute( - select(func.count()).select_from(Entry).where(Entry.embedding_status == "done") - ) - done = done_result.scalar_one() - - failed_result = await session.execute( - select(func.count()).select_from(Entry).where(Entry.embedding_status == "failed") + result = await session.execute( + select(Entry.embedding_status, func.count()) + .where(Entry.embedding_status.in_(["pending", "processing", "done", "failed"])) + .group_by(Entry.embedding_status) ) - failed = failed_result.scalar_one() + counts: dict[str, int] = {str(row[0]): int(row[1]) for row in result.all()} - pending_result = await session.execute( - select(func.count()).select_from(Entry).where(Entry.embedding_status == "pending") - ) - pending = pending_result.scalar_one() - - processing_result = await session.execute( - select(func.count()).select_from(Entry).where(Entry.embedding_status == "processing") - ) - processing = processing_result.scalar_one() + pending = counts.get("pending", 0) + processing = 
counts.get("processing", 0) + done = counts.get("done", 0) + failed = counts.get("failed", 0) progress = EmbeddingRebuildProgress( - total=total, + total=pending + processing + done + failed, pending=pending, processing=processing, done=done, failed=failed, ) - # Auto-complete rebuild if all entries are processed - if total > 0 and (done + failed) >= total: + # Auto-complete rebuild when all entries reached terminal state + if pending == 0 and processing == 0 and (done + failed) > 0: await config_service.complete_rebuild() current_status = VectorizationStatus.IDLE diff --git a/backend/apps/api/tests/test_score_dependency.py b/backend/apps/api/tests/test_score_dependency.py new file mode 100644 index 00000000..96e037f2 --- /dev/null +++ b/backend/apps/api/tests/test_score_dependency.py @@ -0,0 +1,76 @@ +"""Tests for score-service dependency behavior.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from glean_api.dependencies import get_score_service +from glean_core.schemas.config import EmbeddingConfig, VectorizationStatus +from glean_vector.services.score_service import ScoreService + + +@pytest.mark.asyncio +async def test_get_score_service_reuses_app_scoped_vector_client() -> None: + """Should reuse the app-scoped vector client after ensuring storage.""" + session = AsyncMock() + vector_client = AsyncMock() + request = SimpleNamespace( + app=SimpleNamespace(state=SimpleNamespace(vector_client=vector_client)) + ) + config = EmbeddingConfig( + enabled=True, + status=VectorizationStatus.IDLE, + provider="openai", + model="text-embedding-3-small", + dimension=1536, + ) + + with ( + patch("glean_core.services.TypedConfigService") as config_service_cls, + patch("glean_vector.services.score_service.ScoreService") as score_service_cls, + ): + config_service = AsyncMock() + config_service.get.return_value = config + config_service_cls.return_value = config_service + + sentinel_service = 
MagicMock(spec=ScoreService) + score_service_cls.return_value = sentinel_service + + service = await get_score_service(request=request, session=session) + + assert service is sentinel_service + vector_client.ensure_collections.assert_awaited_once_with( + config.dimension, + config.provider, + config.model, + ) + score_service_cls.assert_called_once_with( + db_session=session, + vector_client=vector_client, + ) + + +@pytest.mark.asyncio +async def test_get_score_service_falls_back_when_vector_client_missing() -> None: + """Should return SimpleScoreService when startup did not initialize a client.""" + session = AsyncMock() + request = SimpleNamespace(app=SimpleNamespace(state=SimpleNamespace(vector_client=None))) + config = EmbeddingConfig( + enabled=True, + status=VectorizationStatus.IDLE, + provider="openai", + model="text-embedding-3-small", + dimension=1536, + ) + + with patch("glean_core.services.TypedConfigService") as config_service_cls: + config_service = AsyncMock() + config_service.get.return_value = config + config_service_cls.return_value = config_service + + service = await get_score_service(request=request, session=session) + + from glean_core.services import SimpleScoreService + + assert isinstance(service, SimpleScoreService) diff --git a/backend/apps/worker/glean_worker/main.py b/backend/apps/worker/glean_worker/main.py index 53014981..a1724443 100644 --- a/backend/apps/worker/glean_worker/main.py +++ b/backend/apps/worker/glean_worker/main.py @@ -14,12 +14,14 @@ from glean_core import get_logger, init_logging from glean_database.session import init_database -from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.vector_store import create_vector_store_client +from glean_vector.config import vector_backend_config from .config import settings from .tasks import ( bookmark_metadata, cleanup, + embedding_maintenance, embedding_rebuild, embedding_worker, feed_fetcher, @@ -58,29 +60,27 @@ async def startup(ctx: dict[str, 
Any]) -> None: # The redis client is automatically available in the worker context logger.info("Redis client available for distributed locks") - # Initialize Milvus client (M3) - optional for embedding/preference features - from glean_vector.config import milvus_config - - # Check if Milvus is explicitly configured (not just default localhost) - milvus_configured = milvus_config.host and milvus_config.host != "localhost" - - if milvus_configured or milvus_config.host == "localhost": - # Try to connect even for localhost (might be intentional dev setup) - logger.info(f"Attempting to connect to Milvus at {milvus_config.host}:{milvus_config.port}") - milvus_client = MilvusClient() - try: - milvus_client.connect() - ctx["milvus_client"] = milvus_client - logger.info("✓ Milvus client connected successfully") - except Exception as e: - logger.warning(f"✗ Failed to connect to Milvus: {e}") - logger.info( - "Worker will continue without Milvus - embedding and preference tasks will be skipped" - ) - ctx["milvus_client"] = None - else: - logger.info("Milvus not configured - embedding and preference features disabled") - ctx["milvus_client"] = None + # Initialize vector backend client (M3) - optional for embedding/preference features + try: + vector_client = create_vector_store_client() + vector_client.connect() + ctx["vector_client"] = vector_client + ctx["vector_client_error"] = None + logger.info( + "✓ Vector client connected successfully", + extra={"backend": vector_backend_config.backend}, + ) + except Exception as e: + logger.warning( + "✗ Failed to connect to vector backend", + extra={"backend": vector_backend_config.backend, "error": str(e)}, + ) + logger.info( + "Worker will continue without vector backend - embedding and preference tasks " + "will be skipped" + ) + ctx["vector_client"] = None + ctx["vector_client_error"] = str(e) # Dynamically log registered task functions logger.info("Registered task functions:") @@ -110,11 +110,11 @@ async def shutdown(ctx: 
dict[str, Any]) -> None: logger.info("=" * 60) logger.info("Shutting down Glean Worker") - # Disconnect Milvus client - milvus_client = ctx.get("milvus_client") - if milvus_client: - milvus_client.disconnect() - logger.info("Milvus client disconnected") + # Disconnect vector client + vector_client = ctx.get("vector_client") + if vector_client: + vector_client.disconnect() + logger.info("Vector client disconnected", extra={"backend": vector_backend_config.backend}) logger.info("=" * 60) @@ -137,6 +137,8 @@ def get_oss_functions() -> list[TaskFunction]: preference_worker.rebuild_user_preference, # Subscription cleanup subscription_cleanup.cleanup_orphan_embeddings, + # Embedding maintenance + embedding_maintenance.scheduled_embedding_maintenance, ] @@ -147,6 +149,11 @@ def get_oss_cron_jobs() -> list[CronJob]: cron(feed_fetcher.scheduled_fetch, minute={0, 15, 30, 45}), # Read-later cleanup (hourly at minute 0) cron(cleanup.scheduled_cleanup, minute=0), + # Embedding maintenance: recover stuck entries + auto-complete rebuild (every 5 min) + cron( + embedding_maintenance.scheduled_embedding_maintenance, + minute={3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58}, + ), ] diff --git a/backend/apps/worker/glean_worker/tasks/__init__.py b/backend/apps/worker/glean_worker/tasks/__init__.py index b2736226..d970595d 100644 --- a/backend/apps/worker/glean_worker/tasks/__init__.py +++ b/backend/apps/worker/glean_worker/tasks/__init__.py @@ -7,6 +7,7 @@ from . 
import ( bookmark_metadata, cleanup, + embedding_maintenance, embedding_rebuild, embedding_worker, feed_fetcher, @@ -20,6 +21,7 @@ "bookmark_metadata", "embedding_worker", "embedding_rebuild", + "embedding_maintenance", "preference_worker", "subscription_cleanup", ] diff --git a/backend/apps/worker/glean_worker/tasks/_vector_client.py b/backend/apps/worker/glean_worker/tasks/_vector_client.py new file mode 100644 index 00000000..7bf2d2e8 --- /dev/null +++ b/backend/apps/worker/glean_worker/tasks/_vector_client.py @@ -0,0 +1,43 @@ +"""Vector client lifecycle helpers for worker tasks.""" + +from __future__ import annotations + +from typing import Any + +from glean_core import get_logger +from glean_vector.clients.vector_store import VectorStoreClient, create_vector_store_client +from glean_vector.config import vector_backend_config + +logger = get_logger(__name__) + + +def ensure_vector_client(ctx: dict[str, Any]) -> tuple[VectorStoreClient | None, str | None]: + """ + Ensure a usable vector client exists in worker context. + + Returns: + (client, error_message). error_message is set when client is unavailable. 
+ """ + client = ctx.get("vector_client") + if client is not None: + return client, None + + try: + client = create_vector_store_client() + client.connect() + ctx["vector_client"] = client + ctx["vector_client_error"] = None + logger.info( + "Vector client initialized on demand", + extra={"backend": vector_backend_config.backend}, + ) + return client, None + except Exception as exc: + error_message = str(exc) + ctx["vector_client"] = None + ctx["vector_client_error"] = error_message + logger.warning( + "Failed to initialize vector client on demand", + extra={"backend": vector_backend_config.backend, "error": error_message}, + ) + return None, error_message diff --git a/backend/apps/worker/glean_worker/tasks/embedding_maintenance.py b/backend/apps/worker/glean_worker/tasks/embedding_maintenance.py new file mode 100644 index 00000000..ca66b18c --- /dev/null +++ b/backend/apps/worker/glean_worker/tasks/embedding_maintenance.py @@ -0,0 +1,183 @@ +"""Embedding maintenance tasks. + +Periodic housekeeping for the embedding/vectorization system: +- Recover entries stuck in 'processing' state (worker crash / job timeout) +- Re-enqueue batch jobs for remaining pending entries during a rebuild +- Auto-complete rebuild when all entries reach a terminal state +""" + +from datetime import UTC, datetime, timedelta +from typing import Any +from uuid import uuid4 + +from sqlalchemy import CursorResult, func, select, update +from sqlalchemy.ext.asyncio import AsyncSession + +from glean_core import get_logger +from glean_core.schemas.config import EmbeddingConfig, VectorizationStatus +from glean_core.services import TypedConfigService +from glean_database.models import Entry +from glean_database.session import get_session_context + +logger = get_logger(__name__) + +# Entries stuck in 'processing' longer than this are considered orphaned. +# Should be comfortably larger than the worker job_timeout (300s by default). 
+STUCK_PROCESSING_THRESHOLD = timedelta(minutes=10) + +MAINTENANCE_BATCH_SIZE = 200 + + +async def _recover_stuck_entries(session: AsyncSession) -> int: + """Reset entries stuck in 'processing' state back to 'pending'. + + An entry is considered stuck when it has been in 'processing' for longer + than STUCK_PROCESSING_THRESHOLD, indicating the worker job that was + handling it has crashed or timed out. + + Returns: + Number of entries recovered. + """ + threshold = datetime.now(UTC) - STUCK_PROCESSING_THRESHOLD + cursor_result: CursorResult[Any] = await session.execute( # type: ignore[assignment] + update(Entry) + .where( + Entry.embedding_status == "processing", + Entry.updated_at < threshold, + ) + .values( + embedding_status="pending", + embedding_error="Recovered from stuck processing state", + ) + ) + count = cursor_result.rowcount if cursor_result.rowcount else 0 + if count > 0: + await session.commit() + logger.info( + "Recovered stuck embedding entries", + extra={ + "count": count, + "threshold_minutes": STUCK_PROCESSING_THRESHOLD.total_seconds() / 60, + }, + ) + return count + + +async def _get_rebuild_counts( + session: AsyncSession, +) -> tuple[EmbeddingConfig, dict[str, int]] | None: + """Return (config, status_counts) when status is REBUILDING, else None.""" + config_service = TypedConfigService(session) + config = await config_service.get(EmbeddingConfig) + + if config.status != VectorizationStatus.REBUILDING: + return None + + result = await session.execute( + select(Entry.embedding_status, func.count()) + .where(Entry.embedding_status.in_(["pending", "processing", "done", "failed"])) + .group_by(Entry.embedding_status) + ) + counts: dict[str, int] = {str(row[0]): int(row[1]) for row in result.all()} + return config, counts + + +async def _re_enqueue_pending(redis: Any, pending: int) -> int: + """Enqueue batch jobs for remaining pending entries. 
+ + This is the safety net for the rebuild: if the initial batch of jobs + finishes (or partially fails) and pending entries remain, we enqueue + more work so the rebuild can make progress. + + Returns: + Number of batch jobs enqueued. + """ + if pending <= 0: + return 0 + + num_batches = max(1, (pending + MAINTENANCE_BATCH_SIZE - 1) // MAINTENANCE_BATCH_SIZE) + batch_prefix = f"maint_{uuid4().hex}" + for i in range(num_batches): + await redis.enqueue_job( + "batch_generate_embeddings", + MAINTENANCE_BATCH_SIZE, + _job_id=f"{batch_prefix}_{i}", + ) + + logger.info( + "Re-enqueued batch embedding jobs from maintenance", + extra={"pending": pending, "num_batches": num_batches}, + ) + return num_batches + + +async def _check_rebuild_completion(session: AsyncSession) -> bool: + """Check if REBUILDING state should transition to IDLE. + + The rebuild is complete when no entries remain in 'pending' or 'processing' + state — i.e. every entry has reached a terminal state ('done' or 'failed'). + + Returns: + True if rebuild was completed by this call. + """ + rebuild_info = await _get_rebuild_counts(session) + if rebuild_info is None: + return False + + _config, counts = rebuild_info + pending = counts.get("pending", 0) + processing = counts.get("processing", 0) + done = counts.get("done", 0) + failed = counts.get("failed", 0) + + if pending > 0 or processing > 0: + logger.debug( + "Rebuild still in progress", + extra={"pending": pending, "processing": processing}, + ) + return False + + if (done + failed) == 0: + return False + + config_service = TypedConfigService(session) + await config_service.complete_rebuild() + logger.info( + "Rebuild completed automatically by maintenance cron", + extra={"done": done, "failed": failed}, + ) + return True + + +async def scheduled_embedding_maintenance(ctx: dict[str, Any]) -> dict[str, Any]: + """Periodic maintenance for the embedding system. + + Runs as an arq cron job. Performs three checks: + 1. 
Recovers entries stuck in 'processing' (worker crash / timeout). + 2. Re-enqueues batch jobs for remaining pending entries during a rebuild. + 3. Transitions REBUILDING -> IDLE when all entries are processed. + """ + redis = ctx.get("redis") + + async with get_session_context() as session: + recovered = await _recover_stuck_entries(session) + + # If in REBUILDING state, check whether we need to push more work + # or whether the rebuild is done. + re_enqueued = 0 + rebuild_info = await _get_rebuild_counts(session) + if rebuild_info is not None: + _config, counts = rebuild_info + pending = counts.get("pending", 0) + processing = counts.get("processing", 0) + + if pending > 0 and processing == 0 and redis is not None: + re_enqueued = await _re_enqueue_pending(redis, pending) + + completed = await _check_rebuild_completion(session) + + return { + "recovered_stuck_entries": recovered, + "re_enqueued_batches": re_enqueued, + "rebuild_completed": completed, + } diff --git a/backend/apps/worker/glean_worker/tasks/embedding_rebuild.py b/backend/apps/worker/glean_worker/tasks/embedding_rebuild.py index a32e5887..11d85363 100644 --- a/backend/apps/worker/glean_worker/tasks/embedding_rebuild.py +++ b/backend/apps/worker/glean_worker/tasks/embedding_rebuild.py @@ -1,22 +1,49 @@ """Embedding rebuild task.""" +import contextlib from typing import Any +from uuid import uuid4 -from sqlalchemy import select, update +from sqlalchemy import func, select, update +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession from glean_core import get_logger -from glean_core.schemas.config import EmbeddingConfig as EmbeddingConfigSchema -from glean_core.schemas.config import VectorizationStatus +from glean_core.redis_keys import RedisKeys from glean_core.services import TypedConfigService from glean_core.services.system_config_service import SystemConfigService from glean_database.models import Entry, UserPreferenceStats from glean_database.session import 
get_session_context -from glean_vector.clients.milvus_client import MilvusClient from glean_vector.config import EmbeddingConfig as EmbeddingSettings from glean_vector.config import embedding_config as env_embedding_config +from ._vector_client import ensure_vector_client + logger = get_logger(__name__) +REBUILD_BATCH_SIZE = 200 + + +async def _find_duplicate_entry_ids(session: AsyncSession) -> list[str]: + """Return entry IDs that belong to duplicate (feed_id, guid) groups.""" + duplicate_pairs = ( + select(Entry.feed_id, Entry.guid) + .where(Entry.guid.is_not(None)) + .group_by(Entry.feed_id, Entry.guid) + .having(func.count(Entry.id) > 1) + .subquery() + ) + + duplicate_ids_result = await session.execute( + select(Entry.id) + .join( + duplicate_pairs, + (Entry.feed_id == duplicate_pairs.c.feed_id) & (Entry.guid == duplicate_pairs.c.guid), + ) + .order_by(Entry.feed_id, Entry.guid, Entry.created_at, Entry.id) + ) + return [row[0] for row in duplicate_ids_result.all()] + async def rebuild_embeddings( ctx: dict[str, Any], config: dict[str, Any] | None = None @@ -27,30 +54,47 @@ async def rebuild_embeddings( Steps: 1) Load embedding config (payload passed or system config / env fallback) 2) Update status to REBUILDING - 3) Recreate Milvus collections if dimension changed + 3) Recreate vector collections if dimension changed 4) Mark all entries pending 5) Enqueue embedding jobs in batches 6) Enqueue user preference rebuild jobs 7) Keep status as REBUILDING (will be set to IDLE when all done) """ - milvus_client: MilvusClient | None = ctx.get("milvus_client") - if not milvus_client: - return {"success": False, "error": "Milvus unavailable"} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + if vector_error: + return {"success": False, "error": f"Vector backend unavailable: {vector_error}"} + return {"success": False, "error": "Vector backend unavailable"} redis = ctx.get("redis") if not redis: return {"success": False, "error": "Redis 
unavailable"} + # Distributed lock prevents concurrent rebuilds (e.g. duplicate job enqueue) + lock = redis.lock(RedisKeys.REBUILD_LOCK_KEY, timeout=RedisKeys.REBUILD_LOCK_TIMEOUT) + acquired = await lock.acquire(blocking=False) + if not acquired: + logger.warning("Rebuild already in progress (lock held), skipping") + return {"success": False, "error": "Rebuild already in progress"} + + try: + return await _rebuild_embeddings_locked(vector_client, redis, config) + finally: + with contextlib.suppress(Exception): + await lock.release() + + +async def _rebuild_embeddings_locked( + vector_client: Any, + redis: Any, + config: dict[str, Any] | None, +) -> dict[str, Any]: + """Inner rebuild logic — must be called with the distributed lock held.""" async with get_session_context() as session: - # Update status to REBUILDING so embedding tasks can proceed + # Mark rebuild started (sets rebuild_id, rebuild_started_at, status=REBUILDING) config_service = TypedConfigService(session) - await config_service.update( - EmbeddingConfigSchema, - status=VectorizationStatus.REBUILDING, - ) - # Commit status change BEFORE modifying Milvus to prevent inconsistent state - # If recreate_collections succeeds but later operations fail, - # we need the REBUILDING status to be persisted so that retries work correctly + await config_service.start_rebuild() + # Commit status change BEFORE modifying vector data to prevent inconsistent state await session.commit() # Load config @@ -59,7 +103,6 @@ async def rebuild_embeddings( config = await scs.get_config("embedding.config") if not config: - # Fallback to env defaults env_conf = env_embedding_config.model_dump() config = { "provider": env_conf["provider"], @@ -75,34 +118,67 @@ async def rebuild_embeddings( ) dimension = settings.dimension - # Recreate Milvus collections (drop + create) for new model - # This also drops user_preferences collection, so we need to rebuild them - # NOTE: This is a point of no return - if this succeeds, old embeddings 
are gone. - # The REBUILDING status is already committed, so retries will work correctly. - await milvus_client.recreate_collections(dimension, settings.provider, settings.model) - logger.info(f"Recreated Milvus collections with dimension={dimension}") - - # Mark all entries pending for new model (new transaction) - await session.execute( - update(Entry).values( - embedding_status="pending", - embedding_error=None, + # Recreate vector storage (drop + create) for new model + # NOTE: Point of no return — old embeddings are gone after this. + await vector_client.recreate_collections(dimension, settings.provider, settings.model) + logger.info(f"Recreated vector collections with dimension={dimension}") + + # Mark all entries pending for new model + # If historical duplicate (feed_id, guid) rows exist, a full-table update can + # trigger uq_feed_guid conflicts. We skip those rows as a safe fallback. + duplicate_entry_ids: list[str] = [] + try: + await session.execute( + update(Entry).values( + embedding_status="pending", + embedding_error=None, + ) + ) + except IntegrityError: + # Rollback FIRST — asyncpg rejects any SQL while a transaction is in + # the failed state, so we must clear it before querying for duplicates. 
+ await session.rollback() + duplicate_entry_ids = await _find_duplicate_entry_ids(session) + if not duplicate_entry_ids: + raise + + logger.warning( + "Detected duplicate (feed_id, guid) entries during rebuild; " + "skipping them for this run", + extra={"duplicate_entry_count": len(duplicate_entry_ids)}, + ) + await session.execute( + update(Entry) + .where(Entry.id.not_in(duplicate_entry_ids)) + .values( + embedding_status="pending", + embedding_error=None, + ) ) - ) await session.commit() - # Enqueue embedding jobs in batches - total_result = await session.execute(select(Entry.id)) - entry_ids = [row[0] for row in total_result.all()] - - for entry_id in entry_ids: - await redis.enqueue_job("generate_entry_embedding", entry_id) + # Count pending entries + total_result = await session.execute( + select(func.count()).select_from(Entry).where(Entry.embedding_status == "pending") + ) + total_pending: int = total_result.scalar_one() + + # Enqueue batch embedding jobs (much more efficient than one job per entry) + num_batches = max(1, (total_pending + REBUILD_BATCH_SIZE - 1) // REBUILD_BATCH_SIZE) + batch_prefix = f"rebuild_{uuid4().hex}" + for i in range(num_batches): + await redis.enqueue_job( + "batch_generate_embeddings", + REBUILD_BATCH_SIZE, + _job_id=f"{batch_prefix}_{i}", + ) - logger.info(f"Enqueued {len(entry_ids)} embedding jobs") + logger.info( + "Enqueued batch embedding jobs for rebuild", + extra={"total_pending": total_pending, "num_batches": num_batches}, + ) - # Enqueue user preference rebuild jobs for all users with preference data - # User preference vectors were deleted when collections were recreated, - # so we need to rebuild them from historical feedback + # Enqueue user preference rebuild jobs users_result = await session.execute(select(UserPreferenceStats.user_id).distinct()) user_ids = [row[0] for row in users_result.all()] @@ -111,13 +187,14 @@ async def rebuild_embeddings( logger.info(f"Enqueued {len(user_ids)} preference rebuild jobs") - # 
Keep status as REBUILDING - the status API will automatically - # update to IDLE when all pending entries are processed - # Note: We don't set to IDLE here because tasks are still in queue + # Status stays REBUILDING — the maintenance cron will transition + # to IDLE when all entries reach a terminal state. return { "success": True, - "queued_entries": len(entry_ids), + "queued_entries": total_pending, + "queued_batches": num_batches, "queued_preferences": len(user_ids), "dimension": dimension, + "skipped_duplicate_entries": len(duplicate_entry_ids), } diff --git a/backend/apps/worker/glean_worker/tasks/embedding_worker.py b/backend/apps/worker/glean_worker/tasks/embedding_worker.py index b6cca19f..05d704ba 100644 --- a/backend/apps/worker/glean_worker/tasks/embedding_worker.py +++ b/backend/apps/worker/glean_worker/tasks/embedding_worker.py @@ -9,10 +9,11 @@ from glean_core.services import TypedConfigService from glean_database.session import get_session_context from glean_vector.clients.embedding_client import EmbeddingClient -from glean_vector.clients.milvus_client import MilvusClient from glean_vector.config import EmbeddingConfig as EmbeddingSettings from glean_vector.services.embedding_service import EmbeddingService +from ._vector_client import ensure_vector_client + logger = get_logger(__name__) # Circuit breaker state @@ -91,6 +92,23 @@ async def _reset_error_count(session: AsyncSession) -> None: await config_service.update(EmbeddingConfig, error_count=0) +async def _safe_handle_error(session: AsyncSession, error: Exception) -> None: + """Handle embedding error, recovering from a poisoned session first. + + When a prior DB operation fails, asyncpg puts the connection into a + failed-transaction state. This helper rolls back before touching the + session so the circuit-breaker logic can still read/write config. 
+ """ + import contextlib + + with contextlib.suppress(Exception): + await session.rollback() + try: + await _handle_embedding_error(session, error) + except Exception: + logger.warning("Could not update circuit breaker after error", exc_info=True) + + async def generate_entry_embedding(ctx: dict[str, Any], entry_id: str) -> dict[str, Any]: """ Generate embedding for a single entry. @@ -102,9 +120,12 @@ async def generate_entry_embedding(ctx: dict[str, Any], entry_id: str) -> dict[s Returns: Result dictionary """ - milvus_client: MilvusClient | None = ctx.get("milvus_client") - if not milvus_client: - return {"success": False, "entry_id": entry_id, "error": "Milvus unavailable"} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + error = "Vector backend unavailable" + if vector_error: + error = f"{error}: {vector_error}" + return {"success": False, "entry_id": entry_id, "error": error} async with get_session_context() as session: # Check if vectorization is enabled @@ -117,33 +138,38 @@ async def generate_entry_embedding(ctx: dict[str, Any], entry_id: str) -> dict[s embedding_client = EmbeddingClient(config=settings, rate_limit=rate_limit) try: - # Ensure Milvus collections exist with correct model - await milvus_client.ensure_collections( + # Ensure vector storage exists with correct model config + await vector_client.ensure_collections( settings.dimension, settings.provider, settings.model ) embedding_service = EmbeddingService( db_session=session, embedding_client=embedding_client, - milvus_client=milvus_client, + vector_client=vector_client, ) success = await embedding_service.generate_embedding(entry_id) if success: - # Reset error count on success await _reset_error_count(session) return {"success": success, "entry_id": entry_id} except Exception as e: + # Infrastructure error (API / vector backend). The entry is + # already marked "failed" by the service layer. 
Count toward + # the circuit breaker but do NOT re-raise — arq retries are + # wasteful when the backend is down; the entry will be picked + # up later by retry_failed_embeddings or the next rebuild. error_msg = str(e) logger.error( f"Failed to generate embedding for entry {entry_id}: {error_msg}", - exc_info=True, # Include full traceback + exc_info=True, ) - await _handle_embedding_error(session, e) - raise + # Session may be in a failed transaction state; rollback first. + await _safe_handle_error(session, e) + return {"success": False, "entry_id": entry_id, "error": error_msg} finally: await embedding_client.close() @@ -160,9 +186,12 @@ async def batch_generate_embeddings(ctx: dict[str, Any], limit: int = 100) -> di Returns: Result dictionary with processed and failed counts """ - milvus_client: MilvusClient | None = ctx.get("milvus_client") - if not milvus_client: - return {"processed": 0, "failed": 0, "error": "Milvus unavailable"} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + error = "Vector backend unavailable" + if vector_error: + error = f"{error}: {vector_error}" + return {"processed": 0, "failed": 0, "error": error} async with get_session_context() as session: # Check if vectorization is enabled @@ -175,29 +204,38 @@ async def batch_generate_embeddings(ctx: dict[str, Any], limit: int = 100) -> di embedding_client = EmbeddingClient(config=settings, rate_limit=rate_limit) try: - # Ensure Milvus collections exist with correct model - await milvus_client.ensure_collections( + # Ensure vector storage exists with correct model config + await vector_client.ensure_collections( settings.dimension, settings.provider, settings.model ) embedding_service = EmbeddingService( db_session=session, embedding_client=embedding_client, - milvus_client=milvus_client, + vector_client=vector_client, ) result = await embedding_service.batch_generate(limit=limit) - if result.get("processed", 0) > 0: - # Reset error count on successful batch 
+ processed = result.get("processed", 0) + failed = result.get("failed", 0) + + if processed > 0: await _reset_error_count(session) + elif failed > 0: + # Entire batch failed — count toward circuit breaker. + # Session may be dirty; rollback before using it for config. + await _safe_handle_error( + session, + RuntimeError(f"Batch: all {failed} entries failed, 0 succeeded"), + ) return result # type: ignore[return-value] except Exception as e: logger.error(f"Failed to batch generate embeddings: {e}") - await _handle_embedding_error(session, e) - raise + await _safe_handle_error(session, e) + return {"processed": 0, "failed": 0, "error": str(e)} finally: await embedding_client.close() @@ -214,9 +252,12 @@ async def retry_failed_embeddings(ctx: dict[str, Any], limit: int = 50) -> dict[ Returns: Result dictionary with processed and failed counts """ - milvus_client: MilvusClient | None = ctx.get("milvus_client") - if not milvus_client: - return {"processed": 0, "failed": 0, "error": "Milvus unavailable"} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + error = "Vector backend unavailable" + if vector_error: + error = f"{error}: {vector_error}" + return {"processed": 0, "failed": 0, "error": error} async with get_session_context() as session: # Check if vectorization is enabled @@ -229,40 +270,52 @@ async def retry_failed_embeddings(ctx: dict[str, Any], limit: int = 50) -> dict[ embedding_client = EmbeddingClient(config=settings, rate_limit=rate_limit) try: - # Ensure Milvus collections exist with correct model - await milvus_client.ensure_collections( + # Ensure vector storage exists with correct model config + await vector_client.ensure_collections( settings.dimension, settings.provider, settings.model ) embedding_service = EmbeddingService( db_session=session, embedding_client=embedding_client, - milvus_client=milvus_client, + vector_client=vector_client, ) result = await embedding_service.retry_failed(limit=limit) - if 
result.get("processed", 0) > 0: + processed = result.get("processed", 0) + failed = result.get("failed", 0) + + if processed > 0: await _reset_error_count(session) + elif failed > 0: + await _safe_handle_error( + session, + RuntimeError(f"Retry batch: all {failed} entries failed, 0 succeeded"), + ) return result # type: ignore[return-value] except Exception as e: logger.error(f"Failed to retry failed embeddings: {e}") - await _handle_embedding_error(session, e) - raise + await _safe_handle_error(session, e) + return {"processed": 0, "failed": 0, "error": str(e)} finally: await embedding_client.close() -async def validate_and_rebuild_embeddings(ctx: dict[str, Any]) -> dict[str, Any]: +async def validate_and_rebuild_embeddings( + ctx: dict[str, Any], force_rebuild: bool = False +) -> dict[str, Any]: """ Validate embedding config and trigger rebuild if valid. This task is triggered when vectorization is enabled or config is changed. + When force_rebuild is True (explicit user action), the compatibility check + is skipped and a full rebuild is always triggered. 
""" - milvus_client: MilvusClient | None = ctx.get("milvus_client") + vector_client, vector_error = ensure_vector_client(ctx) redis = ctx.get("redis") async with get_session_context() as session: @@ -286,31 +339,46 @@ async def validate_and_rebuild_embeddings(ctx: dict[str, Any]) -> dict[str, Any] ) return {"success": False, "error": provider_result.message} - # Validate Milvus - if milvus_client: - milvus_result = await validation_service.validate_milvus( + # Validate vector backend + if vector_client: + backend_result = await validation_service.validate_vector_backend( config.dimension, config.provider, config.model ) - if not milvus_result.success: + if not backend_result.success: await config_service.set_embedding_status( VectorizationStatus.ERROR.value, - error=f"Milvus validation failed: {milvus_result.message}", + error=f"Vector backend validation failed: {backend_result.message}", ) - return {"success": False, "error": milvus_result.message} + return {"success": False, "error": backend_result.message} else: + error = "Vector client not available" + if vector_error: + error = f"{error}: {vector_error}" await config_service.set_embedding_status( VectorizationStatus.ERROR.value, - error="Milvus client not available", + error=error, ) - return {"success": False, "error": "Milvus client not available"} + return {"success": False, "error": error} - # Validation passed, check if rebuild is actually needed - # If collections already exist with the same model signature, skip rebuild - is_compatible, reason = milvus_client.check_model_compatibility( - config.dimension, config.provider, config.model - ) + # Validation passed, check if rebuild is actually needed. + # Prefer backend validation result (async, backend-aware) when available, + # then fall back to client-level compatibility checks. 
+ backend_details = backend_result.details + + details_has_compat = "is_compatible" in backend_details + details_has_exists = "collections_exist" in backend_details + + if details_has_compat and details_has_exists: + is_compatible = bool(backend_details.get("is_compatible")) + collections_exist = bool(backend_details.get("collections_exist")) + reason = backend_details.get("compatibility_reason") + else: + is_compatible, reason = vector_client.check_model_compatibility( + config.dimension, config.provider, config.model + ) + collections_exist = vector_client.collections_exist() - if is_compatible and milvus_client.collections_exist(): + if is_compatible and collections_exist and not force_rebuild: # Collections exist and are compatible - no rebuild needed logger.info( "Collections already compatible with config, skipping rebuild. " @@ -323,6 +391,12 @@ async def validate_and_rebuild_embeddings(ctx: dict[str, Any]) -> dict[str, Any] "skipped_rebuild": True, } + if force_rebuild and is_compatible and collections_exist: + logger.info( + "Force rebuild requested despite compatible collections. " + f"model={config.provider}:{config.model}, dimension={config.dimension}" + ) + # Rebuild needed: either collections don't exist or model changed logger.info( f"Rebuild required: {reason or 'collections do not exist'}. Triggering rebuild..." 
diff --git a/backend/apps/worker/glean_worker/tasks/feed_fetcher.py b/backend/apps/worker/glean_worker/tasks/feed_fetcher.py index 997f8ddb..6a7727bc 100644 --- a/backend/apps/worker/glean_worker/tasks/feed_fetcher.py +++ b/backend/apps/worker/glean_worker/tasks/feed_fetcher.py @@ -9,6 +9,7 @@ from arq import Retry from sqlalchemy import select +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession from glean_core import get_logger @@ -21,6 +22,11 @@ logger = get_logger(__name__) +def _is_duplicate_feed_guid_error(error: IntegrityError) -> bool: + """Check whether IntegrityError is caused by uq_feed_guid violation.""" + return "uq_feed_guid" in str(getattr(error, "orig", error)) + + async def _is_vectorization_enabled(session: AsyncSession) -> bool: """Check if vectorization is enabled and healthy.""" config_service = TypedConfigService(session) @@ -43,6 +49,12 @@ async def fetch_feed_task(ctx: dict[str, Any], feed_id: str) -> dict[str, str | Dictionary with fetch results. """ logger.info("Starting feed fetch", extra={"feed_id": feed_id}) + + # Collect entry IDs for embedding; enqueue AFTER the transaction commits + # so the embedding worker can see the newly inserted entries. 
+ pending_embedding_ids: list[str] = [] + fetch_result_dict: dict[str, str | int] = {"status": "error", "message": "Unknown"} + async with get_session_context() as session: try: # Get feed from database @@ -110,11 +122,21 @@ async def fetch_feed_task(ctx: dict[str, Any], feed_id: str) -> dict[str, str | if cache_headers and "last-modified" in cache_headers: feed.last_modified = cache_headers["last-modified"] + # Check vectorization once for the entire batch + should_embed = bool( + ctx.get("vector_client") and await _is_vectorization_enabled(session) + ) + # Process entries new_entries = 0 latest_entry_time = feed.last_entry_at for parsed_entry in parsed_feed.entries: + if parsed_entry.published_at and ( + latest_entry_time is None or parsed_entry.published_at > latest_entry_time + ): + latest_entry_time = parsed_entry.published_at + # Check if entry already exists stmt = select(Entry).where( Entry.feed_id == feed.id, Entry.guid == parsed_entry.guid @@ -171,23 +193,28 @@ async def fetch_feed_task(ctx: dict[str, Any], feed_id: str) -> dict[str, str | summary=parsed_entry.summary, published_at=parsed_entry.published_at, ) - session.add(entry) - await session.flush() # Get entry ID + try: + # Use savepoint so one duplicate insert won't abort the whole feed fetch tx. 
+ async with session.begin_nested(): + session.add(entry) + await session.flush() # Get entry ID + except IntegrityError as e: + if _is_duplicate_feed_guid_error(e): + logger.info( + "Skipping duplicate entry caused by concurrent fetch", + extra={ + "feed_id": feed_id, + "guid": parsed_entry.guid, + "url": parsed_entry.url, + }, + ) + continue + raise new_entries += 1 - # M3: Queue embedding task for new entry (only if vectorization enabled) - if ctx.get("milvus_client") and await _is_vectorization_enabled(session): - await ctx["redis"].enqueue_job("generate_entry_embedding", entry.id) - logger.debug( - "Queued embedding task for entry", - extra={"feed_id": feed_id, "entry_id": entry.id}, - ) - - # Track latest entry time - if parsed_entry.published_at and ( - latest_entry_time is None or parsed_entry.published_at > latest_entry_time - ): - latest_entry_time = parsed_entry.published_at + # M3: Collect entry ID for embedding (enqueued after commit) + if should_embed: + pending_embedding_ids.append(entry.id) # Update last_entry_at and schedule next fetch if latest_entry_time: @@ -205,7 +232,7 @@ async def fetch_feed_task(ctx: dict[str, Any], feed_id: str) -> dict[str, str | "total_entries": len(parsed_feed.entries), }, ) - return { + fetch_result_dict = { "status": "success", "feed_id": feed_id, "new_entries": new_entries, @@ -248,10 +275,24 @@ async def fetch_feed_task(ctx: dict[str, Any], feed_id: str) -> dict[str, str | }, ) + # Don't enqueue embedding tasks on error + pending_embedding_ids.clear() + # Retry the task logger.info("Retrying task in 5 minutes", extra={"feed_id": feed_id}) raise Retry(defer=timedelta(minutes=5)) from None + # Enqueue embedding tasks AFTER the session has committed so the worker can see the entries + for entry_id in pending_embedding_ids: + await ctx["redis"].enqueue_job("generate_entry_embedding", entry_id) + if pending_embedding_ids: + logger.debug( + "Queued embedding tasks after commit", + extra={"feed_id": feed_id, "count": 
len(pending_embedding_ids)}, + ) + + return fetch_result_dict + async def fetch_all_feeds(ctx: dict[str, Any]) -> dict[str, int]: """ diff --git a/backend/apps/worker/glean_worker/tasks/preference_worker.py b/backend/apps/worker/glean_worker/tasks/preference_worker.py index 11106df4..113417d7 100644 --- a/backend/apps/worker/glean_worker/tasks/preference_worker.py +++ b/backend/apps/worker/glean_worker/tasks/preference_worker.py @@ -9,9 +9,10 @@ from glean_core.schemas.config import EmbeddingConfig, VectorizationStatus from glean_core.services import TypedConfigService from glean_database.session import get_session_context -from glean_vector.clients.milvus_client import MilvusClient from glean_vector.services.preference_service import PreferenceService +from ._vector_client import ensure_vector_client + if TYPE_CHECKING: from sqlalchemy.ext.asyncio import AsyncSession @@ -85,9 +86,12 @@ async def update_user_preference( Raises: Retry: If vectorization is temporarily unavailable """ - milvus_client: MilvusClient | None = ctx.get("milvus_client") - if not milvus_client: - return {"success": False, "user_id": user_id, "error": "Milvus unavailable"} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + error = "Vector backend unavailable" + if vector_error: + error = f"{error}: {vector_error}" + return {"success": False, "user_id": user_id, "error": error} # Get Redis client from worker context (provided by arq) redis_client = ctx.get("redis") @@ -102,8 +106,8 @@ async def update_user_preference( logger.debug(f"Vectorization disabled, skipping preference update for {user_id}") return {"success": False, "user_id": user_id, "error": str(e)} - # Ensure Milvus collections exist with correct model from database config - await milvus_client.ensure_collections( + # Ensure vector storage exists with correct model from database config + await vector_client.ensure_collections( config.dimension, config.provider, config.model, @@ -111,7 +115,7 @@ 
async def update_user_preference( preference_service = PreferenceService( db_session=session, - milvus_client=milvus_client, + vector_client=vector_client, redis_client=redis_client, ) @@ -141,9 +145,12 @@ async def rebuild_user_preference( Raises: Retry: If vectorization is temporarily unavailable """ - milvus_client: MilvusClient | None = ctx.get("milvus_client") - if not milvus_client: - return {"success": False, "user_id": user_id, "error": "Milvus unavailable"} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + error = "Vector backend unavailable" + if vector_error: + error = f"{error}: {vector_error}" + return {"success": False, "user_id": user_id, "error": error} # Get Redis client from worker context (provided by arq) redis_client = ctx.get("redis") @@ -158,8 +165,8 @@ async def rebuild_user_preference( logger.debug(f"Vectorization disabled, skipping preference rebuild for {user_id}") return {"success": False, "user_id": user_id, "error": str(e)} - # Ensure Milvus collections exist with correct model from database config - await milvus_client.ensure_collections( + # Ensure vector storage exists with correct model from database config + await vector_client.ensure_collections( config.dimension, config.provider, config.model, @@ -167,7 +174,7 @@ async def rebuild_user_preference( preference_service = PreferenceService( db_session=session, - milvus_client=milvus_client, + vector_client=vector_client, redis_client=redis_client, ) diff --git a/backend/apps/worker/glean_worker/tasks/subscription_cleanup.py b/backend/apps/worker/glean_worker/tasks/subscription_cleanup.py index ce3f9588..165709f4 100644 --- a/backend/apps/worker/glean_worker/tasks/subscription_cleanup.py +++ b/backend/apps/worker/glean_worker/tasks/subscription_cleanup.py @@ -2,13 +2,15 @@ Subscription cleanup tasks. This module handles cleanup of orphan data when subscriptions are deleted, -particularly cleaning up Milvus embeddings for deleted entries. 
+particularly cleaning up vector embeddings for deleted entries. """ from typing import Any from glean_core import get_logger +from ._vector_client import ensure_vector_client + logger = get_logger(__name__) @@ -16,31 +18,41 @@ async def cleanup_orphan_embeddings( ctx: dict[str, Any], feed_id: str, entry_ids: list[str] ) -> dict[str, Any]: """ - Clean up Milvus embeddings for deleted entries. + Clean up vector embeddings for deleted entries. This task is called when a feed is deleted (no more subscribers). Since the entries are deleted via CASCADE from the database, - we need to manually clean up their embeddings from Milvus. + we need to manually clean up their embeddings from vector storage. Args: - ctx: Worker context with milvus_client. + ctx: Worker context with vector_client. feed_id: The deleted feed ID (for logging). entry_ids: List of entry IDs whose embeddings should be deleted. Returns: Result dict with success status and counts. """ - milvus_client = ctx.get("milvus_client") - if not milvus_client: - logger.warning("Milvus client not available, skipping embedding cleanup") - return {"success": False, "error": "Milvus unavailable", "feed_id": feed_id} + vector_client, vector_error = ensure_vector_client(ctx) + if not vector_client: + if vector_error: + logger.warning( + "Vector client not available, skipping embedding cleanup", + extra={"feed_id": feed_id, "error": vector_error}, + ) + return { + "success": False, + "error": f"Vector backend unavailable: {vector_error}", + "feed_id": feed_id, + } + logger.warning("Vector client not available, skipping embedding cleanup") + return {"success": False, "error": "Vector backend unavailable", "feed_id": feed_id} deleted_count = 0 failed_count = 0 for entry_id in entry_ids: try: - await milvus_client.delete_entry_embedding(entry_id) + await vector_client.delete_entry_embedding(entry_id) deleted_count += 1 except Exception as e: logger.warning(f"Failed to delete embedding for entry {entry_id}: {e}") diff --git 
a/backend/apps/worker/tests/test_preference_worker.py b/backend/apps/worker/tests/test_preference_worker.py index 1c3722ed..777ee2af 100644 --- a/backend/apps/worker/tests/test_preference_worker.py +++ b/backend/apps/worker/tests/test_preference_worker.py @@ -147,7 +147,7 @@ class TestUpdateUserPreference: async def test_validating_state_propagates_retry(self): """When vectorization is validating, Retry exception should propagate to arq.""" # Arrange - ctx = {"milvus_client": MagicMock(), "redis": MagicMock()} + ctx = {"vector_client": MagicMock(), "redis": MagicMock()} mock_config = EmbeddingConfig( enabled=True, status=VectorizationStatus.VALIDATING, @@ -177,7 +177,7 @@ async def test_validating_state_propagates_retry(self): async def test_disabled_state_returns_error_without_retry(self): """When vectorization is disabled, should return error dict without raising Retry.""" # Arrange - ctx = {"milvus_client": MagicMock(), "redis": MagicMock()} + ctx = {"vector_client": MagicMock(), "redis": MagicMock()} mock_config = EmbeddingConfig( enabled=False, status=VectorizationStatus.DISABLED, @@ -214,7 +214,7 @@ class TestRebuildUserPreference: async def test_error_state_propagates_retry(self): """When vectorization is in ERROR state, Retry exception should propagate.""" # Arrange - ctx = {"milvus_client": MagicMock(), "redis": MagicMock()} + ctx = {"vector_client": MagicMock(), "redis": MagicMock()} mock_config = EmbeddingConfig( enabled=True, status=VectorizationStatus.ERROR, @@ -243,7 +243,7 @@ async def test_error_state_propagates_retry(self): async def test_disabled_state_returns_error_without_retry(self): """When vectorization is disabled, should return error dict without raising Retry.""" # Arrange - ctx = {"milvus_client": MagicMock(), "redis": MagicMock()} + ctx = {"vector_client": MagicMock(), "redis": MagicMock()} mock_config = EmbeddingConfig( enabled=False, status=VectorizationStatus.DISABLED, diff --git 
a/backend/apps/worker/tests/test_vector_client_recovery.py b/backend/apps/worker/tests/test_vector_client_recovery.py new file mode 100644 index 00000000..1f5ddfc2 --- /dev/null +++ b/backend/apps/worker/tests/test_vector_client_recovery.py @@ -0,0 +1,79 @@ +"""Tests for worker vector client recovery behavior.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from glean_core.schemas.config import EmbeddingConfig, VectorizationStatus +from glean_worker.tasks._vector_client import ensure_vector_client +from glean_worker.tasks.embedding_worker import validate_and_rebuild_embeddings + + +def test_ensure_vector_client_reuses_existing_client(): + """Should reuse client already attached to worker context.""" + existing = MagicMock() + ctx = {"vector_client": existing} + + client, error = ensure_vector_client(ctx) + + assert client is existing + assert error is None + + +def test_ensure_vector_client_sets_error_when_connect_fails(): + """Should cache connection error for status reporting.""" + ctx: dict[str, object] = {} + + with patch("glean_worker.tasks._vector_client.create_vector_store_client") as create_client: + failed_client = MagicMock() + failed_client.connect.side_effect = RuntimeError("connect failed") + create_client.return_value = failed_client + + client, error = ensure_vector_client(ctx) + + assert client is None + assert error == "connect failed" + assert ctx["vector_client"] is None + assert ctx["vector_client_error"] == "connect failed" + + +@pytest.mark.asyncio +async def test_validate_and_rebuild_surfaces_vector_client_error(): + """Should include vector client init error in embedding status and response.""" + ctx = {"redis": AsyncMock()} + config = EmbeddingConfig( + enabled=True, + status=VectorizationStatus.REBUILDING, + provider="sentence-transformers", + model="all-MiniLM-L6-v2", + dimension=384, + ) + + with ( + patch( + "glean_worker.tasks.embedding_worker.ensure_vector_client", + return_value=(None, "connect 
failed"), + ), + patch("glean_worker.tasks.embedding_worker.get_session_context") as get_session_context, + patch("glean_worker.tasks.embedding_worker.TypedConfigService") as config_service_cls, + patch("glean_vector.services.EmbeddingValidationService") as validation_service_cls, + ): + mock_session = AsyncMock() + get_session_context.return_value.__aenter__.return_value = mock_session + + config_service = AsyncMock() + config_service.get.return_value = config + config_service_cls.return_value = config_service + + validation_service = AsyncMock() + validation_service.validate_provider.return_value = MagicMock(success=True) + validation_service_cls.return_value = validation_service + + result = await validate_and_rebuild_embeddings(ctx) + + assert result["success"] is False + assert result["error"] == "Vector client not available: connect failed" + config_service.set_embedding_status.assert_awaited_once_with( + VectorizationStatus.ERROR.value, + error="Vector client not available: connect failed", + ) diff --git a/backend/conftest.py b/backend/conftest.py index f71fd817..08e050bb 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -10,6 +10,7 @@ import pytest import pytest_asyncio from httpx import ASGITransport, AsyncClient +from sqlalchemy import text from sqlalchemy.ext.asyncio import ( AsyncEngine, AsyncSession, @@ -160,6 +161,7 @@ async def test_engine() -> AsyncGenerator[AsyncEngine, None]: # Create all tables async with engine.begin() as conn: + await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) await conn.run_sync(Base.metadata.drop_all) await conn.run_sync(Base.metadata.create_all) diff --git a/backend/packages/core/glean_core/redis_keys.py b/backend/packages/core/glean_core/redis_keys.py index a242cf2d..b6876d1c 100644 --- a/backend/packages/core/glean_core/redis_keys.py +++ b/backend/packages/core/glean_core/redis_keys.py @@ -86,6 +86,16 @@ def oidc_rate_limit(action: str, client_id: str) -> str: """ return 
f"oidc_rate_limit:{action}:{client_id}" + # ============================================================================ + # Embedding Rebuild Keys + # ============================================================================ + + # Distributed lock for embedding rebuild + # Prevents concurrent rebuild jobs from running + # TTL: 10 minutes (generous for the enqueue phase) + REBUILD_LOCK_KEY = "glean:rebuild_embeddings_lock" + REBUILD_LOCK_TIMEOUT = 600 + # ============================================================================ # Preference System Keys # ============================================================================ diff --git a/backend/packages/core/glean_core/schemas/config.py b/backend/packages/core/glean_core/schemas/config.py index bbd80d60..a1894083 100644 --- a/backend/packages/core/glean_core/schemas/config.py +++ b/backend/packages/core/glean_core/schemas/config.py @@ -19,7 +19,7 @@ class VectorizationStatus(str, Enum): IDLE = "idle" # Enabled, normal operation VALIDATING = "validating" # Testing provider connection REBUILDING = "rebuilding" # Full re-embedding in progress - ERROR = "error" # Provider/Milvus unavailable + ERROR = "error" # Provider/vector backend unavailable class RateLimitConfig(BaseModel): @@ -180,7 +180,7 @@ class EmbeddingConfigUpdateRequest(BaseModel): class ValidationResult(BaseModel): - """Result of provider/Milvus validation.""" + """Result of provider/vector-backend validation.""" success: bool message: str diff --git a/backend/packages/core/glean_core/services/admin_service.py b/backend/packages/core/glean_core/services/admin_service.py index bcf7ef85..2e9492d6 100644 --- a/backend/packages/core/glean_core/services/admin_service.py +++ b/backend/packages/core/glean_core/services/admin_service.py @@ -618,19 +618,29 @@ async def save_embedding_config(self, config: dict[str, Any]) -> dict[str, Any]: async def get_embedding_progress(self) -> dict[str, int]: """ - Compute embedding rebuild progress using entry 
counters. - """ - total_result = await self.session.execute(select(func.count()).select_from(Entry)) - total = total_result.scalar_one() - - done_query = select(func.count()).select_from(Entry).where(Entry.embedding_status == "done") - done_result = await self.session.execute(done_query) - done = done_result.scalar_one() + Compute embedding rebuild progress using entry status counters. - failed_query = ( - select(func.count()).select_from(Entry).where(Entry.embedding_status == "failed") + Uses a single ``GROUP BY`` query instead of four round-trips. + ``total`` is derived from the sum of all known embedding states so that + entries arriving *during* a rebuild (with default status) are correctly + reflected and the auto-complete condition works reliably. + """ + result = await self.session.execute( + select(Entry.embedding_status, func.count()) + .where(Entry.embedding_status.in_(["pending", "processing", "done", "failed"])) + .group_by(Entry.embedding_status) ) - failed_result = await self.session.execute(failed_query) - failed = failed_result.scalar_one() + counts: dict[str, int] = {str(row[0]): int(row[1]) for row in result.all()} - return {"total": total, "done": done, "failed": failed} + pending = counts.get("pending", 0) + processing = counts.get("processing", 0) + done = counts.get("done", 0) + failed = counts.get("failed", 0) + + return { + "total": pending + processing + done + failed, + "pending": pending, + "processing": processing, + "done": done, + "failed": failed, + } diff --git a/backend/packages/database/glean_database/migrations/versions/b51dbf4f4d2a_add_pgvector_backend_tables.py b/backend/packages/database/glean_database/migrations/versions/b51dbf4f4d2a_add_pgvector_backend_tables.py new file mode 100644 index 00000000..10beda54 --- /dev/null +++ b/backend/packages/database/glean_database/migrations/versions/b51dbf4f4d2a_add_pgvector_backend_tables.py @@ -0,0 +1,110 @@ +"""add_pgvector_backend_tables + +Revision ID: b51dbf4f4d2a +Revises: 
7c6b419ed52d +Create Date: 2026-02-13 10:00:00.000000 + +""" + +import os +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +try: + from pgvector.sqlalchemy import Vector +except ImportError: # pragma: no cover - migration runtime dependency + Vector = None # type: ignore[assignment,misc] + +# revision identifiers, used by Alembic. +revision: str = "b51dbf4f4d2a" +down_revision: str | None = "7c6b419ed52d" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def _is_pgvector_backend() -> bool: + """Return whether the pgvector vector backend is active. + + The pgvector schema (the ``vector`` extension plus the embedding tables) is + only provisioned when the deployment is configured to use the pgvector + backend. Milvus (the default) and any other backend skip it entirely so + those deployments are not forced to have the ``vector`` extension available + on their PostgreSQL server. This mirrors ``VectorBackendConfig`` whose + default is ``"milvus"``. + """ + return os.getenv("VECTOR_BACKEND", "milvus").strip().lower() == "pgvector" + + +def upgrade() -> None: + # Milvus / non-pgvector deployments: pgvector schema is not needed. 
+ if not _is_pgvector_backend(): + return + + if Vector is None: + raise RuntimeError("pgvector is required to apply pgvector schema migration") + + op.execute("CREATE EXTENSION IF NOT EXISTS vector") + + op.create_table( + "entry_embeddings", + sa.Column("id", sa.String(length=36), nullable=False), + sa.Column("embedding", Vector(), nullable=False), # type: ignore[misc] + sa.Column("feed_id", sa.String(length=36), nullable=False), + sa.Column("published_at", sa.BigInteger(), nullable=False), + sa.Column("language", sa.String(length=10), nullable=False, server_default=""), + sa.Column("word_count", sa.Integer(), nullable=False, server_default="0"), + sa.Column("author", sa.String(length=200), nullable=False, server_default=""), + sa.ForeignKeyConstraint(["id"], ["entries.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_entry_embeddings_feed_id", "entry_embeddings", ["feed_id"], unique=False) + op.create_index( + "ix_entry_embeddings_published_at", + "entry_embeddings", + ["published_at"], + unique=False, + ) + # HNSW index requires fixed-dimension vector columns. We intentionally keep + # vector columns dimension-agnostic here to support model switches and handle + # optional index creation at runtime. 
+ + op.create_table( + "user_preference_vectors", + sa.Column("id", sa.String(length=50), nullable=False), + sa.Column("user_id", sa.String(length=36), nullable=False), + sa.Column("vector_type", sa.String(length=20), nullable=False), + sa.Column("embedding", Vector(), nullable=False), # type: ignore[misc] + sa.Column("sample_count", sa.Float(), nullable=False), + sa.Column("updated_at", sa.BigInteger(), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("user_id", "vector_type", name="uq_user_vector_type"), + ) + op.create_index( + "ix_user_preference_vectors_user_id", + "user_preference_vectors", + ["user_id"], + unique=False, + ) + + op.create_table( + "vector_store_metadata", + sa.Column("name", sa.String(length=50), nullable=False), + sa.Column("model_signature", sa.String(length=255), nullable=False), + sa.Column("updated_at", sa.BigInteger(), nullable=False), + sa.PrimaryKeyConstraint("name"), + ) + + +def downgrade() -> None: + # Mirror upgrade(): only the pgvector backend ever created this schema. + # Use IF EXISTS throughout so the downgrade is a safe no-op when the tables + # were never provisioned (e.g. upgrade() ran while VECTOR_BACKEND was milvus). + if not _is_pgvector_backend(): + return + + # Dropping each table also removes its associated indexes. 
+ op.execute("DROP TABLE IF EXISTS vector_store_metadata") + op.execute("DROP TABLE IF EXISTS user_preference_vectors") + op.execute("DROP TABLE IF EXISTS entry_embeddings") diff --git a/backend/packages/database/glean_database/models/__init__.py b/backend/packages/database/glean_database/models/__init__.py index 2cdf4132..d43b5572 100644 --- a/backend/packages/database/glean_database/models/__init__.py +++ b/backend/packages/database/glean_database/models/__init__.py @@ -9,6 +9,7 @@ from .base import Base, TimestampMixin from .bookmark import Bookmark from .entry import Entry +from .entry_embedding import EntryEmbedding from .feed import Feed, FeedStatus from .folder import Folder, FolderType from .junction import BookmarkFolder, BookmarkTag, UserEntryTag @@ -18,6 +19,8 @@ from .user_auth_provider import UserAuthProvider from .user_entry import UserEntry from .user_preference_stats import UserPreferenceStats +from .user_preference_vector import UserPreferenceVector +from .vector_store_metadata import VectorStoreMetadata __all__ = [ "Base", @@ -27,6 +30,7 @@ "Feed", "FeedStatus", "Entry", + "EntryEmbedding", "Subscription", "UserEntry", "AdminUser", @@ -42,6 +46,8 @@ "UserEntryTag", # M3 models "UserPreferenceStats", + "UserPreferenceVector", + "VectorStoreMetadata", # MCP models "APIToken", ] diff --git a/backend/packages/database/glean_database/models/entry_embedding.py b/backend/packages/database/glean_database/models/entry_embedding.py new file mode 100644 index 00000000..3413fbd1 --- /dev/null +++ b/backend/packages/database/glean_database/models/entry_embedding.py @@ -0,0 +1,29 @@ +""" +EntryEmbedding model definition. + +Stores entry vectors in PostgreSQL pgvector backend. 
+""" + +from pgvector.sqlalchemy import Vector +from sqlalchemy import BIGINT, ForeignKey, Integer, String +from sqlalchemy.orm import Mapped, mapped_column + +from .base import Base + + +class EntryEmbedding(Base): + """Entry embedding table for pgvector backend.""" + + __tablename__ = "entry_embeddings" + + id: Mapped[str] = mapped_column( + String(36), + ForeignKey("entries.id", ondelete="CASCADE"), + primary_key=True, + ) + embedding: Mapped[list[float]] = mapped_column(Vector()) # type: ignore[misc,valid-type] + feed_id: Mapped[str] = mapped_column(String(36), nullable=False, index=True) + published_at: Mapped[int] = mapped_column(BIGINT, nullable=False, index=True) + language: Mapped[str] = mapped_column(String(10), nullable=False, default="") + word_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0) + author: Mapped[str] = mapped_column(String(200), nullable=False, default="") diff --git a/backend/packages/database/glean_database/models/user_preference_vector.py b/backend/packages/database/glean_database/models/user_preference_vector.py new file mode 100644 index 00000000..a72f6b41 --- /dev/null +++ b/backend/packages/database/glean_database/models/user_preference_vector.py @@ -0,0 +1,26 @@ +""" +UserPreferenceVector model definition. + +Stores user preference vectors in PostgreSQL pgvector backend. 
+""" + +from pgvector.sqlalchemy import Vector +from sqlalchemy import BIGINT, String, UniqueConstraint +from sqlalchemy.orm import Mapped, mapped_column + +from .base import Base + + +class UserPreferenceVector(Base): + """User preference vectors for pgvector backend.""" + + __tablename__ = "user_preference_vectors" + + id: Mapped[str] = mapped_column(String(50), primary_key=True) + user_id: Mapped[str] = mapped_column(String(36), nullable=False, index=True) + vector_type: Mapped[str] = mapped_column(String(20), nullable=False) + embedding: Mapped[list[float]] = mapped_column(Vector()) # type: ignore[misc,valid-type] + sample_count: Mapped[float] = mapped_column(nullable=False) + updated_at: Mapped[int] = mapped_column(BIGINT, nullable=False) + + __table_args__ = (UniqueConstraint("user_id", "vector_type", name="uq_user_vector_type"),) diff --git a/backend/packages/database/glean_database/models/vector_store_metadata.py b/backend/packages/database/glean_database/models/vector_store_metadata.py new file mode 100644 index 00000000..aeb01a06 --- /dev/null +++ b/backend/packages/database/glean_database/models/vector_store_metadata.py @@ -0,0 +1,20 @@ +""" +VectorStoreMetadata model definition. + +Stores backend metadata like active model signature for vector tables. 
+""" + +from sqlalchemy import BIGINT, String +from sqlalchemy.orm import Mapped, mapped_column + +from .base import Base + + +class VectorStoreMetadata(Base): + """Metadata table for vector backend compatibility checks.""" + + __tablename__ = "vector_store_metadata" + + name: Mapped[str] = mapped_column(String(50), primary_key=True) + model_signature: Mapped[str] = mapped_column(String(255), nullable=False) + updated_at: Mapped[int] = mapped_column(BIGINT, nullable=False) diff --git a/backend/packages/database/pyproject.toml b/backend/packages/database/pyproject.toml index 8b3924b9..9b8b887c 100644 --- a/backend/packages/database/pyproject.toml +++ b/backend/packages/database/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.11" dependencies = [ "sqlalchemy[asyncio]>=2.0.0", "asyncpg>=0.29.0", + "pgvector>=0.3.6", "alembic>=1.13.0", "greenlet>=3.0.0", ] diff --git a/backend/packages/vector/glean_vector/clients/__init__.py b/backend/packages/vector/glean_vector/clients/__init__.py index a89600ab..b94fdab4 100644 --- a/backend/packages/vector/glean_vector/clients/__init__.py +++ b/backend/packages/vector/glean_vector/clients/__init__.py @@ -2,5 +2,13 @@ from glean_vector.clients.embedding_client import EmbeddingClient from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.pgvector_client import PgVectorClient +from glean_vector.clients.vector_store import VectorStoreClient, create_vector_store_client -__all__ = ["EmbeddingClient", "MilvusClient"] +__all__ = [ + "EmbeddingClient", + "MilvusClient", + "PgVectorClient", + "VectorStoreClient", + "create_vector_store_client", +] diff --git a/backend/packages/vector/glean_vector/clients/milvus_client.py b/backend/packages/vector/glean_vector/clients/milvus_client.py index 965ed770..f7f33ef1 100644 --- a/backend/packages/vector/glean_vector/clients/milvus_client.py +++ b/backend/packages/vector/glean_vector/clients/milvus_client.py @@ -2,7 +2,7 @@ import asyncio from contextlib import 
suppress -from datetime import datetime +from datetime import UTC, datetime from typing import Any from pymilvus import ( @@ -65,7 +65,7 @@ def _build_model_signature(provider: str, model: str, dimension: int) -> str: return f"{provider}:{model}:{dimension}" @staticmethod - def _extract_model_signature(collection: Collection) -> str | None: + def extract_model_signature(collection: Collection) -> str | None: """ Extract model signature from collection description. @@ -121,8 +121,8 @@ def check_model_compatibility( try: if utility.has_collection(self.config.entries_collection): # type: ignore[truthy-function] collection = Collection(self.config.entries_collection) - existing_signature = self._extract_model_signature(collection) - if existing_signature and existing_signature != expected_signature: + existing_signature = self.extract_model_signature(collection) + if existing_signature != expected_signature: return ( False, f"Entries collection signature mismatch: " @@ -138,8 +138,8 @@ def check_model_compatibility( try: if utility.has_collection(self.config.prefs_collection): # type: ignore[truthy-function] collection = Collection(self.config.prefs_collection) - existing_signature = self._extract_model_signature(collection) - if existing_signature and existing_signature != expected_signature: + existing_signature = self.extract_model_signature(collection) + if existing_signature != expected_signature: return ( False, f"Preferences collection signature mismatch: " @@ -220,8 +220,8 @@ async def ensure_collections( # Check if model has changed if expected_signature: - existing_signature = self._extract_model_signature(collection) - if existing_signature and existing_signature != expected_signature: + existing_signature = self.extract_model_signature(collection) + if existing_signature != expected_signature: # Model changed - recreate collections await self.recreate_collections(dimension, provider, model) return @@ -237,8 +237,8 @@ async def ensure_collections( # Check if 
model has changed if expected_signature: - existing_signature = self._extract_model_signature(collection) - if existing_signature and existing_signature != expected_signature: + existing_signature = self.extract_model_signature(collection) + if existing_signature != expected_signature: # Model changed - recreate collections await self.recreate_collections(dimension, provider, model) return @@ -315,7 +315,7 @@ def _create_entries_collection( if utility.has_collection(self.config.entries_collection): # type: ignore[truthy-function] existing_collection = Collection(self.config.entries_collection) if provider and model: - existing_signature = self._extract_model_signature(existing_collection) + existing_signature = self.extract_model_signature(existing_collection) expected_signature = self._build_model_signature(provider, model, dimension) if existing_signature == expected_signature: existing_collection.load() # type: ignore[unused-coroutine] @@ -387,7 +387,7 @@ def _create_user_preferences_collection( if utility.has_collection(self.config.prefs_collection): # type: ignore[truthy-function] existing_collection = Collection(self.config.prefs_collection) if provider and model: - existing_signature = self._extract_model_signature(existing_collection) + existing_signature = self.extract_model_signature(existing_collection) expected_signature = self._build_model_signature(provider, model, dimension) if existing_signature == expected_signature: existing_collection.load() # type: ignore[unused-coroutine] @@ -502,7 +502,7 @@ async def insert_entry_embedding( raise RuntimeError("Collections not initialized. 
Call ensure_collections() first.") published_ts = ( - int(published_at.timestamp()) if published_at else int(datetime.now().timestamp()) + int(published_at.timestamp()) if published_at else int(datetime.now(UTC).timestamp()) ) # Delete existing entry if present (upsert pattern) diff --git a/backend/packages/vector/glean_vector/clients/pgvector_client.py b/backend/packages/vector/glean_vector/clients/pgvector_client.py new file mode 100644 index 00000000..46a10580 --- /dev/null +++ b/backend/packages/vector/glean_vector/clients/pgvector_client.py @@ -0,0 +1,443 @@ +"""pgvector client for vector operations.""" + +from __future__ import annotations + +import os +from datetime import UTC, datetime +from typing import Any + +from sqlalchemy import BIGINT, Column, Float, Integer, MetaData, String, Table, and_, delete, select +from sqlalchemy.dialects.postgresql import insert +from sqlalchemy.ext.asyncio import ( + AsyncEngine, + AsyncSession, + async_sessionmaker, + create_async_engine, +) + +from glean_core import get_logger +from glean_vector.config import pgvector_config + +try: + from pgvector.sqlalchemy import Vector +except ImportError: # pragma: no cover + Vector = None # type: ignore[assignment,misc] + +logger = get_logger(__name__) + + +def _quote_ident(name: str) -> str: + """Return a properly double-quoted PostgreSQL identifier.""" + return '"' + name.replace('"', '""') + '"' + + +class PgVectorClient: + """pgvector-backed vector store client.""" + + def __init__(self) -> None: + self.config = pgvector_config + self._connected = False + self._engine: AsyncEngine | None = None + self._session_maker: async_sessionmaker[AsyncSession] | None = None + self._metadata = MetaData() + self._entries_table: Table | None = None + self._prefs_table: Table | None = None + self._meta_table: Table | None = None + # Caching flags to avoid repeated DDL / metadata queries + self._schema_ensured = False + self._last_model_signature: str | None = None + + @staticmethod + def 
_build_model_signature(provider: str, model: str, dimension: int) -> str: + return f"{provider}:{model}:{dimension}" + + @property + def _database_url(self) -> str: + if self.config.database_url: + return self.config.database_url + return os.getenv("DATABASE_URL", "") + + def _ensure_connected(self) -> None: + if not self._connected: + self.connect() + + def connect(self) -> None: + """Create SQLAlchemy engine/session factory for pgvector operations.""" + if self._connected and self._engine and self._session_maker: + return + if Vector is None: + raise ConnectionError("pgvector package is not installed") + database_url = self._database_url + if not database_url: + raise ConnectionError("PGVECTOR_DATABASE_URL or DATABASE_URL is required") + + self._engine = create_async_engine(database_url, echo=False) + self._session_maker = async_sessionmaker(self._engine, expire_on_commit=False) + self._init_tables() + self._connected = True + + def _init_tables(self) -> None: + if Vector is None: + raise RuntimeError("pgvector package is not installed") + self._metadata = MetaData() + self._entries_table = Table( + self.config.entries_table, + self._metadata, + Column("id", String(36), primary_key=True), + Column("embedding", Vector(), nullable=False), # type: ignore[misc,operator] + Column("feed_id", String(36), nullable=False), + Column("published_at", BIGINT, nullable=False), + Column("language", String(10), nullable=False, server_default=""), + Column("word_count", Integer, nullable=False, server_default="0"), + Column("author", String(200), nullable=False, server_default=""), + ) + self._prefs_table = Table( + self.config.prefs_table, + self._metadata, + Column("id", String(50), primary_key=True), + Column("user_id", String(36), nullable=False), + Column("vector_type", String(20), nullable=False), + Column("embedding", Vector(), nullable=False), # type: ignore[misc,operator] + Column("sample_count", Float, nullable=False), + Column("updated_at", BIGINT, nullable=False), + ) 
+ self._meta_table = Table( + self.config.metadata_table, + self._metadata, + Column("name", String(50), primary_key=True), + Column("model_signature", String(255), nullable=False), + Column("updated_at", BIGINT, nullable=False), + ) + + def disconnect(self) -> None: + """Close connection resources.""" + if self._engine is not None: + self._engine.sync_engine.dispose() + self._connected = False + self._engine = None + self._session_maker = None + + def collections_exist(self) -> bool: + """Whether required vector tables have been verified to exist. + + Returns the cached result from the most recent ``ensure_collections`` + or ``recreate_collections`` call. Before either has been called the + answer is conservatively ``False``. + """ + return self._schema_ensured + + def check_model_compatibility( + self, dimension: int, provider: str, model: str + ) -> tuple[bool, str | None]: + """Check whether existing data is compatible with the given model config. + + Uses the model signature cached by ``ensure_collections``. 
+ """ + if not self._schema_ensured: + return (False, "Collections not yet verified") + expected = self._build_model_signature(provider, model, dimension) + if self._last_model_signature == expected: + return (True, None) + return ( + False, + f"Model signature mismatch: current={self._last_model_signature}, expected={expected}", + ) + + async def _execute(self, statement: Any) -> Any: + if self._session_maker is None: + raise RuntimeError("pgvector client not connected") + async with self._session_maker() as session: + result = await session.execute(statement) + await session.commit() + return result + + async def _load_model_signature(self) -> str | None: + if self._session_maker is None: + raise RuntimeError("pgvector client not connected") + _, _, meta_table = self._tables() + async with self._session_maker() as session: + rows = ( + await session.execute( + select(meta_table.c.name, meta_table.c.model_signature).where( + meta_table.c.name.in_(["entries", "preferences"]) + ) + ) + ).all() + signatures = {str(row.name): str(row.model_signature) for row in rows} + entries_signature = signatures.get("entries") + prefs_signature = signatures.get("preferences") + if entries_signature and entries_signature == prefs_signature: + return entries_signature + return None + + async def _write_model_metadata(self, signature: str) -> None: + _, _, meta_table = self._tables() + now_ts = int(datetime.now(UTC).timestamp()) + await self._execute( + insert(meta_table) + .values(name="entries", model_signature=signature, updated_at=now_ts) + .on_conflict_do_update( + index_elements=["name"], + set_={"model_signature": signature, "updated_at": now_ts}, + ) + ) + await self._execute( + insert(meta_table) + .values(name="preferences", model_signature=signature, updated_at=now_ts) + .on_conflict_do_update( + index_elements=["name"], + set_={"model_signature": signature, "updated_at": now_ts}, + ) + ) + self._last_model_signature = signature + + def _tables(self) -> tuple[Table, Table, 
Table]: + if self._entries_table is None or self._prefs_table is None or self._meta_table is None: + raise RuntimeError("pgvector tables not initialized") + return self._entries_table, self._prefs_table, self._meta_table + + async def ensure_collections( + self, dimension: int, provider: str | None = None, model: str | None = None + ) -> None: + """Ensure pgvector extension/tables/indexes exist. + + Results are cached so that repeated calls within the same client + lifetime skip the DDL checks. The cache is invalidated by + ``recreate_collections``. + """ + self._ensure_connected() + if self._engine is None: + raise RuntimeError("pgvector engine unavailable") + entries_table, prefs_table, _meta_table = self._tables() + + if not self._schema_ensured: + async with self._engine.begin() as conn: + await conn.exec_driver_sql("CREATE EXTENSION IF NOT EXISTS vector") + await conn.run_sync(self._metadata.create_all) + await conn.exec_driver_sql( + f"CREATE INDEX IF NOT EXISTS {_quote_ident('idx_' + entries_table.name + '_feed_id')} " + f"ON {_quote_ident(entries_table.name)} (feed_id)" + ) + await conn.exec_driver_sql( + f"CREATE INDEX IF NOT EXISTS {_quote_ident('idx_' + entries_table.name + '_published_at')} " + f"ON {_quote_ident(entries_table.name)} (published_at)" + ) + await conn.exec_driver_sql( + f"CREATE UNIQUE INDEX IF NOT EXISTS {_quote_ident('idx_' + prefs_table.name + '_user_type')} " + f"ON {_quote_ident(prefs_table.name)} (user_id, vector_type)" + ) + self._schema_ensured = True + + if provider and model and self._last_model_signature is None: + self._last_model_signature = await self._load_model_signature() + + async def recreate_collections( + self, dimension: int, provider: str | None = None, model: str | None = None + ) -> None: + """Clear vector data and re-stamp model metadata for a full rebuild. + + The pgvector tables are owned by Alembic and use a dimension-agnostic + ``Vector`` column, so switching embedding model never requires a schema + change. 
We therefore ``TRUNCATE`` the data instead of ``DROP``/``CREATE`` + so that foreign keys (e.g. ``entry_embeddings.id`` -> ``entries.id`` + ``ON DELETE CASCADE``) and indexes survive a rebuild and the live schema + does not drift from the migration history. + """ + self._ensure_connected() + if self._engine is None: + raise RuntimeError("pgvector engine unavailable") + # Reset the signature cache; ensure_collections recreates any missing + # tables/indexes for deployments not managed by Alembic migrations. + self._last_model_signature = None + await self.ensure_collections(dimension) + entries_table, prefs_table, meta_table = self._tables() + async with self._engine.begin() as conn: + await conn.exec_driver_sql( + f"TRUNCATE TABLE {_quote_ident(entries_table.name)}, " + f"{_quote_ident(prefs_table.name)}, {_quote_ident(meta_table.name)}" + ) + if provider and model: + signature = self._build_model_signature(provider, model, dimension) + await self._write_model_metadata(signature) + + async def insert_entry_embedding( + self, + entry_id: str, + embedding: list[float], + feed_id: str, + published_at: datetime | None = None, + language: str = "", + word_count: int = 0, + author: str = "", + ) -> None: + self._ensure_connected() + entries_table, _, _ = self._tables() + published_ts = ( + int(published_at.timestamp()) if published_at else int(datetime.now(UTC).timestamp()) + ) + stmt = ( + insert(entries_table) + .values( + id=entry_id, + embedding=embedding, + feed_id=feed_id, + published_at=published_ts, + language=language or "", + word_count=word_count, + author=author or "", + ) + .on_conflict_do_update( + index_elements=["id"], + set_={ + "embedding": embedding, + "feed_id": feed_id, + "published_at": published_ts, + "language": language or "", + "word_count": word_count, + "author": author or "", + }, + ) + ) + await self._execute(stmt) + + async def get_entry_embedding(self, entry_id: str) -> list[float] | None: + self._ensure_connected() + if self._session_maker 
is None: + raise RuntimeError("pgvector client not connected") + entries_table, _, _ = self._tables() + async with self._session_maker() as session: + row = ( + await session.execute( + select(entries_table.c.embedding).where(entries_table.c.id == entry_id) + ) + ).first() + if not row: + return None + return list(row[0]) if row[0] is not None else None + + async def batch_get_entry_embeddings(self, entry_ids: list[str]) -> dict[str, list[float]]: + self._ensure_connected() + if not entry_ids: + return {} + if self._session_maker is None: + raise RuntimeError("pgvector client not connected") + entries_table, _, _ = self._tables() + async with self._session_maker() as session: + rows = ( + await session.execute( + select(entries_table.c.id, entries_table.c.embedding).where( + entries_table.c.id.in_(entry_ids) + ) + ) + ).all() + return {row[0]: list(row[1]) for row in rows} + + async def search_similar_entries( + self, + query_vector: list[float], + top_k: int = 10, + filters: dict[str, Any] | None = None, + ) -> list[dict[str, Any]]: + self._ensure_connected() + if self._session_maker is None: + raise RuntimeError("pgvector client not connected") + entries_table, _, _ = self._tables() + distance = entries_table.c.embedding.cosine_distance(query_vector) # type: ignore[union-attr] + stmt = select( + entries_table.c.id, + entries_table.c.feed_id, + entries_table.c.published_at, + entries_table.c.author, + distance.label("distance"), + ) + if filters: + clauses = [] + if "feed_id" in filters: + clauses.append(entries_table.c.feed_id == filters["feed_id"]) + if "min_published_at" in filters: + min_ts = int(filters["min_published_at"].timestamp()) + clauses.append(entries_table.c.published_at >= min_ts) + if clauses: + stmt = stmt.where(and_(*clauses)) + stmt = stmt.order_by(distance).limit(top_k) + async with self._session_maker() as session: + rows = (await session.execute(stmt)).all() + return [ + { + "id": row.id, + "score": 1.0 - float(row.distance), + "feed_id": 
row.feed_id, + "published_at": row.published_at, + "author": row.author, + } + for row in rows + ] + + async def upsert_user_preference( + self, + user_id: str, + vector_type: str, + embedding: list[float], + sample_count: float, + updated_at: int, + ) -> None: + self._ensure_connected() + _, prefs_table, _ = self._tables() + pref_id = f"{user_id}_{vector_type}" + stmt = ( + insert(prefs_table) + .values( + id=pref_id, + user_id=user_id, + vector_type=vector_type, + embedding=embedding, + sample_count=sample_count, + updated_at=updated_at, + ) + .on_conflict_do_update( + index_elements=["id"], + set_={ + "user_id": user_id, + "vector_type": vector_type, + "embedding": embedding, + "sample_count": sample_count, + "updated_at": updated_at, + }, + ) + ) + await self._execute(stmt) + + async def get_user_preferences(self, user_id: str) -> dict[str, dict[str, Any]]: + self._ensure_connected() + if self._session_maker is None: + raise RuntimeError("pgvector client not connected") + _, prefs_table, _ = self._tables() + async with self._session_maker() as session: + rows = ( + await session.execute( + select( + prefs_table.c.vector_type, + prefs_table.c.embedding, + prefs_table.c.sample_count, + prefs_table.c.updated_at, + ).where(prefs_table.c.user_id == user_id) + ) + ).all() + return { + row.vector_type: { + "embedding": list(row.embedding), + "sample_count": row.sample_count, + "updated_at": row.updated_at, + } + for row in rows + } + + async def delete_entry_embedding(self, entry_id: str) -> None: + self._ensure_connected() + entries_table, _, _ = self._tables() + await self._execute(delete(entries_table).where(entries_table.c.id == entry_id)) + + async def delete_user_preferences(self, user_id: str) -> None: + self._ensure_connected() + _, prefs_table, _ = self._tables() + await self._execute(delete(prefs_table).where(prefs_table.c.user_id == user_id)) diff --git a/backend/packages/vector/glean_vector/clients/vector_store.py 
b/backend/packages/vector/glean_vector/clients/vector_store.py new file mode 100644 index 00000000..7167635f --- /dev/null +++ b/backend/packages/vector/glean_vector/clients/vector_store.py @@ -0,0 +1,99 @@ +"""Vector store client abstractions and factory.""" + +from __future__ import annotations + +from typing import Any, Protocol + +from glean_vector.config import vector_backend_config + + +class VectorStoreClient(Protocol): + """Backend-agnostic vector store interface.""" + + def connect(self) -> None: + """Establish backend connection or initialize resources.""" + + def disconnect(self) -> None: + """Close backend connection/resources.""" + + async def ensure_collections( + self, dimension: int, provider: str | None = None, model: str | None = None + ) -> None: + """Ensure storage structures exist and match active model config.""" + + async def recreate_collections( + self, dimension: int, provider: str | None = None, model: str | None = None + ) -> None: + """Drop and recreate storage structures for full rebuild.""" + + def check_model_compatibility( + self, dimension: int, provider: str, model: str + ) -> tuple[bool, str | None]: + """Check whether existing vector data is compatible with model config.""" + ... + + def collections_exist(self) -> bool: + """Return whether required vector storage structures already exist.""" + ... + + async def insert_entry_embedding( + self, + entry_id: str, + embedding: list[float], + feed_id: str, + published_at: Any | None = None, + language: str = "", + word_count: int = 0, + author: str = "", + ) -> None: + """Insert or update an entry embedding.""" + + async def get_entry_embedding(self, entry_id: str) -> list[float] | None: + """Get embedding for one entry.""" + + async def batch_get_entry_embeddings(self, entry_ids: list[str]) -> dict[str, list[float]]: + """Get embeddings for multiple entries.""" + ... 
+ + async def search_similar_entries( + self, + query_vector: list[float], + top_k: int = 10, + filters: dict[str, Any] | None = None, + ) -> list[dict[str, Any]]: + """Search similar entries with optional filters.""" + ... + + async def upsert_user_preference( + self, + user_id: str, + vector_type: str, + embedding: list[float], + sample_count: float, + updated_at: int, + ) -> None: + """Insert or update a user preference vector.""" + + async def get_user_preferences(self, user_id: str) -> dict[str, dict[str, Any]]: + """Get all preference vectors for a user.""" + ... + + async def delete_entry_embedding(self, entry_id: str) -> None: + """Delete one entry embedding.""" + + async def delete_user_preferences(self, user_id: str) -> None: + """Delete all preference vectors for a user.""" + + +def create_vector_store_client() -> VectorStoreClient: + """Create vector store client from configured backend.""" + backend = vector_backend_config.backend.lower() + if backend == "milvus": + from glean_vector.clients.milvus_client import MilvusClient + + return MilvusClient() + if backend == "pgvector": + from glean_vector.clients.pgvector_client import PgVectorClient + + return PgVectorClient() + raise ValueError(f"Unsupported vector backend: {vector_backend_config.backend}") diff --git a/backend/packages/vector/glean_vector/config.py b/backend/packages/vector/glean_vector/config.py index a55f0757..0f8b7013 100644 --- a/backend/packages/vector/glean_vector/config.py +++ b/backend/packages/vector/glean_vector/config.py @@ -1,8 +1,9 @@ """Configuration for vector services.""" from pathlib import Path -from typing import Any +from typing import Any, Literal +from pydantic import field_validator from pydantic_settings import BaseSettings, SettingsConfigDict # Find .env file in project root @@ -49,6 +50,43 @@ class MilvusConfig(BaseSettings): prefs_collection: str = "user_preferences" +class VectorBackendConfig(BaseSettings): + """Vector backend selector configuration.""" + + 
model_config = SettingsConfigDict( + env_prefix="VECTOR_", + env_file=str(_env_file) if _env_file.exists() else None, + env_file_encoding="utf-8", + extra="ignore", + ) + + backend: Literal["milvus", "pgvector"] = "milvus" + + @field_validator("backend", mode="before") + @classmethod + def validate_backend(cls, value: str) -> str: + backend = str(value).lower() + if backend not in {"milvus", "pgvector"}: + raise ValueError("VECTOR_BACKEND must be either 'milvus' or 'pgvector'") + return backend + + +class PgVectorConfig(BaseSettings): + """pgvector backend configuration.""" + + model_config = SettingsConfigDict( + env_prefix="PGVECTOR_", + env_file=str(_env_file) if _env_file.exists() else None, + env_file_encoding="utf-8", + extra="ignore", + ) + + database_url: str = "" + entries_table: str = "entry_embeddings" + prefs_table: str = "user_preference_vectors" + metadata_table: str = "vector_store_metadata" + + class PreferenceConfig(BaseSettings): """Preference calculation configuration.""" @@ -85,6 +123,8 @@ class ScoreConfig(BaseSettings): # Global config instances embedding_config = EmbeddingConfig() milvus_config = MilvusConfig() +vector_backend_config = VectorBackendConfig() +pgvector_config = PgVectorConfig() preference_config = PreferenceConfig() score_config = ScoreConfig() diff --git a/backend/packages/vector/glean_vector/services/embedding_service.py b/backend/packages/vector/glean_vector/services/embedding_service.py index d53a5169..e8a64778 100644 --- a/backend/packages/vector/glean_vector/services/embedding_service.py +++ b/backend/packages/vector/glean_vector/services/embedding_service.py @@ -1,7 +1,8 @@ """Embedding generation service.""" +import contextlib import re -from datetime import datetime +from datetime import UTC, datetime from sqlalchemy import select, update from sqlalchemy.ext.asyncio import AsyncSession @@ -9,7 +10,7 @@ from glean_core import get_logger from glean_database.models import Entry from glean_vector.clients.embedding_client 
import EmbeddingClient -from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.vector_store import VectorStoreClient logger = get_logger(__name__) @@ -21,7 +22,7 @@ class EmbeddingService: Handles the complete embedding lifecycle: 1. Extract text from entry 2. Generate embedding via API - 3. Store in Milvus + 3. Store in vector backend 4. Update entry status in PostgreSQL """ @@ -29,7 +30,7 @@ def __init__( self, db_session: AsyncSession, embedding_client: EmbeddingClient, - milvus_client: MilvusClient, + vector_client: VectorStoreClient, ) -> None: """ Initialize embedding service. @@ -37,11 +38,11 @@ def __init__( Args: db_session: Database session embedding_client: Embedding API client - milvus_client: Milvus vector database client + vector_client: Vector database client """ self.db = db_session self.embedding_client = embedding_client - self.milvus = milvus_client + self.vector_client = vector_client def _extract_text(self, entry: Entry) -> str: """ @@ -105,6 +106,10 @@ async def generate_embedding(self, entry_id: str) -> bool: entry = result.scalar_one_or_none() if not entry: + logger.warning( + "Entry not found for embedding generation", + extra={"entry_id": entry_id}, + ) return False # Skip if already processed @@ -119,23 +124,23 @@ async def generate_embedding(self, entry_id: str) -> bool: ) await self.db.flush() - try: - # Extract text - text = self._extract_text(entry) - word_count = self._calculate_word_count(text) + # Extract text (content-level issue → return False, no exception) + text = self._extract_text(entry) + word_count = self._calculate_word_count(text) - if not text: - await self._mark_failed(entry_id, "No text content to embed") - return False + if not text: + await self._mark_failed(entry_id, "No text content to embed") + return False - # Generate embedding + try: + # Generate embedding via provider API embedding, _ = await self.embedding_client.generate_embedding(text) # Detect language (simple heuristic) 
language = self._detect_language(text) - # Store in Milvus - await self.milvus.insert_entry_embedding( + # Store in vector backend + await self.vector_client.insert_entry_embedding( entry_id=entry_id, embedding=embedding, feed_id=entry.feed_id, @@ -151,7 +156,7 @@ async def generate_embedding(self, entry_id: str) -> bool: .where(Entry.id == entry_id) .values( embedding_status="done", - embedding_at=datetime.now(), + embedding_at=datetime.now(UTC), word_count=word_count, embedding_error=None, ) @@ -161,9 +166,11 @@ async def generate_embedding(self, entry_id: str) -> bool: return True except Exception as e: + # Infrastructure error (API / vector backend) – mark the entry + # as failed but re-raise so the worker circuit breaker can react. logger.error(f"Failed to generate embedding for entry {entry_id}: {e}") - await self._mark_failed(entry_id, str(e)) - return False + await self._mark_failed_safe(entry_id, str(e)) + raise async def _mark_failed(self, entry_id: str, error: str) -> None: """Mark entry embedding as failed.""" @@ -174,6 +181,30 @@ async def _mark_failed(self, entry_id: str, error: str) -> None: ) await self.db.flush() + async def _mark_failed_safe(self, entry_id: str, error: str) -> None: + """Mark entry as failed, recovering from a poisoned session if needed. + + When a prior flush/execute fails, asyncpg puts the connection into a + failed-transaction state where all subsequent SQL is rejected. This + helper rolls back the aborted transaction before writing the UPDATE so + that the failure is recorded even after an infrastructure error. + + Side-effect: a rollback discards any unflushed changes on the session + (the caller should commit per-entry to minimise data loss). 
+ """ + try: + await self.db.rollback() + await self._mark_failed(entry_id, error) + except Exception: + try: + await self.db.rollback() + await self._mark_failed(entry_id, error) + except Exception: + logger.warning( + "Could not mark entry as failed after session recovery", + extra={"entry_id": entry_id}, + ) + def _detect_language(self, text: str) -> str: """ Simple language detection. @@ -201,29 +232,62 @@ async def batch_generate(self, limit: int = 100) -> dict[str, int]: """ Generate embeddings for pending entries in batch. + Uses SELECT FOR UPDATE SKIP LOCKED so that concurrent workers each + claim disjoint sets of entries and no entry is processed twice (or + left permanently pending because every job grabbed the same rows). + + Each entry is committed independently so that a single failure + doesn't poison the session for remaining entries. + Args: limit: Maximum number of entries to process Returns: Dictionary with processed and failed counts """ - # Get pending entries - result = await self.db.execute( - select(Entry) + # Atomically claim pending entries. Rows locked by another concurrent + # worker are skipped, ensuring each worker gets a unique slice. + claim_result = await self.db.execute( + select(Entry.id) .where(Entry.embedding_status == "pending") .order_by(Entry.created_at.desc()) .limit(limit) + .with_for_update(skip_locked=True) + ) + entry_ids = [row[0] for row in claim_result.all()] + + if not entry_ids: + return {"processed": 0, "failed": 0} + + # Mark all claimed entries as processing and commit to make the + # claim durable. This also releases the FOR UPDATE locks; since + # the status is now 'processing', no other worker will pick them up. 
+ await self.db.execute( + update(Entry) + .where(Entry.id.in_(entry_ids)) + .values(embedding_status="processing", embedding_error=None) ) - entries = result.scalars().all() + await self.db.commit() processed = 0 failed = 0 - for entry in entries: - success = await self.generate_embedding(entry.id) - if success: - processed += 1 - else: + for entry_id in entry_ids: + try: + success = await self.generate_embedding(entry_id) + await self.db.commit() + if success: + processed += 1 + else: + failed += 1 + except Exception: + # Infrastructure error. generate_embedding already tried + # _mark_failed_safe (which may have rolled back). Ensure + # any pending changes are either committed or rolled back + # so the next iteration starts with a clean session. + with contextlib.suppress(Exception): + await self.db.rollback() + await self._try_mark_failed_and_commit(entry_id, "Infrastructure error") failed += 1 return {"processed": processed, "failed": failed} @@ -232,33 +296,62 @@ async def retry_failed(self, limit: int = 50) -> dict[str, int]: """ Retry failed embeddings. + Uses SELECT FOR UPDATE SKIP LOCKED to avoid concurrent retriers + picking up the same entries. Each entry is committed independently. 
+ Args: limit: Maximum number of entries to retry Returns: Dictionary with processed and failed counts """ - # Get failed entries (not recently failed) - result = await self.db.execute( - select(Entry) + claim_result = await self.db.execute( + select(Entry.id) .where(Entry.embedding_status == "failed") .order_by(Entry.updated_at.asc()) .limit(limit) + .with_for_update(skip_locked=True) + ) + entry_ids = [row[0] for row in claim_result.all()] + + if not entry_ids: + return {"processed": 0, "failed": 0} + + await self.db.execute( + update(Entry) + .where(Entry.id.in_(entry_ids)) + .values(embedding_status="processing", embedding_error=None) ) - entries = result.scalars().all() + await self.db.commit() processed = 0 failed = 0 - for entry in entries: - success = await self.generate_embedding(entry.id) - if success: - processed += 1 - else: + for entry_id in entry_ids: + try: + success = await self.generate_embedding(entry_id) + await self.db.commit() + if success: + processed += 1 + else: + failed += 1 + except Exception: + with contextlib.suppress(Exception): + await self.db.rollback() + await self._try_mark_failed_and_commit(entry_id, "Infrastructure error (retry)") failed += 1 return {"processed": processed, "failed": failed} + async def _try_mark_failed_and_commit(self, entry_id: str, error: str) -> None: + """Best-effort: mark entry failed and commit in a fresh transaction.""" + try: + await self._mark_failed(entry_id, error) + await self.db.commit() + except Exception: + with contextlib.suppress(Exception): + await self.db.rollback() + async def delete_embedding(self, entry_id: str) -> None: """ Delete embedding for an entry. 
@@ -266,8 +359,8 @@ async def delete_embedding(self, entry_id: str) -> None: Args: entry_id: Entry UUID """ - # Delete from Milvus - await self.milvus.delete_entry_embedding(entry_id) + # Delete from vector backend + await self.vector_client.delete_entry_embedding(entry_id) # Update entry status await self.db.execute( diff --git a/backend/packages/vector/glean_vector/services/preference_service.py b/backend/packages/vector/glean_vector/services/preference_service.py index 97b436d7..5832f960 100644 --- a/backend/packages/vector/glean_vector/services/preference_service.py +++ b/backend/packages/vector/glean_vector/services/preference_service.py @@ -1,7 +1,7 @@ """User preference model service.""" import contextlib -from datetime import datetime +from datetime import UTC, datetime import numpy as np from redis.asyncio import Redis @@ -10,7 +10,7 @@ from glean_core import RedisKeys from glean_database.models import Entry, UserEntry, UserPreferenceStats -from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.vector_store import VectorStoreClient from glean_vector.config import preference_config @@ -33,7 +33,7 @@ class PreferenceService: def __init__( self, db_session: AsyncSession, - milvus_client: MilvusClient, + vector_client: VectorStoreClient, redis_client: Redis | None = None, ) -> None: """ @@ -41,11 +41,11 @@ def __init__( Args: db_session: Database session - milvus_client: Milvus vector database client + vector_client: Vector database client redis_client: Redis client for distributed locks (optional but recommended) """ self.db = db_session - self.milvus = milvus_client + self.vector_client = vector_client self.redis = redis_client self.config = preference_config @@ -67,8 +67,8 @@ async def handle_preference_signal( weight = self.SIGNAL_WEIGHTS[signal_type] - # Get entry embedding from Milvus - embedding = await self.milvus.get_entry_embedding(entry_id) + # Get entry embedding from vector backend + embedding = await 
self.vector_client.get_entry_embedding(entry_id) if not embedding: # Entry not yet embedded, skip preference update return @@ -162,7 +162,7 @@ async def _update_preference_vector_locked( abs_weight: Absolute value of weight """ # Get current preference vectors - prefs = await self.milvus.get_user_preferences(user_id) + prefs = await self.vector_client.get_user_preferences(user_id) current = prefs.get(vector_type) if current is None: @@ -186,13 +186,13 @@ async def _update_preference_vector_locked( if norm > 1e-8: new_embedding = new_embedding / norm - # Store in Milvus - await self.milvus.upsert_user_preference( + # Store in vector backend + await self.vector_client.upsert_user_preference( user_id=user_id, vector_type=vector_type, embedding=new_embedding.tolist(), sample_count=new_count, - updated_at=int(datetime.now().timestamp()), + updated_at=int(datetime.now(UTC).timestamp()), ) async def _update_affinity_stats( @@ -263,8 +263,8 @@ async def rebuild_from_history(self, user_id: str) -> None: Args: user_id: User UUID """ - # Clear existing preferences from Milvus - await self.milvus.delete_user_preferences(user_id) + # Clear existing preferences from vector backend + await self.vector_client.delete_user_preferences(user_id) # Delete existing stats from database await self.db.execute( diff --git a/backend/packages/vector/glean_vector/services/score_service.py b/backend/packages/vector/glean_vector/services/score_service.py index 7360ee8a..aa285a0a 100644 --- a/backend/packages/vector/glean_vector/services/score_service.py +++ b/backend/packages/vector/glean_vector/services/score_service.py @@ -7,7 +7,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from glean_database.models import Entry, UserPreferenceStats -from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.vector_store import VectorStoreClient from glean_vector.config import preference_config @@ -22,17 +22,17 @@ class ScoreService: def __init__( self, db_session: 
AsyncSession, - milvus_client: MilvusClient, + vector_client: VectorStoreClient, ) -> None: """ Initialize score service. Args: db_session: Database session - milvus_client: Milvus vector database client + vector_client: Vector database client """ self.db = db_session - self.milvus = milvus_client + self.vector_client = vector_client self.pref_config = preference_config self._user_stats_cache: dict[str, UserPreferenceStats | None] = {} @@ -62,8 +62,8 @@ async def calculate_score( Returns: Dictionary with score and factors """ - # Get entry embedding from Milvus - embedding = await self.milvus.get_entry_embedding(entry_id) + # Get entry embedding from vector backend + embedding = await self.vector_client.get_entry_embedding(entry_id) if not embedding: # Entry not embedded yet, return default return { @@ -82,7 +82,7 @@ async def calculate_score( } # Get user preferences - prefs = await self.milvus.get_user_preferences(user_id) + prefs = await self.vector_client.get_user_preferences(user_id) if not prefs.get("positive") and not prefs.get("negative"): # No preference model yet @@ -194,7 +194,7 @@ async def batch_calculate_scores( scores: dict[str, float] = {} # Get user preferences once - prefs = await self.milvus.get_user_preferences(user_id) + prefs = await self.vector_client.get_user_preferences(user_id) # Get affinity stats once (and cache for any subsequent individual calls) stats = await self._get_user_stats(user_id) @@ -225,7 +225,7 @@ async def batch_calculate_scores( # Get all entry embeddings in batch entry_ids = [entry.id for entry in entries] - embeddings = await self.milvus.batch_get_entry_embeddings(entry_ids) + embeddings = await self.vector_client.batch_get_entry_embeddings(entry_ids) # Calculate scores for entry in entries: diff --git a/backend/packages/vector/glean_vector/services/validation_service.py b/backend/packages/vector/glean_vector/services/validation_service.py index 3ded400b..b4e4a3ee 100644 --- 
a/backend/packages/vector/glean_vector/services/validation_service.py +++ b/backend/packages/vector/glean_vector/services/validation_service.py @@ -1,21 +1,29 @@ """ Embedding validation service. -Provides validation for embedding providers and Milvus connections +Provides validation for embedding providers and vector backend connections before enabling vectorization. """ +import os + from glean_core import get_logger from glean_core.schemas.config import EmbeddingConfig, ValidationResult +from glean_vector.config import pgvector_config, vector_backend_config logger = get_logger(__name__) +def _quote_ident(name: str) -> str: + """Return a properly double-quoted PostgreSQL identifier.""" + return '"' + name.replace('"', '""') + '"' + + class EmbeddingValidationService: """ Service for validating embedding configuration. - Tests provider connections and Milvus availability before + Tests provider connections and vector backend availability before enabling vectorization to ensure the system will work correctly. """ @@ -187,8 +195,9 @@ async def validate_milvus( ValidationResult with success status and details. 
""" try: - from pymilvus import connections, utility + from pymilvus import Collection, connections, utility + from glean_vector.clients.milvus_client import MilvusClient from glean_vector.config import milvus_config try: @@ -209,6 +218,31 @@ async def validate_milvus( milvus_config.prefs_collection, using="validation" ) + collections_exist = entries_exists and prefs_exists + expected_signature = None + if provider and model and dimension: + expected_signature = f"{provider}:{model}:{dimension}" + + is_compatible = True + compatibility_reason: str | None = None + model_signatures: dict[str, str | None] = {} + + if expected_signature and collections_exist: + for target_name, collection_name in ( + ("entries", milvus_config.entries_collection), + ("preferences", milvus_config.prefs_collection), + ): + collection = Collection(collection_name, using="validation") + current_signature = MilvusClient.extract_model_signature(collection) + model_signatures[target_name] = current_signature + if current_signature != expected_signature: + is_compatible = False + compatibility_reason = ( + f"{target_name} signature mismatch: " + f"existing={current_signature}, expected={expected_signature}" + ) + break + logger.info("Milvus validation successful") return ValidationResult( success=True, @@ -220,6 +254,11 @@ async def validate_milvus( "entries_collection_exists": entries_exists, "prefs_collection": milvus_config.prefs_collection, "prefs_collection_exists": prefs_exists, + "collections_exist": collections_exist, + "is_compatible": is_compatible, + "compatibility_reason": compatibility_reason, + "expected_signature": expected_signature, + "model_signatures": model_signatures, "dimension": dimension, "provider": provider, "model": model, @@ -251,6 +290,139 @@ async def validate_milvus( }, ) + async def validate_pgvector( + self, + dimension: int | None = None, + provider: str | None = None, + model: str | None = None, + ) -> ValidationResult: + """ + Test pgvector backend 
connection. + """ + try: + from sqlalchemy import text + from sqlalchemy.ext.asyncio import create_async_engine + + database_url = pgvector_config.database_url or os.getenv("DATABASE_URL", "") + if not database_url: + return ValidationResult( + success=False, + message="PGVECTOR_DATABASE_URL or DATABASE_URL is required", + details={}, + ) + + engine = create_async_engine(database_url, echo=False) + try: + async with engine.connect() as conn: + result = await conn.execute( + text("SELECT extname FROM pg_extension WHERE extname='vector'") + ) + has_extension = result.scalar_one_or_none() is not None + + entries_regclass = await conn.execute( + text("SELECT to_regclass(:table_name)"), + {"table_name": pgvector_config.entries_table}, + ) + prefs_regclass = await conn.execute( + text("SELECT to_regclass(:table_name)"), + {"table_name": pgvector_config.prefs_table}, + ) + metadata_regclass = await conn.execute( + text("SELECT to_regclass(:table_name)"), + {"table_name": pgvector_config.metadata_table}, + ) + + entries_exists = entries_regclass.scalar_one_or_none() is not None + prefs_exists = prefs_regclass.scalar_one_or_none() is not None + metadata_exists = metadata_regclass.scalar_one_or_none() is not None + + collections_exist = entries_exists and prefs_exists + expected_signature = None + if provider and model and dimension: + expected_signature = f"{provider}:{model}:{dimension}" + + is_compatible = True + compatibility_reason: str | None = None + model_signatures: dict[str, str] = {} + + if expected_signature and collections_exist: + if metadata_exists: + rows = await conn.exec_driver_sql( + f"SELECT name, model_signature " + f"FROM {_quote_ident(pgvector_config.metadata_table)} " + "WHERE name IN ('entries', 'preferences')" + ) + model_signatures = {str(row[0]): str(row[1]) for row in rows.fetchall()} + for target_name in ("entries", "preferences"): + current_signature = model_signatures.get(target_name) + if not current_signature: + is_compatible = False + 
compatibility_reason = ( + f"Missing model signature for {target_name}" + ) + break + if current_signature != expected_signature: + is_compatible = False + compatibility_reason = ( + f"{target_name} signature mismatch: " + f"existing={current_signature}, expected={expected_signature}" + ) + break + else: + is_compatible = False + compatibility_reason = ( + "Model metadata missing for existing pgvector tables" + ) + + return ValidationResult( + success=True, + message="pgvector connection successful", + details={ + "database_url_configured": True, + "vector_extension_installed": has_extension, + "entries_table_exists": entries_exists, + "prefs_table_exists": prefs_exists, + "metadata_table_exists": metadata_exists, + "collections_exist": collections_exist, + "is_compatible": is_compatible, + "compatibility_reason": compatibility_reason, + "expected_signature": expected_signature, + "model_signatures": model_signatures, + "dimension": dimension, + "provider": provider, + "model": model, + }, + ) + finally: + await engine.dispose() + except Exception as e: + logger.error(f"pgvector validation failed: {e}") + return ValidationResult( + success=False, + message=f"pgvector connection failed: {str(e)}", + details={"error": str(e), "error_type": type(e).__name__}, + ) + + async def validate_vector_backend( + self, + dimension: int | None = None, + provider: str | None = None, + model: str | None = None, + ) -> ValidationResult: + """ + Validate configured vector backend. 
+ """ + backend = vector_backend_config.backend.lower() + if backend == "milvus": + return await self.validate_milvus(dimension, provider, model) + if backend == "pgvector": + return await self.validate_pgvector(dimension, provider, model) + return ValidationResult( + success=False, + message=f"Unsupported vector backend: {vector_backend_config.backend}", + details={"backend": vector_backend_config.backend}, + ) + async def validate_full(self, config: EmbeddingConfig) -> ValidationResult: """ Perform full validation of provider and Milvus. @@ -266,15 +438,19 @@ async def validate_full(self, config: EmbeddingConfig) -> ValidationResult: if not provider_result.success: return provider_result - # Validate Milvus - milvus_result = await self.validate_milvus(config.dimension) - if not milvus_result.success: + # Validate vector backend + backend_result = await self.validate_vector_backend( + config.dimension, + config.provider, + config.model, + ) + if not backend_result.success: return ValidationResult( success=False, - message=f"Milvus validation failed: {milvus_result.message}", + message=f"Vector backend validation failed: {backend_result.message}", details={ "provider_validation": provider_result.details, - "milvus_validation": milvus_result.details, + "backend_validation": backend_result.details, }, ) @@ -283,7 +459,7 @@ async def validate_full(self, config: EmbeddingConfig) -> ValidationResult: message="Full validation successful", details={ "provider": provider_result.details, - "milvus": milvus_result.details, + "backend": backend_result.details, }, ) @@ -300,12 +476,12 @@ async def check_provider_health(self, config: EmbeddingConfig) -> bool: result = await self.validate_provider(config) return result.success - async def check_milvus_health(self) -> bool: + async def check_vector_backend_health(self) -> bool: """ - Quick health check for Milvus. + Quick health check for active vector backend. Returns: - True if Milvus is healthy, False otherwise. 
+ True if backend is healthy, False otherwise. """ - result = await self.validate_milvus() + result = await self.validate_vector_backend() return result.success diff --git a/backend/packages/vector/pyproject.toml b/backend/packages/vector/pyproject.toml index 840200d3..bbe14960 100644 --- a/backend/packages/vector/pyproject.toml +++ b/backend/packages/vector/pyproject.toml @@ -5,6 +5,7 @@ description = "Glean Vector - Embedding and Preference Services" requires-python = ">=3.11" dependencies = [ "pymilvus>=2.6.3", + "pgvector>=0.3.6", "openai>=1.0.0", "numpy>=1.24.0", "httpx>=0.27.0", diff --git a/backend/packages/vector/tests/test_embedding_service.py b/backend/packages/vector/tests/test_embedding_service.py new file mode 100644 index 00000000..0320edf0 --- /dev/null +++ b/backend/packages/vector/tests/test_embedding_service.py @@ -0,0 +1,69 @@ +"""Tests for EmbeddingService transaction handling.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from glean_database.models import Entry +from glean_vector.clients.embedding_client import EmbeddingClient +from glean_vector.services.embedding_service import EmbeddingService + + +@pytest.mark.asyncio +async def test_generate_embedding_rolls_back_before_marking_failed() -> None: + """Roll back a failed DB transaction before writing the failed status.""" + session = AsyncMock(spec=AsyncSession) + embedding_client = AsyncMock(spec=EmbeddingClient) + vector_client = AsyncMock() + + entry = MagicMock(spec=Entry) + entry.id = "entry-1" + entry.title = "Test entry" + entry.readability_content = None + entry.content = "Content to embed" + entry.summary = None + entry.embedding_status = "pending" + entry.feed_id = "feed-1" + entry.published_at = None + entry.author = None + + select_result = MagicMock() + select_result.scalar_one_or_none.return_value = entry + + call_count = 0 + + async def 
execute_side_effect(*args, **kwargs): + nonlocal call_count + call_count += 1 + + if call_count == 1: + return select_result + if call_count == 2: + return MagicMock() + if call_count == 3: + raise SQLAlchemyError("status update failed") + + assert session.rollback.await_count == 1 + return MagicMock() + + session.execute.side_effect = execute_side_effect + session.flush = AsyncMock() + session.rollback = AsyncMock() + + embedding_client.generate_embedding.return_value = ([0.1, 0.2, 0.3], {}) + vector_client.insert_entry_embedding = AsyncMock() + + service = EmbeddingService( + db_session=session, + embedding_client=embedding_client, + vector_client=vector_client, + ) + + with pytest.raises(SQLAlchemyError): + await service.generate_embedding("entry-1") + + session.rollback.assert_awaited_once() + assert session.execute.await_count == 4 + assert session.flush.await_count == 2 diff --git a/backend/packages/vector/tests/test_milvus_error_handling.py b/backend/packages/vector/tests/test_milvus_error_handling.py index 62f18bb0..5143decf 100644 --- a/backend/packages/vector/tests/test_milvus_error_handling.py +++ b/backend/packages/vector/tests/test_milvus_error_handling.py @@ -129,3 +129,25 @@ def has_collection_side_effect(name: str) -> bool: assert is_compatible is False assert reason is not None assert "Preferences collection" in reason + + def test_check_model_compatibility_missing_signature_requires_rebuild(self) -> None: + """Existing collections without model metadata must not be treated as compatible.""" + client = MilvusClient() + + mock_collection = MagicMock() + mock_collection.description = "Entry embeddings without signature" + + with ( + patch.object(client, "connect"), + patch("glean_vector.clients.milvus_client.utility.has_collection", return_value=True), + patch("glean_vector.clients.milvus_client.Collection", return_value=mock_collection), + ): + is_compatible, reason = client.check_model_compatibility( + dimension=1536, + provider="openai", + 
model="text-embedding-3-small", + ) + + assert is_compatible is False + assert reason is not None + assert "existing=None" in reason diff --git a/backend/packages/vector/tests/test_pgvector_client.py b/backend/packages/vector/tests/test_pgvector_client.py new file mode 100644 index 00000000..7fba4b60 --- /dev/null +++ b/backend/packages/vector/tests/test_pgvector_client.py @@ -0,0 +1,126 @@ +"""Tests for PgVectorClient behavior.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from sqlalchemy import BIGINT, Column, Float, Integer, MetaData, String, Table + +from glean_vector.clients.pgvector_client import PgVectorClient + + +def _build_client_tables(client: PgVectorClient) -> None: + metadata = MetaData() + client._metadata = metadata + client._entries_table = Table( + "entry_embeddings", + metadata, + Column("id", String(36), primary_key=True), + Column("embedding", String), + Column("feed_id", String(36)), + Column("published_at", BIGINT), + Column("language", String(10)), + Column("word_count", Integer), + Column("author", String(200)), + ) + client._prefs_table = Table( + "user_preference_vectors", + metadata, + Column("id", String(50), primary_key=True), + Column("user_id", String(36)), + Column("vector_type", String(20)), + Column("embedding", String), + Column("sample_count", Float), + Column("updated_at", BIGINT), + ) + client._meta_table = Table( + "vector_store_metadata", + metadata, + Column("name", String(50), primary_key=True), + Column("model_signature", String(255)), + Column("updated_at", BIGINT), + ) + + +def test_pgvector_client_defaults_require_rebuild() -> None: + """Compatibility checks default to rebuild-required path.""" + client = PgVectorClient() + compatible, reason = client.check_model_compatibility(1536, "openai", "text-embedding-3-small") + assert compatible is False + assert reason is not None + + +def test_pgvector_client_collections_exist_defaults_false() -> None: + """collections_exist 
returns False before backend initialization.""" + client = PgVectorClient() + assert client.collections_exist() is False + + +@pytest.mark.asyncio +async def test_pgvector_client_ensure_collections_is_idempotent() -> None: + """Repeated ensure_collections calls should not rewrite schema metadata.""" + client = PgVectorClient() + client._connected = True + _build_client_tables(client) + + sql_calls: list[str] = [] + + class FakeConn: + async def exec_driver_sql(self, sql: str) -> None: + sql_calls.append(sql) + + async def run_sync(self, fn) -> None: + return None + + class FakeBegin: + async def __aenter__(self) -> FakeConn: + return FakeConn() + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + client._engine = SimpleNamespace(begin=lambda: FakeBegin()) + client._execute = AsyncMock() + client._load_model_signature = AsyncMock(return_value="openai:text-embedding-3-small:1536") + + await client.ensure_collections(1536, "openai", "text-embedding-3-small") + await client.ensure_collections(1536, "openai", "text-embedding-3-small") + + assert len(sql_calls) == 4 + assert client._execute.await_count == 0 + client._load_model_signature.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_pgvector_client_recreate_writes_model_metadata() -> None: + """recreate_collections is the only path that stamps a new model signature.""" + client = PgVectorClient() + client._connected = True + _build_client_tables(client) + + sql_calls: list[str] = [] + + class FakeConn: + async def exec_driver_sql(self, sql: str) -> None: + sql_calls.append(sql) + + async def run_sync(self, fn) -> None: + return None + + class FakeBegin: + async def __aenter__(self) -> FakeConn: + return FakeConn() + + async def __aexit__(self, exc_type, exc, tb) -> None: + return None + + client._engine = SimpleNamespace(begin=lambda: FakeBegin()) + client._write_model_metadata = AsyncMock() + + await client.recreate_collections(1536, "openai", "text-embedding-3-small") + + # 
Rebuild must TRUNCATE (not DROP) so Alembic-managed schema, foreign keys + # and indexes survive the operation. + assert any("TRUNCATE TABLE" in call for call in sql_calls) + assert not any("DROP TABLE" in call for call in sql_calls) + client._write_model_metadata.assert_awaited_once_with("openai:text-embedding-3-small:1536") diff --git a/backend/packages/vector/tests/test_preference_race_condition.py b/backend/packages/vector/tests/test_preference_race_condition.py index db12c6a5..1e10d427 100644 --- a/backend/packages/vector/tests/test_preference_race_condition.py +++ b/backend/packages/vector/tests/test_preference_race_condition.py @@ -8,7 +8,7 @@ from redis.asyncio import Redis from sqlalchemy.ext.asyncio import AsyncSession -from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.vector_store import VectorStoreClient from glean_vector.services.preference_service import PreferenceService @@ -19,9 +19,9 @@ def mock_db_session(): @pytest.fixture -def mock_milvus_client(): - """Create a mock Milvus client.""" - client = MagicMock(spec=MilvusClient) +def mock_vector_client(): + """Create a mock vector client.""" + client = MagicMock(spec=VectorStoreClient) # Mock get_user_preferences to return initial state # This will be called multiple times during concurrent updates @@ -86,7 +86,7 @@ async def release(self): @pytest.mark.asyncio async def test_concurrent_preference_updates_with_lock( - mock_db_session, mock_milvus_client, mock_redis_client + mock_db_session, mock_vector_client, mock_redis_client ): """ Test that concurrent preference updates don't cause race conditions when using Redis locks. 
@@ -99,7 +99,7 @@ async def test_concurrent_preference_updates_with_lock( """ service = PreferenceService( db_session=mock_db_session, - milvus_client=mock_milvus_client, + vector_client=mock_vector_client, redis_client=mock_redis_client, ) @@ -122,11 +122,11 @@ async def test_concurrent_preference_updates_with_lock( await asyncio.gather(*tasks) # Verify both updates completed (called twice) - assert mock_milvus_client.upsert_user_preference.call_count == 2 + assert mock_vector_client.upsert_user_preference.call_count == 2 # Check the final call's sample_count # With proper locking, the second update should see the first update's result - calls = mock_milvus_client.upsert_user_preference.call_args_list + calls = mock_vector_client.upsert_user_preference.call_args_list # First call should have count around 6.0 (5.0 + 1.0) first_call_count = calls[0][1]["sample_count"] @@ -143,11 +143,11 @@ async def test_concurrent_preference_updates_with_lock( @pytest.mark.asyncio -async def test_preference_update_without_redis(mock_db_session, mock_milvus_client): +async def test_preference_update_without_redis(mock_db_session, mock_vector_client): """Test that preference updates work even without Redis (degraded mode).""" service = PreferenceService( db_session=mock_db_session, - milvus_client=mock_milvus_client, + vector_client=mock_vector_client, redis_client=None, # No Redis ) @@ -158,11 +158,11 @@ async def test_preference_update_without_redis(mock_db_session, mock_milvus_clie await service._update_preference_vector(user_id, embedding, 1.0) # Verify update completed - assert mock_milvus_client.upsert_user_preference.call_count == 1 + assert mock_vector_client.upsert_user_preference.call_count == 1 @pytest.mark.asyncio -async def test_lock_timeout_handling(mock_db_session, mock_milvus_client): +async def test_lock_timeout_handling(mock_db_session, mock_vector_client): """Test handling of lock acquisition timeout.""" # Create a Redis mock that always fails to acquire lock redis 
= MagicMock(spec=Redis) @@ -185,7 +185,7 @@ async def release(self): service = PreferenceService( db_session=mock_db_session, - milvus_client=mock_milvus_client, + vector_client=mock_vector_client, redis_client=redis, ) @@ -198,16 +198,16 @@ async def release(self): @pytest.mark.asyncio -async def test_lock_release_on_exception(mock_db_session, mock_milvus_client, mock_redis_client): +async def test_lock_release_on_exception(mock_db_session, mock_vector_client, mock_redis_client): """Test that locks are released even when exceptions occur.""" service = PreferenceService( db_session=mock_db_session, - milvus_client=mock_milvus_client, + vector_client=mock_vector_client, redis_client=mock_redis_client, ) # Make get_user_preferences raise an exception - mock_milvus_client.get_user_preferences = AsyncMock(side_effect=Exception("Milvus error")) + mock_vector_client.get_user_preferences = AsyncMock(side_effect=Exception("Milvus error")) user_id = "test-user-error" embedding = np.random.rand(384).tolist() @@ -217,7 +217,7 @@ async def test_lock_release_on_exception(mock_db_session, mock_milvus_client, mo await service._update_preference_vector(user_id, embedding, 1.0) # Lock should have been released - verify by checking another update succeeds - mock_milvus_client.get_user_preferences = AsyncMock( + mock_vector_client.get_user_preferences = AsyncMock( return_value={ "positive": { "embedding": np.random.rand(384).tolist(), @@ -228,17 +228,17 @@ async def test_lock_release_on_exception(mock_db_session, mock_milvus_client, mo # This should succeed (lock was released) await service._update_preference_vector(user_id, embedding, 1.0) - assert mock_milvus_client.upsert_user_preference.call_count == 1 + assert mock_vector_client.upsert_user_preference.call_count == 1 @pytest.mark.asyncio async def test_separate_locks_per_vector_type( - mock_db_session, mock_milvus_client, mock_redis_client + mock_db_session, mock_vector_client, mock_redis_client ): """Test that positive and 
negative preferences have separate locks.""" service = PreferenceService( db_session=mock_db_session, - milvus_client=mock_milvus_client, + vector_client=mock_vector_client, redis_client=mock_redis_client, ) @@ -246,7 +246,7 @@ async def test_separate_locks_per_vector_type( embedding = np.random.rand(384).tolist() # Mock to return both positive and negative preferences - mock_milvus_client.get_user_preferences = AsyncMock( + mock_vector_client.get_user_preferences = AsyncMock( return_value={ "positive": { "embedding": np.random.rand(384).tolist(), @@ -275,4 +275,4 @@ async def test_separate_locks_per_vector_type( assert duration < 1.0, f"Concurrent updates of different types took too long: {duration}s" # Both should complete - assert mock_milvus_client.upsert_user_preference.call_count == 2 + assert mock_vector_client.upsert_user_preference.call_count == 2 diff --git a/backend/packages/vector/tests/test_score_service.py b/backend/packages/vector/tests/test_score_service.py index cf6e45b1..636a31e4 100644 --- a/backend/packages/vector/tests/test_score_service.py +++ b/backend/packages/vector/tests/test_score_service.py @@ -6,7 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from glean_database.models import Entry, UserPreferenceStats -from glean_vector.clients.milvus_client import MilvusClient +from glean_vector.clients.vector_store import VectorStoreClient from glean_vector.services.score_service import ScoreService @@ -18,9 +18,9 @@ def mock_db_session(): @pytest.fixture -def mock_milvus_client(): - """Create a mock Milvus client.""" - client = MagicMock(spec=MilvusClient) +def mock_vector_client(): + """Create a mock vector client.""" + client = MagicMock(spec=VectorStoreClient) return client @@ -46,7 +46,7 @@ def mock_user_stats(): @pytest.mark.asyncio async def test_calculate_score_caches_user_stats( - mock_db_session, mock_milvus_client, mock_entry, mock_user_stats + mock_db_session, mock_vector_client, mock_entry, mock_user_stats ): """Test that 
calculate_score caches UserPreferenceStats to avoid N+1 queries.""" # Setup mocks - return a coroutine-like object that resolves correctly @@ -63,13 +63,13 @@ async def mock_execute(*args, **kwargs): mock_db_session.execute = mock_execute - mock_milvus_client.get_entry_embedding.return_value = [0.1] * 768 - mock_milvus_client.get_user_preferences.return_value = { + mock_vector_client.get_entry_embedding.return_value = [0.1] * 768 + mock_vector_client.get_user_preferences.return_value = { "positive": {"embedding": [0.2] * 768, "sample_count": 10}, "negative": {"embedding": [0.1] * 768, "sample_count": 5}, } - service = ScoreService(db_session=mock_db_session, milvus_client=mock_milvus_client) + service = ScoreService(db_session=mock_db_session, vector_client=mock_vector_client) # Call calculate_score multiple times for the same user await service.calculate_score("user-1", "entry-1", entry=mock_entry) @@ -81,7 +81,7 @@ async def mock_execute(*args, **kwargs): @pytest.mark.asyncio -async def test_get_user_stats_cache_works(mock_db_session, mock_milvus_client, mock_user_stats): +async def test_get_user_stats_cache_works(mock_db_session, mock_vector_client, mock_user_stats): """Test that _get_user_stats properly caches results.""" # Setup mock mock_result = MagicMock() @@ -95,7 +95,7 @@ async def mock_execute(*args, **kwargs): mock_db_session.execute = mock_execute - service = ScoreService(db_session=mock_db_session, milvus_client=mock_milvus_client) + service = ScoreService(db_session=mock_db_session, vector_client=mock_vector_client) # Call _get_user_stats multiple times stats1 = await service._get_user_stats("user-1") @@ -111,7 +111,7 @@ async def mock_execute(*args, **kwargs): @pytest.mark.asyncio async def test_batch_calculate_scores_populates_cache( - mock_db_session, mock_milvus_client, mock_user_stats + mock_db_session, mock_vector_client, mock_user_stats ): """Test that batch_calculate_scores populates the cache.""" # Setup mocks @@ -126,10 +126,10 @@ async def 
mock_execute(*args, **kwargs): mock_db_session.execute = mock_execute - mock_milvus_client.get_user_preferences.return_value = { + mock_vector_client.get_user_preferences.return_value = { "positive": {"embedding": [0.2] * 768, "sample_count": 10}, } - mock_milvus_client.batch_get_entry_embeddings.return_value = { + mock_vector_client.batch_get_entry_embeddings.return_value = { "entry-1": [0.1] * 768, "entry-2": [0.15] * 768, } @@ -145,7 +145,7 @@ async def mock_execute(*args, **kwargs): entry2.feed_id = "feed-2" entry2.author = "Author 2" - service = ScoreService(db_session=mock_db_session, milvus_client=mock_milvus_client) + service = ScoreService(db_session=mock_db_session, vector_client=mock_vector_client) # Call batch_calculate_scores await service.batch_calculate_scores("user-1", [entry1, entry2]) @@ -159,7 +159,7 @@ async def mock_execute(*args, **kwargs): @pytest.mark.asyncio -async def test_cache_isolates_different_users(mock_db_session, mock_milvus_client, mock_entry): +async def test_cache_isolates_different_users(mock_db_session, mock_vector_client, mock_entry): """Test that cache correctly isolates stats for different users.""" # Setup mocks for two different users stats_user1 = MagicMock(spec=UserPreferenceStats) @@ -179,7 +179,7 @@ async def mock_execute(query): mock_db_session.execute = mock_execute - service = ScoreService(db_session=mock_db_session, milvus_client=mock_milvus_client) + service = ScoreService(db_session=mock_db_session, vector_client=mock_vector_client) # Get stats for both users stats1 = await service._get_user_stats("user-1") diff --git a/backend/packages/vector/tests/test_vector_config.py b/backend/packages/vector/tests/test_vector_config.py new file mode 100644 index 00000000..888daff0 --- /dev/null +++ b/backend/packages/vector/tests/test_vector_config.py @@ -0,0 +1,18 @@ +"""Tests for vector configuration validation.""" + +import pytest +from pydantic import ValidationError + +from glean_vector.config import VectorBackendConfig 
+ + +def test_vector_backend_config_normalizes_backend() -> None: + """Should accept uppercase env-style values and normalize them.""" + config = VectorBackendConfig(backend="PGVECTOR") + assert config.backend == "pgvector" + + +def test_vector_backend_config_rejects_unknown_backend() -> None: + """Should fail fast on unsupported vector backends.""" + with pytest.raises(ValidationError, match="VECTOR_BACKEND must be either"): + VectorBackendConfig(backend="clickhouse") diff --git a/backend/packages/vector/tests/test_vector_factory.py b/backend/packages/vector/tests/test_vector_factory.py new file mode 100644 index 00000000..506bbebc --- /dev/null +++ b/backend/packages/vector/tests/test_vector_factory.py @@ -0,0 +1,27 @@ +"""Tests for vector backend client factory.""" + +import pytest + +from glean_vector.clients import MilvusClient, PgVectorClient, create_vector_store_client +from glean_vector.config import vector_backend_config + + +def test_create_vector_store_client_milvus(monkeypatch: pytest.MonkeyPatch) -> None: + """Factory returns MilvusClient when backend is milvus.""" + monkeypatch.setattr(vector_backend_config, "backend", "milvus") + client = create_vector_store_client() + assert isinstance(client, MilvusClient) + + +def test_create_vector_store_client_pgvector(monkeypatch: pytest.MonkeyPatch) -> None: + """Factory returns PgVectorClient when backend is pgvector.""" + monkeypatch.setattr(vector_backend_config, "backend", "pgvector") + client = create_vector_store_client() + assert isinstance(client, PgVectorClient) + + +def test_create_vector_store_client_unsupported(monkeypatch: pytest.MonkeyPatch) -> None: + """Factory raises on unsupported backend.""" + monkeypatch.setattr(vector_backend_config, "backend", "unknown") + with pytest.raises(ValueError, match="Unsupported vector backend"): + create_vector_store_client() diff --git a/backend/uv.lock b/backend/uv.lock index 5ed930c7..071956cc 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -700,7 
+700,7 @@ requires-dist = [ [[package]] name = "glean-backend" -version = "0.2.5" +version = "0.2.6" source = { virtual = "." } dependencies = [ { name = "httpx" }, @@ -766,6 +766,7 @@ dependencies = [ { name = "alembic" }, { name = "asyncpg" }, { name = "greenlet" }, + { name = "pgvector" }, { name = "sqlalchemy", extra = ["asyncio"] }, ] @@ -774,6 +775,7 @@ requires-dist = [ { name = "alembic", specifier = ">=1.13.0" }, { name = "asyncpg", specifier = ">=0.29.0" }, { name = "greenlet", specifier = ">=3.0.0" }, + { name = "pgvector", specifier = ">=0.3.6" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.0" }, ] @@ -808,6 +810,7 @@ dependencies = [ { name = "loguru" }, { name = "numpy" }, { name = "openai" }, + { name = "pgvector" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pymilvus" }, @@ -823,6 +826,7 @@ requires-dist = [ { name = "loguru", specifier = ">=0.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, { name = "openai", specifier = ">=1.0.0" }, + { name = "pgvector", specifier = ">=0.3.6" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pydantic-settings", specifier = ">=2.0.0" }, { name = "pymilvus", specifier = ">=2.6.3" }, @@ -1989,6 +1993,18 @@ bcrypt = [ { name = "bcrypt" }, ] +[[package]] +name = "pgvector" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/6c/6d8b4b03b958c02fa8687ec6063c49d952a189f8c91ebbe51e877dfab8f7/pgvector-0.4.2.tar.gz", hash = "sha256:322cac0c1dc5d41c9ecf782bd9991b7966685dee3a00bc873631391ed949513a", size = 31354, upload-time = "2025-12-05T01:07:17.87Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/26/6cee8a1ce8c43625ec561aff19df07f9776b7525d9002c86bceb3e0ac970/pgvector-0.4.2-py3-none-any.whl", hash = "sha256:549d45f7a18593783d5eec609ea1684a724ba8405c4cb182a0b2b08aeff04e08", size = 27441, upload-time = "2025-12-05T01:07:16.536Z" }, 
+] + [[package]] name = "pillow" version = "12.0.0" diff --git a/docker-compose.dev.pgvector.yml b/docker-compose.dev.pgvector.yml new file mode 100644 index 00000000..94d44c7c --- /dev/null +++ b/docker-compose.dev.pgvector.yml @@ -0,0 +1,48 @@ +# Development infrastructure for pgvector backend +# Usage: docker compose -f docker-compose.dev.pgvector.yml up -d + +services: + postgres: + image: pgvector/pgvector:pg16 + container_name: glean-dev-postgres + restart: unless-stopped + environment: + POSTGRES_DB: glean + POSTGRES_USER: glean + POSTGRES_PASSWORD: devpassword + ports: + - "5432:5432" + volumes: + - postgres_dev_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U glean"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - glean-dev-network + + redis: + image: redis:7-alpine + container_name: glean-dev-redis + command: redis-server --appendonly yes + restart: unless-stopped + ports: + - "6379:6379" + volumes: + - redis_dev_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - glean-dev-network + +volumes: + postgres_dev_data: + redis_dev_data: + +networks: + glean-dev-network: + driver: bridge diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index f1d98248..89d15f1d 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -8,7 +8,7 @@ services: postgres: - image: postgres:16-alpine + image: pgvector/pgvector:pg16 container_name: glean-dev-postgres restart: unless-stopped environment: diff --git a/docker-compose.lite.yml b/docker-compose.pgvector.yml similarity index 79% rename from docker-compose.lite.yml rename to docker-compose.pgvector.yml index dc11f7b2..e8e11a67 100644 --- a/docker-compose.lite.yml +++ b/docker-compose.pgvector.yml @@ -1,25 +1,10 @@ -# Glean Docker Compose Configuration (Lite Version) -# This configuration excludes Milvus services for lighter deployments -# +# Glean Docker Compose Configuration (pgvector 
backend) # Usage: -# Lite deployment: docker compose -f docker-compose.lite.yml up -d -# -# For full deployment with Milvus (Phase 3 features), use: -# docker compose up -d -# -# Environment variables can be set in .env file: -# - IMAGE_TAG: Docker image version (default: latest, e.g., v0.3.0-alpha.1) -# - WEB_PORT: Web interface port (default: 80) -# - ADMIN_PORT: Admin dashboard port (default: 3001) -# - POSTGRES_DB/USER/PASSWORD: Database credentials -# - SECRET_KEY: JWT signing key -# - CREATE_ADMIN: Create admin account on startup (default: true) -# - ADMIN_USERNAME/PASSWORD: Admin credentials (default: admin/Admin123!) -# - DEBUG: Enable debug mode (default: false) +# docker compose -f docker-compose.pgvector.yml up -d services: postgres: - image: postgres:16-alpine + image: pgvector/pgvector:pg16 container_name: glean-postgres environment: POSTGRES_DB: ${POSTGRES_DB:-glean} @@ -60,14 +45,16 @@ services: SECRET_KEY: ${SECRET_KEY:-change-me-in-production-use-a-long-random-string} CORS_ORIGINS: '["http://localhost", "http://localhost:3000", "http://localhost:3001"]' DEBUG: ${DEBUG:-false} - # Only backend runs migrations to avoid race conditions + VECTOR_BACKEND: pgvector RUN_MIGRATIONS: "true" - # Admin auto-creation (enabled by default, set to false to disable) CREATE_ADMIN: ${CREATE_ADMIN:-true} ADMIN_USERNAME: ${ADMIN_USERNAME:-admin} ADMIN_PASSWORD: ${ADMIN_PASSWORD:-Admin123!} ADMIN_ROLE: ${ADMIN_ROLE:-super_admin} - # Logging configuration + PGVECTOR_DATABASE_URL: ${PGVECTOR_DATABASE_URL:-} + PGVECTOR_ENTRIES_TABLE: ${PGVECTOR_ENTRIES_TABLE:-entry_embeddings} + PGVECTOR_PREFS_TABLE: ${PGVECTOR_PREFS_TABLE:-user_preference_vectors} + PGVECTOR_METADATA_TABLE: ${PGVECTOR_METADATA_TABLE:-vector_store_metadata} LOG_LEVEL: ${LOG_LEVEL:-INFO} LOG_FILE: ${LOG_FILE:-/app/logs/glean-api.log} LOG_ROTATION: ${LOG_ROTATION:-100 MB} @@ -99,7 +86,11 @@ services: DATABASE_URL: 
postgresql+asyncpg://${POSTGRES_USER:-glean}:${POSTGRES_PASSWORD:-glean}@postgres:5432/${POSTGRES_DB:-glean} REDIS_URL: redis://redis:6379/0 SECRET_KEY: ${SECRET_KEY:-change-me-in-production-use-a-long-random-string} - # Logging configuration + VECTOR_BACKEND: pgvector + PGVECTOR_DATABASE_URL: ${PGVECTOR_DATABASE_URL:-} + PGVECTOR_ENTRIES_TABLE: ${PGVECTOR_ENTRIES_TABLE:-entry_embeddings} + PGVECTOR_PREFS_TABLE: ${PGVECTOR_PREFS_TABLE:-user_preference_vectors} + PGVECTOR_METADATA_TABLE: ${PGVECTOR_METADATA_TABLE:-vector_store_metadata} LOG_LEVEL: ${LOG_LEVEL:-INFO} LOG_FILE: ${LOG_FILE:-/app/logs/glean-worker.log} LOG_ROTATION: ${LOG_ROTATION:-100 MB} diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 971fe7bd..55d2728f 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -6,7 +6,7 @@ services: postgres-test: - image: postgres:16-alpine + image: pgvector/pgvector:pg16 container_name: glean-test-postgres environment: POSTGRES_DB: glean_test diff --git a/docker-compose.yml b/docker-compose.yml index b46de389..d6f60fc5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,7 @@ # Glean Docker Compose Configuration # Usage: -# Full deployment (with Milvus): docker compose up -d -# Lite deployment (without Milvus): docker compose -f docker-compose.lite.yml up -d +# pgvector deployment (recommended): docker compose -f docker-compose.pgvector.yml up -d +# Milvus deployment: docker compose up -d # Local development: docker compose -f docker-compose.yml -f docker-compose.override.yml up -d # # Environment variables can be set in .env file: @@ -13,11 +13,11 @@ # - CREATE_ADMIN: Create admin account on startup (default: true) # - ADMIN_USERNAME/PASSWORD: Admin credentials (default: admin/Admin123!) 
# - DEBUG: Enable debug mode (default: false) -# - MILVUS_HOST/PORT: Milvus connection (for Phase 3 features) +# - MILVUS_HOST/PORT: Milvus connection services: postgres: - image: postgres:16-alpine + image: pgvector/pgvector:pg16 container_name: glean-postgres environment: POSTGRES_DB: ${POSTGRES_DB:-glean} @@ -58,6 +58,7 @@ services: SECRET_KEY: ${SECRET_KEY:-change-me-in-production-use-a-long-random-string} CORS_ORIGINS: '["http://localhost", "http://localhost:3000", "http://localhost:3001"]' DEBUG: ${DEBUG:-false} + VECTOR_BACKEND: ${VECTOR_BACKEND:-milvus} # Only backend runs migrations to avoid race conditions RUN_MIGRATIONS: "true" # Admin auto-creation (enabled by default, set to false to disable) @@ -68,6 +69,10 @@ services: # Milvus configuration (for vector embeddings) MILVUS_HOST: milvus MILVUS_PORT: ${MILVUS_PORT:-19530} + PGVECTOR_DATABASE_URL: ${PGVECTOR_DATABASE_URL:-} + PGVECTOR_ENTRIES_TABLE: ${PGVECTOR_ENTRIES_TABLE:-entry_embeddings} + PGVECTOR_PREFS_TABLE: ${PGVECTOR_PREFS_TABLE:-user_preference_vectors} + PGVECTOR_METADATA_TABLE: ${PGVECTOR_METADATA_TABLE:-vector_store_metadata} # Logging configuration LOG_LEVEL: ${LOG_LEVEL:-INFO} LOG_FILE: ${LOG_FILE:-/app/logs/glean-api.log} @@ -102,9 +107,14 @@ services: DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-glean}:${POSTGRES_PASSWORD:-glean}@postgres:5432/${POSTGRES_DB:-glean} REDIS_URL: redis://redis:6379/0 SECRET_KEY: ${SECRET_KEY:-change-me-in-production-use-a-long-random-string} + VECTOR_BACKEND: ${VECTOR_BACKEND:-milvus} # Milvus configuration MILVUS_HOST: milvus MILVUS_PORT: ${MILVUS_PORT:-19530} + PGVECTOR_DATABASE_URL: ${PGVECTOR_DATABASE_URL:-} + PGVECTOR_ENTRIES_TABLE: ${PGVECTOR_ENTRIES_TABLE:-entry_embeddings} + PGVECTOR_PREFS_TABLE: ${PGVECTOR_PREFS_TABLE:-user_preference_vectors} + PGVECTOR_METADATA_TABLE: ${PGVECTOR_METADATA_TABLE:-vector_store_metadata} # Logging configuration LOG_LEVEL: ${LOG_LEVEL:-INFO} LOG_FILE: ${LOG_FILE:-/app/logs/glean-worker.log} @@ -150,7 +160,7 @@ 
services: - glean-network restart: unless-stopped - # Milvus services (for Phase 3 features: smart recommendations, preference learning) + # Milvus services milvus-etcd: image: quay.io/coreos/etcd:v3.5.5 container_name: glean-milvus-etcd