Skip to content

Commit 8badda3

Browse files
committed
add workspace configuration and implement vector search enhancements
- Introduced a new workspace configuration file for easier project management.
- Updated custom functions documentation to clarify database population steps.
- Added new SQL migration files for pgvector extension and embeddings table.
- Implemented `getDocGroupsForVectorSearch` function to streamline document group retrieval.
- Refactored `fetchContextsViaDrizzleVectorSearch` to utilize new embedding and document group fetching logic.
- Updated `updateDocGroupsInVectorStore` to use Drizzle for database updates instead of backend calls.
- Enhanced tests to cover new functionality and ensure proper integration.
1 parent a72bc25 commit 8badda3

File tree

10 files changed

+206
-98
lines changed

10 files changed

+206
-98
lines changed

src/db/custom-functions.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
// Populate the db: pgvector/embeddings (init-vector.sql) and custom functions (0001_custom_functions.sql).
2-
// Run after db:push. Requires psql and POSTGRES_* env.
1+
// Populate the db: custom functions (0001_custom_functions.sql). pgvector + embeddings table are applied via db:migrate (0006_pgvector_extension.sql, 0007_embeddings_table.sql).
2+
// Run after db:migrate if you need to re-run custom functions. Requires psql and POSTGRES_* env.
33
import { spawn } from 'child_process'
44
import path from 'path'
55
import dotenv from 'dotenv'
@@ -8,7 +8,6 @@ dotenv.config()
88

99
// Paths relative to project root (where npm run db:populate is run)
1010
const sqlFiles = [
11-
path.join(process.cwd(), 'src/db/init-vector.sql'), // pgvector extension + embeddings table
1211
path.join(process.cwd(), 'src/db/migrations/0001_custom_functions.sql'),
1312
]
1413

src/db/dbHelpers.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,31 @@ export async function fetchDocumentGroups(courseName: string) {
4343
}
4444
}
4545

46+
/** Shape expected by vectorSearchWithDrizzle for doc-group filtering. */
47+
export interface DocGroupsForVectorSearch {
48+
disabled_doc_groups: string[]
49+
public_doc_groups: { course_name: string; name: string; enabled: boolean }[]
50+
}
51+
52+
/**
53+
* Get disabled and public doc groups for a course for use in vector search.
54+
* Replaces backend getDisabledDocGroups / getPublicDocGroups.
55+
*/
56+
export async function getDocGroupsForVectorSearch(
57+
courseName: string,
58+
): Promise<DocGroupsForVectorSearch> {
59+
const rows = await fetchDocumentGroups(courseName)
60+
const disabled_doc_groups = rows
61+
.filter((r) => r.enabled === false)
62+
.map((r) => r.name ?? '')
63+
const public_doc_groups = rows.map((r) => ({
64+
course_name: courseName,
65+
name: r.name ?? '',
66+
enabled: r.enabled ?? true,
67+
}))
68+
return { disabled_doc_groups, public_doc_groups }
69+
}
70+
4671
export async function addDocumentsToDocGroup(
4772
courseName: string,
4873
doc: CourseDocument,
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-- Enable pgvector extension (Drizzle does not create extensions automatically).
2+
-- Required for the embeddings table and vector similarity search.
3+
CREATE EXTENSION IF NOT EXISTS vector;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
-- Table equivalent to Qdrant collection "illinois-chat-qwen"
2+
-- Vector size: 4096 (matching Qdrant collection). Distance: Cosine (pgvector <=> operator).
3+
CREATE TABLE IF NOT EXISTS embeddings (
4+
id BIGSERIAL PRIMARY KEY,
5+
qdrant_id UUID UNIQUE,
6+
embedding VECTOR(4096) NOT NULL,
7+
page_content TEXT,
8+
course_name TEXT,
9+
s3_path TEXT,
10+
readable_filename TEXT,
11+
url TEXT,
12+
base_url TEXT,
13+
doc_groups JSONB DEFAULT '[]'::jsonb,
14+
chunk_index INTEGER,
15+
pagenumber TEXT,
16+
"timestamp" TEXT,
17+
conversation_id TEXT,
18+
metadata JSONB DEFAULT '{}'::jsonb,
19+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
20+
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
21+
);
22+
23+
CREATE INDEX IF NOT EXISTS embeddings_course_name_idx ON embeddings (course_name);
24+
CREATE INDEX IF NOT EXISTS embeddings_s3_path_idx ON embeddings (s3_path);
25+
CREATE INDEX IF NOT EXISTS embeddings_conversation_id_idx ON embeddings (conversation_id) WHERE conversation_id IS NOT NULL AND conversation_id != '';
26+
CREATE INDEX IF NOT EXISTS embeddings_doc_groups_idx ON embeddings USING gin (doc_groups);
27+
CREATE INDEX IF NOT EXISTS embeddings_metadata_idx ON embeddings USING gin (metadata);
28+
29+
CREATE OR REPLACE FUNCTION update_updated_at_column()
30+
RETURNS TRIGGER AS $$
31+
BEGIN
32+
NEW.updated_at = CURRENT_TIMESTAMP;
33+
RETURN NEW;
34+
END;
35+
$$ language 'plpgsql';
36+
37+
DROP TRIGGER IF EXISTS update_embeddings_updated_at ON embeddings;
38+
CREATE TRIGGER update_embeddings_updated_at BEFORE UPDATE ON embeddings
39+
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

src/db/migrations/meta/_journal.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,20 @@
4343
"when": 1769713616944,
4444
"tag": "0005_calm_valeria_richards",
4545
"breakpoints": true
46+
},
47+
{
48+
"idx": 6,
49+
"version": "7",
50+
"when": 1769713700000,
51+
"tag": "0006_pgvector_extension",
52+
"breakpoints": true
53+
},
54+
{
55+
"idx": 7,
56+
"version": "7",
57+
"when": 1769713700001,
58+
"tag": "0007_embeddings_table",
59+
"breakpoints": true
4660
}
4761
]
4862
}

src/pages/util/fetchContexts.ts

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,28 @@
11
import { type ContextWithMetadata } from '~/types/chat'
2-
import { getBackendUrl } from '~/utils/apiUtils'
32
import { vectorSearchWithDrizzle } from '~/db/vectorSearch'
3+
import { embedQuery } from '~/utils/embedQuery'
4+
import { getDocGroupsForVectorSearch } from '~/db/dbHelpers'
45

5-
/** Fetch embedding + disabled/public doc groups from backend, then run vector search on frontend DB (Drizzle/pgvector). */
6+
/** Fetch query embedding (frontend) + doc groups from frontend DB, then run vector search (Drizzle/pgvector). */
67
export async function fetchContextsViaDrizzleVectorSearch(
78
course_name: string,
89
search_query: string,
910
doc_groups: string[] = [],
1011
conversation_id?: string,
1112
top_n = 100,
1213
): Promise<ContextWithMetadata[]> {
13-
const backendUrl = getBackendUrl()
14-
const embedRes = await fetch(`${backendUrl}/embedAndMetadata`, {
15-
method: 'POST',
16-
headers: { 'Content-Type': 'application/json' },
17-
body: JSON.stringify({ search_query, course_name }),
18-
})
19-
if (!embedRes.ok) {
20-
throw new Error(
21-
`Failed to get embedding/metadata. Status: ${embedRes.status}`,
22-
)
23-
}
24-
const { embedding, disabled_doc_groups, public_doc_groups } =
25-
await embedRes.json()
14+
const [embedding, { disabled_doc_groups, public_doc_groups }] =
15+
await Promise.all([
16+
embedQuery(search_query),
17+
getDocGroupsForVectorSearch(course_name),
18+
])
2619

2720
return vectorSearchWithDrizzle({
2821
queryEmbedding: embedding,
2922
course_name,
3023
doc_groups,
31-
disabled_doc_groups: disabled_doc_groups ?? [],
32-
public_doc_groups: public_doc_groups ?? [],
24+
disabled_doc_groups,
25+
public_doc_groups,
3326
conversation_id,
3427
top_n,
3528
})

src/utils/__tests__/vectorUtils.test.ts

Lines changed: 17 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
import { describe, expect, it, vi } from 'vitest'
22

3+
const mockWhere = vi.fn().mockResolvedValue(undefined)
4+
const mockSet = vi.fn().mockReturnValue({ where: mockWhere })
5+
6+
vi.mock('~/db/dbClient', () => ({
7+
db: {
8+
update: () => ({ set: mockSet }),
9+
},
10+
}))
11+
312
describe('vectorUtils', () => {
4-
it('calls backend update-doc-groups and returns response with status completed', async () => {
5-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
6-
const fetchMock = vi.fn().mockResolvedValue({
7-
ok: true,
8-
json: async () => ({ status: 'completed' }),
9-
})
10-
vi.stubGlobal('fetch', fetchMock)
11-
12-
vi.resetModules()
13+
it('updates embeddings via Drizzle and returns status completed', async () => {
1314
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
1415

1516
const doc = {
@@ -21,30 +22,14 @@ describe('vectorUtils', () => {
2122

2223
const result = await updateDocGroupsInVectorStore('CS101', doc)
2324
expect(result).toEqual({ status: 'completed' })
24-
expect(fetchMock).toHaveBeenCalledWith(
25-
'https://backend.example/update-doc-groups',
25+
expect(mockSet).toHaveBeenCalledWith(
2626
expect.objectContaining({
27-
method: 'POST',
28-
headers: { 'Content-Type': 'application/json' },
29-
body: JSON.stringify({
30-
courseName: 'CS101',
31-
s3_path: 's3://bucket/key',
32-
url: 'https://example.com/doc',
33-
doc_groups: ['g1', 'g2'],
34-
}),
27+
doc_groups: ['g1', 'g2'],
3528
}),
3629
)
3730
})
3831

3932
it('sends empty url and s3_path when missing', async () => {
40-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
41-
const fetchMock = vi.fn().mockResolvedValue({
42-
ok: true,
43-
json: async () => ({ status: 'completed' }),
44-
})
45-
vi.stubGlobal('fetch', fetchMock)
46-
47-
vi.resetModules()
4833
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
4934

5035
const doc = {
@@ -54,29 +39,14 @@ describe('vectorUtils', () => {
5439
readable_filename: 'doc.pdf',
5540
} as any
5641

57-
await updateDocGroupsInVectorStore('CS101', doc)
58-
expect(fetchMock).toHaveBeenCalledWith(
59-
expect.any(String),
60-
expect.objectContaining({
61-
body: JSON.stringify({
62-
courseName: 'CS101',
63-
s3_path: '',
64-
url: '',
65-
doc_groups: ['g1'],
66-
}),
67-
}),
68-
)
42+
const result = await updateDocGroupsInVectorStore('CS101', doc)
43+
expect(result).toEqual({ status: 'completed' })
6944
})
7045

7146
it('captures posthog event and rethrows on error', async () => {
72-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
7347
vi.spyOn(console, 'error').mockImplementation(() => {})
74-
const fetchMock = vi
75-
.fn()
76-
.mockResolvedValue({ ok: false, text: async () => 'boom' })
77-
vi.stubGlobal('fetch', fetchMock)
48+
mockWhere.mockRejectedValueOnce(new Error('boom'))
7849

79-
vi.resetModules()
8050
const posthog = (await import('posthog-js')).default as any
8151
vi.spyOn(posthog, 'capture')
8252
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
@@ -102,14 +72,9 @@ describe('vectorUtils', () => {
10272
})
10373

10474
it('sets doc_unique_identifier to null when url and s3_path are empty', async () => {
105-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
10675
vi.spyOn(console, 'error').mockImplementation(() => {})
107-
const fetchMock = vi
108-
.fn()
109-
.mockResolvedValue({ ok: false, text: async () => 'err' })
110-
vi.stubGlobal('fetch', fetchMock)
76+
mockWhere.mockRejectedValueOnce(new Error('err'))
11177

112-
vi.resetModules()
11378
const posthog = (await import('posthog-js')).default as any
11479
vi.spyOn(posthog, 'capture')
11580
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
@@ -129,14 +94,9 @@ describe('vectorUtils', () => {
12994
})
13095

13196
it('sets doc_unique_identifier to url when url is present', async () => {
132-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
13397
vi.spyOn(console, 'error').mockImplementation(() => {})
134-
const fetchMock = vi
135-
.fn()
136-
.mockResolvedValue({ ok: false, text: async () => 'err' })
137-
vi.stubGlobal('fetch', fetchMock)
98+
mockWhere.mockRejectedValueOnce(new Error('err'))
13899

139-
vi.resetModules()
140100
const posthog = (await import('posthog-js')).default as any
141101
vi.spyOn(posthog, 'capture')
142102
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')

src/utils/embedQuery.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/**
2+
* Server-side query embedding for vector search.
3+
* Replaces backend /embedAndMetadata for the embedding part.
4+
* Uses same env and behavior as backend: EMBEDDING_MODEL, OPENAI_API_KEY/NCSA_HOSTED_API_KEY,
5+
* EMBEDDING_API_BASE, optional QWEN_QUERY_INSTRUCTION for Qwen models.
6+
*/
7+
8+
import OpenAI from 'openai'
9+
10+
const DEFAULT_QWEN_QUERY_INSTRUCTION =
11+
'Given a user search query, retrieve the most relevant passages from the Illinois Chat knowledge base stored in the vector store to answer the query accurately. Prioritize authoritative course materials, syllabi, FAQs, official documentation, web pages, and other relevant sources. Ignore boilerplate/navigation text.'
12+
13+
function getOpenAIClient(): OpenAI {
14+
const apiKey =
15+
process.env.OPENAI_API_KEY || process.env.NCSA_HOSTED_API_KEY || ''
16+
const baseURL = process.env.EMBEDDING_API_BASE || 'https://api.openai.com/v1'
17+
return new OpenAI({ apiKey, baseURL })
18+
}
19+
20+
/**
21+
* Generate embedding for a search query. Uses EMBEDDING_MODEL (default text-embedding-ada-002).
22+
* For Qwen models, prefixes the query with QWEN_QUERY_INSTRUCTION when set.
23+
*/
24+
export async function embedQuery(searchQuery: string): Promise<number[]> {
25+
const model = process.env.EMBEDDING_MODEL || 'text-embedding-ada-002'
26+
const qwenInstruction =
27+
process.env.QWEN_QUERY_INSTRUCTION || DEFAULT_QWEN_QUERY_INSTRUCTION
28+
29+
let input = searchQuery.replace(/\n/g, ' ').trim()
30+
if (
31+
qwenInstruction &&
32+
typeof model === 'string' &&
33+
model.toLowerCase().includes('qwen')
34+
) {
35+
input = `Instruct: ${qwenInstruction}\nQuery:${searchQuery}`
36+
}
37+
38+
const openai = getOpenAIClient()
39+
const {
40+
data: [result],
41+
} = await openai.embeddings.create({
42+
model,
43+
input,
44+
})
45+
46+
if (!result?.embedding) {
47+
throw new Error('No embedding returned from embedding API')
48+
}
49+
return result.embedding
50+
}

0 commit comments

Comments
 (0)