Skip to content

Commit 8badda3

Browse files
committed
add workspace configuration and implement vector search enhancements
- Introduced a new workspace configuration file for easier project management.
- Updated custom functions documentation to clarify database population steps.
- Added new SQL migration files for pgvector extension and embeddings table.
- Implemented `getDocGroupsForVectorSearch` function to streamline document group retrieval.
- Refactored `fetchContextsViaDrizzleVectorSearch` to utilize new embedding and document group fetching logic.
- Updated `updateDocGroupsInVectorStore` to use Drizzle for database updates instead of backend calls.
- Enhanced tests to cover new functionality and ensure proper integration.
1 parent a72bc25 commit 8badda3

File tree

10 files changed

+206
-98
lines changed

10 files changed

+206
-98
lines changed

src/db/custom-functions.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
// Populate the db: pgvector/embeddings (init-vector.sql) and custom functions (0001_custom_functions.sql).
2-
// Run after db:push. Requires psql and POSTGRES_* env.
1+
// Populate the db: custom functions (0001_custom_functions.sql). pgvector + embeddings table are applied via db:migrate (0006_pgvector_extension.sql, 0007_embeddings_table.sql).
2+
// Run after db:migrate if you need to re-run custom functions. Requires psql and POSTGRES_* env.
33
import { spawn } from 'child_process'
44
import path from 'path'
55
import dotenv from 'dotenv'
@@ -8,7 +8,6 @@ dotenv.config()
88

99
// Paths relative to project root (where npm run db:populate is run)
1010
const sqlFiles = [
11-
path.join(process.cwd(), 'src/db/init-vector.sql'), // pgvector extension + embeddings table
1211
path.join(process.cwd(), 'src/db/migrations/0001_custom_functions.sql'),
1312
]
1413

src/db/dbHelpers.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,31 @@ export async function fetchDocumentGroups(courseName: string) {
4343
}
4444
}
4545

46+
/** Shape expected by vectorSearchWithDrizzle for doc-group filtering. */
47+
export interface DocGroupsForVectorSearch {
48+
disabled_doc_groups: string[]
49+
public_doc_groups: { course_name: string; name: string; enabled: boolean }[]
50+
}
51+
52+
/**
53+
* Get disabled and public doc groups for a course for use in vector search.
54+
* Replaces backend getDisabledDocGroups / getPublicDocGroups.
55+
*/
56+
export async function getDocGroupsForVectorSearch(
57+
courseName: string,
58+
): Promise<DocGroupsForVectorSearch> {
59+
const rows = await fetchDocumentGroups(courseName)
60+
const disabled_doc_groups = rows
61+
.filter((r) => r.enabled === false)
62+
.map((r) => r.name ?? '')
63+
const public_doc_groups = rows.map((r) => ({
64+
course_name: courseName,
65+
name: r.name ?? '',
66+
enabled: r.enabled ?? true,
67+
}))
68+
return { disabled_doc_groups, public_doc_groups }
69+
}
70+
4671
export async function addDocumentsToDocGroup(
4772
courseName: string,
4873
doc: CourseDocument,
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-- Enable pgvector extension (Drizzle does not create extensions automatically).
2+
-- Required for the embeddings table and vector similarity search.
3+
CREATE EXTENSION IF NOT EXISTS vector;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
-- Table equivalent to Qdrant collection "illinois-chat-qwen"
2+
-- Vector size: 4096 (matching Qdrant collection). Distance: Cosine (pgvector <=> operator).
3+
CREATE TABLE IF NOT EXISTS embeddings (
4+
id BIGSERIAL PRIMARY KEY,
5+
qdrant_id UUID UNIQUE,
6+
embedding VECTOR(4096) NOT NULL,
7+
page_content TEXT,
8+
course_name TEXT,
9+
s3_path TEXT,
10+
readable_filename TEXT,
11+
url TEXT,
12+
base_url TEXT,
13+
doc_groups JSONB DEFAULT '[]'::jsonb,
14+
chunk_index INTEGER,
15+
pagenumber TEXT,
16+
"timestamp" TEXT,
17+
conversation_id TEXT,
18+
metadata JSONB DEFAULT '{}'::jsonb,
19+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
20+
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
21+
);
22+
23+
CREATE INDEX IF NOT EXISTS embeddings_course_name_idx ON embeddings (course_name);
24+
CREATE INDEX IF NOT EXISTS embeddings_s3_path_idx ON embeddings (s3_path);
25+
CREATE INDEX IF NOT EXISTS embeddings_conversation_id_idx ON embeddings (conversation_id) WHERE conversation_id IS NOT NULL AND conversation_id != '';
26+
CREATE INDEX IF NOT EXISTS embeddings_doc_groups_idx ON embeddings USING gin (doc_groups);
27+
CREATE INDEX IF NOT EXISTS embeddings_metadata_idx ON embeddings USING gin (metadata);
28+
29+
CREATE OR REPLACE FUNCTION update_updated_at_column()
30+
RETURNS TRIGGER AS $$
31+
BEGIN
32+
NEW.updated_at = CURRENT_TIMESTAMP;
33+
RETURN NEW;
34+
END;
35+
$$ language 'plpgsql';
36+
37+
DROP TRIGGER IF EXISTS update_embeddings_updated_at ON embeddings;
38+
CREATE TRIGGER update_embeddings_updated_at BEFORE UPDATE ON embeddings
39+
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

src/db/migrations/meta/_journal.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,20 @@
4343
"when": 1769713616944,
4444
"tag": "0005_calm_valeria_richards",
4545
"breakpoints": true
46+
},
47+
{
48+
"idx": 6,
49+
"version": "7",
50+
"when": 1769713700000,
51+
"tag": "0006_pgvector_extension",
52+
"breakpoints": true
53+
},
54+
{
55+
"idx": 7,
56+
"version": "7",
57+
"when": 1769713700001,
58+
"tag": "0007_embeddings_table",
59+
"breakpoints": true
4660
}
4761
]
4862
}

src/pages/util/fetchContexts.ts

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,28 @@
11
import { type ContextWithMetadata } from '~/types/chat'
2-
import { getBackendUrl } from '~/utils/apiUtils'
32
import { vectorSearchWithDrizzle } from '~/db/vectorSearch'
3+
import { embedQuery } from '~/utils/embedQuery'
4+
import { getDocGroupsForVectorSearch } from '~/db/dbHelpers'
45

5-
/** Fetch embedding + disabled/public doc groups from backend, then run vector search on frontend DB (Drizzle/pgvector). */
6+
/** Fetch query embedding (frontend) + doc groups from frontend DB, then run vector search (Drizzle/pgvector). */
67
export async function fetchContextsViaDrizzleVectorSearch(
78
course_name: string,
89
search_query: string,
910
doc_groups: string[] = [],
1011
conversation_id?: string,
1112
top_n = 100,
1213
): Promise<ContextWithMetadata[]> {
13-
const backendUrl = getBackendUrl()
14-
const embedRes = await fetch(`${backendUrl}/embedAndMetadata`, {
15-
method: 'POST',
16-
headers: { 'Content-Type': 'application/json' },
17-
body: JSON.stringify({ search_query, course_name }),
18-
})
19-
if (!embedRes.ok) {
20-
throw new Error(
21-
`Failed to get embedding/metadata. Status: ${embedRes.status}`,
22-
)
23-
}
24-
const { embedding, disabled_doc_groups, public_doc_groups } =
25-
await embedRes.json()
14+
const [embedding, { disabled_doc_groups, public_doc_groups }] =
15+
await Promise.all([
16+
embedQuery(search_query),
17+
getDocGroupsForVectorSearch(course_name),
18+
])
2619

2720
return vectorSearchWithDrizzle({
2821
queryEmbedding: embedding,
2922
course_name,
3023
doc_groups,
31-
disabled_doc_groups: disabled_doc_groups ?? [],
32-
public_doc_groups: public_doc_groups ?? [],
24+
disabled_doc_groups,
25+
public_doc_groups,
3326
conversation_id,
3427
top_n,
3528
})

src/utils/__tests__/vectorUtils.test.ts

Lines changed: 17 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
import { describe, expect, it, vi } from 'vitest'
22

3+
const mockWhere = vi.fn().mockResolvedValue(undefined)
4+
const mockSet = vi.fn().mockReturnValue({ where: mockWhere })
5+
6+
vi.mock('~/db/dbClient', () => ({
7+
db: {
8+
update: () => ({ set: mockSet }),
9+
},
10+
}))
11+
312
describe('vectorUtils', () => {
4-
it('calls backend update-doc-groups and returns response with status completed', async () => {
5-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
6-
const fetchMock = vi.fn().mockResolvedValue({
7-
ok: true,
8-
json: async () => ({ status: 'completed' }),
9-
})
10-
vi.stubGlobal('fetch', fetchMock)
11-
12-
vi.resetModules()
13+
it('updates embeddings via Drizzle and returns status completed', async () => {
1314
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
1415

1516
const doc = {
@@ -21,30 +22,14 @@ describe('vectorUtils', () => {
2122

2223
const result = await updateDocGroupsInVectorStore('CS101', doc)
2324
expect(result).toEqual({ status: 'completed' })
24-
expect(fetchMock).toHaveBeenCalledWith(
25-
'https://backend.example/update-doc-groups',
25+
expect(mockSet).toHaveBeenCalledWith(
2626
expect.objectContaining({
27-
method: 'POST',
28-
headers: { 'Content-Type': 'application/json' },
29-
body: JSON.stringify({
30-
courseName: 'CS101',
31-
s3_path: 's3://bucket/key',
32-
url: 'https://example.com/doc',
33-
doc_groups: ['g1', 'g2'],
34-
}),
27+
doc_groups: ['g1', 'g2'],
3528
}),
3629
)
3730
})
3831

3932
it('sends empty url and s3_path when missing', async () => {
40-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
41-
const fetchMock = vi.fn().mockResolvedValue({
42-
ok: true,
43-
json: async () => ({ status: 'completed' }),
44-
})
45-
vi.stubGlobal('fetch', fetchMock)
46-
47-
vi.resetModules()
4833
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
4934

5035
const doc = {
@@ -54,29 +39,14 @@ describe('vectorUtils', () => {
5439
readable_filename: 'doc.pdf',
5540
} as any
5641

57-
await updateDocGroupsInVectorStore('CS101', doc)
58-
expect(fetchMock).toHaveBeenCalledWith(
59-
expect.any(String),
60-
expect.objectContaining({
61-
body: JSON.stringify({
62-
courseName: 'CS101',
63-
s3_path: '',
64-
url: '',
65-
doc_groups: ['g1'],
66-
}),
67-
}),
68-
)
42+
const result = await updateDocGroupsInVectorStore('CS101', doc)
43+
expect(result).toEqual({ status: 'completed' })
6944
})
7045

7146
it('captures posthog event and rethrows on error', async () => {
72-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
7347
vi.spyOn(console, 'error').mockImplementation(() => {})
74-
const fetchMock = vi
75-
.fn()
76-
.mockResolvedValue({ ok: false, text: async () => 'boom' })
77-
vi.stubGlobal('fetch', fetchMock)
48+
mockWhere.mockRejectedValueOnce(new Error('boom'))
7849

79-
vi.resetModules()
8050
const posthog = (await import('posthog-js')).default as any
8151
vi.spyOn(posthog, 'capture')
8252
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
@@ -102,14 +72,9 @@ describe('vectorUtils', () => {
10272
})
10373

10474
it('sets doc_unique_identifier to null when url and s3_path are empty', async () => {
105-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
10675
vi.spyOn(console, 'error').mockImplementation(() => {})
107-
const fetchMock = vi
108-
.fn()
109-
.mockResolvedValue({ ok: false, text: async () => 'err' })
110-
vi.stubGlobal('fetch', fetchMock)
76+
mockWhere.mockRejectedValueOnce(new Error('err'))
11177

112-
vi.resetModules()
11378
const posthog = (await import('posthog-js')).default as any
11479
vi.spyOn(posthog, 'capture')
11580
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')
@@ -129,14 +94,9 @@ describe('vectorUtils', () => {
12994
})
13095

13196
it('sets doc_unique_identifier to url when url is present', async () => {
132-
vi.stubEnv('RAILWAY_URL', 'https://backend.example')
13397
vi.spyOn(console, 'error').mockImplementation(() => {})
134-
const fetchMock = vi
135-
.fn()
136-
.mockResolvedValue({ ok: false, text: async () => 'err' })
137-
vi.stubGlobal('fetch', fetchMock)
98+
mockWhere.mockRejectedValueOnce(new Error('err'))
13899

139-
vi.resetModules()
140100
const posthog = (await import('posthog-js')).default as any
141101
vi.spyOn(posthog, 'capture')
142102
const { updateDocGroupsInVectorStore } = await import('../vectorUtils')

src/utils/embedQuery.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/**
2+
* Server-side query embedding for vector search.
3+
* Replaces backend /embedAndMetadata for the embedding part.
4+
* Uses same env and behavior as backend: EMBEDDING_MODEL, OPENAI_API_KEY/NCSA_HOSTED_API_KEY,
5+
* EMBEDDING_API_BASE, optional QWEN_QUERY_INSTRUCTION for Qwen models.
6+
*/
7+
8+
import OpenAI from 'openai'
9+
10+
const DEFAULT_QWEN_QUERY_INSTRUCTION =
11+
'Given a user search query, retrieve the most relevant passages from the Illinois Chat knowledge base stored in the vector store to answer the query accurately. Prioritize authoritative course materials, syllabi, FAQs, official documentation, web pages, and other relevant sources. Ignore boilerplate/navigation text.'
12+
13+
function getOpenAIClient(): OpenAI {
14+
const apiKey =
15+
process.env.OPENAI_API_KEY || process.env.NCSA_HOSTED_API_KEY || ''
16+
const baseURL = process.env.EMBEDDING_API_BASE || 'https://api.openai.com/v1'
17+
return new OpenAI({ apiKey, baseURL })
18+
}
19+
20+
/**
21+
* Generate embedding for a search query. Uses EMBEDDING_MODEL (default text-embedding-ada-002).
22+
* For Qwen models, prefixes the query with QWEN_QUERY_INSTRUCTION when set.
23+
*/
24+
export async function embedQuery(searchQuery: string): Promise<number[]> {
25+
const model = process.env.EMBEDDING_MODEL || 'text-embedding-ada-002'
26+
const qwenInstruction =
27+
process.env.QWEN_QUERY_INSTRUCTION || DEFAULT_QWEN_QUERY_INSTRUCTION
28+
29+
let input = searchQuery.replace(/\n/g, ' ').trim()
30+
if (
31+
qwenInstruction &&
32+
typeof model === 'string' &&
33+
model.toLowerCase().includes('qwen')
34+
) {
35+
input = `Instruct: ${qwenInstruction}\nQuery:${searchQuery}`
36+
}
37+
38+
const openai = getOpenAIClient()
39+
const {
40+
data: [result],
41+
} = await openai.embeddings.create({
42+
model,
43+
input,
44+
})
45+
46+
if (!result?.embedding) {
47+
throw new Error('No embedding returned from embedding API')
48+
}
49+
return result.embedding
50+
}

0 commit comments

Comments
 (0)