feat: implement rag in serverless

LogCreative · LogCreative · commit 1fc7f0929552 · 2025-10-07T00:53:50.000+08:00
diff --git a/deploy/init_kb_serverless.py b/deploy/init_kb_serverless.py
@@ -0,0 +1,52 @@
+"""
+Initialize Cloudflare vector database.
+After initializing the vector database with init_kb.py, run this script to dump the stored embeddings to a JSONL file and load it with `wrangler vectorize insert`.
+
+https://developers.cloudflare.com/vectorize/best-practices/insert-vectors/
+Target JSONL line: {id: <node_id>, values: <embedding>, metadata: {text: <text>, file_name: <file_name>}}
+"""
+from config import config
+import psycopg2
+import json
+import os
+import subprocess
+
+# No check=True on the delete: a failure (presumably "index not found" on a first run) is tolerated.
+print("Clearing existing Cloudflare vector store...")
+subprocess.run(["npx", "wrangler", "vectorize", "delete", "ppedt-embed", "--force"], cwd="ppedt-serverless")
+print("Creating new Cloudflare vector store, press 'n' if prompted to confirm...")
+subprocess.run(["npx", "wrangler", "vectorize", "create", "ppedt-embed", "--preset", "@cf/baai/bge-small-en-v1.5"], cwd="ppedt-serverless", check=True)
+
+print("Dumping vectors from Postgres...")
+conn = psycopg2.connect(config.POSTGRES_URI, dbname="ppedt")
+conn.autocommit = True
+
+# create table public.data_embed (
+#   id bigint primary key not null default nextval('data_embed_id_seq'::regclass),
+#   text character varying not null,
+#   metadata_ json,
+#   node_id character varying,
+#   embedding vector(384)
+# );
+# create index embed_idx_1 on data_embed using btree (((metadata_ ->> 'ref_doc_id'::text)));
+# create index data_embed_embedding_idx on data_embed using hnsw (embedding);
+
+try:
+    with conn.cursor() as c:
+        c.execute("SELECT id, text, metadata_ -> 'file_name', node_id, embedding FROM data_embed;")
+        with open(os.path.join("ppedt-serverless", "data_embed.jsonl"), "w", encoding="utf-8") as f:
+            for _row_id, text, file_name, node_id, embedding in c:
+                # The embedding comes back as a string, assumed to be pgvector's "[x,y,...]" text
+                # form, which is valid JSON: parse it rather than eval()-ing database content.
+                record = {
+                    "id": str(node_id),
+                    "values": json.loads(embedding),
+                    "metadata": {"text": text, "file_name": file_name},
+                }
+                f.write(json.dumps(record) + "\n")
+finally:
+    conn.close()  # release the Postgres connection even if the dump fails
+
+# wrangler runs with cwd="ppedt-serverless", so the dump written above is addressed by bare file name.
+print("Inserting vectors to Cloudflare vector store...")
+subprocess.run(["npx", "wrangler", "vectorize", "insert", "ppedt-embed", "--file", "data_embed.jsonl"], cwd="ppedt-serverless", check=True)
diff --git a/deploy/ppedt-serverless/.gitignore b/deploy/ppedt-serverless/.gitignore
@@ -166,4 +166,6 @@ dist
 !.env.example
 .wrangler/
 
-package-lock.json
+package-lock.json
+# Dumped embedding data
+data_embed.jsonl
diff --git a/deploy/ppedt-serverless/src/index.js b/deploy/ppedt-serverless/src/index.js
@@ -23,10 +23,49 @@ export default {
         if (request.method === "POST") {
             const { code, prompt } = JSON.parse(await request.text());
 
-            // messages - chat style input
-            let messages = [
+            // If the prompt contains any non-ASCII character, ask the LLM to rewrite it in English.
+            let final_prompt = prompt;
+            if (/[^\x00-\x7F]+/.test(prompt)) {
+                const response = await env.AI.run(
+                '@cf/meta/llama-3-8b-instruct',
+                {
+                    messages: [
+                        { role: 'system', content: 'Translate the query to English and modify mathematics unicode symbols to LaTeX commands if necessary without any explanation.' },
+                        { role: 'user', content: prompt }
+                    ]
+                });
+                final_prompt = await response.response;
+            }
+
+            // Get the embedding of final_prompt
+            const embedding_response = await env.AI.run(
+                '@cf/baai/bge-small-en-v1.5',
+                {
+                    text: final_prompt
+                });
+            const query_vector = await embedding_response.data[0];
+
+            // Retrieve context from Cloudflare embedding DB
+            let matches = await env.VECTORIZE.query(query_vector, {
+                topK: 3,
+                returnMetadata: 'all'
+            });
+            matches = matches.matches;
+            // Discard matches whose score is below 0.75.
+            matches = matches.filter(m => m.score >= 0.75);
+            const context_str = matches.map(m => "File: " + m.metadata.file_name + "\n" + m.metadata.text).join('\n\n');
+            final_prompt = "Context information is below.\n" +
+                "---------------------\n" +
+                context_str + "\n" +
+                "---------------------\n" +
+                "Answer the query.\n" +
+                "Query: " + final_prompt + "\n" +
+                "Answer: ";
+
+            // Final Stream output
+            const messages = [
                 { role: 'system', content: 'You are a LaTeX code helper, especially for the code of package pgfplots. Return only the modified version of the following code without any additional text or explanation. You have to make sure the code could compile successfully and don\'t omit the code of documentclass.' },
-                { role: 'user', content: prompt + ':\n' + code }
+                { role: 'user', content: final_prompt + ':\n' + code }
             ];
             const response = await env.AI.run(
                 '@cf/meta/llama-3-8b-instruct',
diff --git a/deploy/ppedt-serverless/wrangler.jsonc b/deploy/ppedt-serverless/wrangler.jsonc
@@ -38,6 +38,13 @@
 		"directory": "./public/",
 		"binding": "ASSETS"
 	},
+	"vectorize": [
+		{
+			"binding": "VECTORIZE",
+			"index_name": "ppedt-embed",
+			"remote": true
+		}
+	],
 	"ai": {
 		"binding": "AI",
 	}