|
| 1 | +""" |
| 2 | +Initialize Cloudflare vector database. |
| 3 | +After initializing the vector database in init_kb.py, run this script to dump the vectorized result to a JSONL file for `wrangler vectorize insert`. |
| 4 | +
|
| 5 | +https://developers.cloudflare.com/vectorize/best-practices/insert-vectors/ |
| 6 | +Target jsonl line: {id: <node_id>, values: <embedding>, metadata: {text: <text>, ...metadata_}} |
| 7 | +""" |
| 8 | +from config import config |
| 9 | +import psycopg2 |
| 10 | +import json |
| 11 | +import os |
| 12 | +import subprocess |
| 13 | + |
# Values shared by the wrangler commands and the dump step below.
INDEX_NAME = "ppedt-embed"
WORKER_DIR = "ppedt-serverless"  # wrangler is run from here; the dump file is written here too
JSONL_NAME = "data_embed.jsonl"

# Step 1: recreate the Cloudflare Vectorize index from scratch.
print("Clearing existing Cloudflare vector store...")
# No check=True on the delete (as in the original script): a non-zero exit
# here does not abort the run -- presumably so that a first run, when the
# index does not exist yet, still proceeds to the create step.
subprocess.run(
    ["npx", "wrangler", "vectorize", "delete", INDEX_NAME, "--force"],
    cwd=WORKER_DIR,
)
print("Creating new Cloudflare vector store, press 'n' if prompted to confirm...")
subprocess.run(
    ["npx", "wrangler", "vectorize", "create", INDEX_NAME, "--preset", "@cf/baai/bge-small-en-v1.5"],
    cwd=WORKER_DIR,
    check=True,
)


# Step 2: dump every row of data_embed to a JSONL file, one vector per line.
print("Dumping vectors from Postgres...")
conn = psycopg2.connect(config.POSTGRES_URI, dbname="ppedt")

# create table public.data_embed (
#   id bigint primary key not null default nextval('data_embed_id_seq'::regclass),
#   text character varying not null,
#   metadata_ json,
#   node_id character varying,
#   embedding vector(384)
# );
# create index embed_idx_1 on data_embed using btree (((metadata_ ->> 'ref_doc_id'::text)));
# create index data_embed_embedding_idx on data_embed using hnsw (embedding);

try:
    conn.autocommit = True
    with conn.cursor() as cur:
        cur.execute("SELECT id, text, metadata_ -> 'file_name', node_id, embedding FROM data_embed;")
        # The file is opened only after the query succeeded, so a failed query
        # does not truncate a previous dump.
        with open(os.path.join(WORKER_DIR, JSONL_NAME), "w", encoding="utf-8") as out:
            # Iterate the cursor directly instead of fetchall() to avoid
            # building a second full list of rows.
            for _row_id, text, file_name, node_id, embedding in cur:
                # The embedding is expected to arrive as pgvector's text form
                # "[v1,v2,...]", which is valid JSON. Parse it with json.loads
                # rather than eval(): eval would execute whatever expression
                # happens to be stored in the column.
                values = json.loads(embedding)
                record = {
                    "id": str(node_id),
                    "values": values,
                    "metadata": {
                        "text": text,
                        "file_name": file_name,
                    },
                }
                out.write(json.dumps(record) + "\n")
finally:
    # Release the database connection whether or not the dump succeeded.
    conn.close()


# Step 3: upload the dump; the path is relative to cwd=WORKER_DIR.
print("Inserting vectors to Cloudflare vector store...")
subprocess.run(
    ["npx", "wrangler", "vectorize", "insert", INDEX_NAME, "--file", JSONL_NAME],
    cwd=WORKER_DIR,
    check=True,
)
0 commit comments