-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupload-pdfs.ts
More file actions
156 lines (128 loc) · 4.84 KB
/
Copy pathupload-pdfs.ts
File metadata and controls
156 lines (128 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import { drizzle } from "drizzle-orm/postgres-js";
import postgres from "postgres";
import { google } from "@ai-sdk/google";
import { embedMany } from "ai";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { chunk } from "./schema";
import dotenv from "dotenv";
import * as fs from "fs";
import * as path from "path";
import pdf from "pdf-parse";
// Load environment variables from the ".env" file before any of them are read below.
dotenv.config({ path: ".env" });
/**
 * Reads the PDF at `pdfPath` from disk and returns the text that `pdf-parse`
 * extracts from it.
 *
 * @param pdfPath - Filesystem path of the PDF to read.
 * @returns The extracted text of the document.
 * @throws Re-throws any read or parse error after logging it with the path.
 */
async function extractTextFromPDF(pdfPath: string): Promise<string> {
  try {
    const fileBytes = fs.readFileSync(pdfPath);
    const { text } = await pdf(fileBytes);
    return text;
  } catch (error) {
    // Log with the offending path for context, then let the caller decide what to do.
    console.error(`Error reading PDF ${pdfPath}:`, error);
    throw error;
  }
}
/**
 * Ingests a single PDF: extracts its text, splits it into overlapping chunks,
 * embeds every chunk and inserts the results into the `chunk` table through the
 * module-level `db` handle.
 *
 * Failures are logged and swallowed, so one bad file does not abort the caller.
 *
 * @param filename - PDF name without extension; used to build `kca/<filename>/<index>` row ids.
 * @param pdfPath - Filesystem path of the PDF to read.
 * @returns The number of rows inserted, or 0 when the PDF has no text or processing failed.
 */
async function uploadPDFDocument(filename: string, pdfPath: string) {
  console.log(`π Processing PDF: ${filename}`);

  try {
    console.log(` π Extracting text from PDF...`);
    const rawText = await extractTextFromPDF(pdfPath);
    const trimmedText = rawText.trim();

    // Nothing to index (e.g. the PDF yielded only whitespace).
    if (trimmedText.length === 0) {
      console.log(` β οΈ No text found in ${filename}`);
      return 0;
    }

    // 1000-character chunks with a 200-character overlap between neighbours.
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });
    const pieces = await splitter.createDocuments([trimmedText]);
    console.log(` π Created ${pieces.length} chunks`);

    console.log(` π§ Generating embeddings...`);
    const passages = pieces.map((piece) => piece.pageContent);
    const { embeddings } = await embedMany({
      model: google.textEmbedding("text-embedding-004"),
      values: passages,
    });
    // The continuation line of this template literal stays at column 0 so the logged text is unchanged.
    console.log(` β
Generated ${embeddings.length} embeddings`);

    // One row per chunk; embeddings are matched to chunks by position.
    const sourcePath = `kca/${filename}`;
    const rows = passages.map((text, index) => ({
      id: `${sourcePath}/${index}`,
      filePath: sourcePath,
      content: text,
      embedding: embeddings[index],
    }));

    await db.insert(chunk).values(rows);
    console.log(` πΎ Inserted ${rows.length} chunks into database`);
    return rows.length;
  } catch (error) {
    console.error(` β Error processing ${filename}:`, error);
    return 0;
  }
}
/**
 * Entry point of the upload script.
 *
 * Validates the required environment variables, makes sure the `pdfs` directory
 * (under the current working directory) exists, runs `uploadPDFDocument` on every
 * `*.pdf` file found there, and prints a summary.
 *
 * Exits the process with code 1 when `POSTGRES_URL` or
 * `GOOGLE_GENERATIVE_AI_API_KEY` is missing (or the latter is still the placeholder).
 *
 * The shared module-level `connection` is always closed before this function
 * settles, including on the early-return and error paths.
 */
async function uploadKCAPDFs(): Promise<void> {
  console.log("π Starting KCA University PDF upload...");

  // Check environment variables
  if (!process.env.POSTGRES_URL) {
    console.error("β POSTGRES_URL not found in environment variables");
    process.exit(1);
  }
  if (
    !process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
    process.env.GOOGLE_GENERATIVE_AI_API_KEY === 'placeholder-google-api-key'
  ) {
    console.error("β GOOGLE_GENERATIVE_AI_API_KEY not configured");
    process.exit(1);
  }

  // NOTE(review): several log prefixes below look like mis-decoded UTF-8 emoji.
  // They are left untouched because the original glyphs cannot all be recovered.
  try {
    // No second client is created here: uploadPDFDocument inserts through the
    // module-level `db`, so that is the only connection that needs managing.
    console.log("✅ Connected to Supabase database");

    // Create pdfs directory if it doesn't exist
    const pdfsDir = path.join(process.cwd(), 'pdfs');
    if (!fs.existsSync(pdfsDir)) {
      fs.mkdirSync(pdfsDir);
      console.log("π Created 'pdfs' directory");
      console.log("π Please add your KCA University PDF files to the 'pdfs' folder and run this script again.");
      return;
    }

    // Get all PDF files from the pdfs directory (extension match is case-insensitive)
    const pdfFiles = fs
      .readdirSync(pdfsDir)
      .filter((file) => file.toLowerCase().endsWith('.pdf'));
    if (pdfFiles.length === 0) {
      console.log("π No PDF files found in 'pdfs' directory.");
      console.log("π Please add your KCA University PDF files to the 'pdfs' folder:");
      console.log(" β’ Admission requirements");
      console.log(" β’ Academic calendar");
      console.log(" β’ Student handbook");
      console.log(" β’ Fees structure");
      console.log(" β’ Course catalog");
      return;
    }

    // Process the files one at a time; a file that fails contributes 0 chunks.
    let totalChunks = 0;
    for (const pdfFile of pdfFiles) {
      const pdfPath = path.join(pdfsDir, pdfFile);
      const filename = path.parse(pdfFile).name; // Remove .pdf extension
      totalChunks += await uploadPDFDocument(filename, pdfPath);
    }

    console.log("\nπ KCA University PDF upload completed!");
    console.log(`π Summary:`);
    console.log(` β’ Processed ${pdfFiles.length} PDF files`);
    console.log(` β’ Created ${totalChunks} searchable chunks`);
    console.log(` β’ Documents are now accessible to all students`);
    console.log("\nπ Students can now ask questions about:");
    for (const file of pdfFiles) {
      console.log(` β’ ${path.parse(file).name}`);
    }
  } finally {
    // Close the shared client that carried the inserts; otherwise its open
    // connection can keep the Node process alive after the work is done.
    await connection.end();
  }
}
// Shared Postgres client (capped at one connection) and the Drizzle handle built on it;
// uploadPDFDocument performs its inserts through `db`.
const connection = postgres(process.env.POSTGRES_URL!, { max: 1 });
const db = drizzle(connection);

/** Logs an unhandled upload failure and terminates the process with exit code 1. */
function reportUploadFailure(error: unknown): never {
  console.error("β Upload failed:", error);
  process.exit(1);
}

// Run the upload
uploadKCAPDFs().catch(reportUploadFailure);