forked from joseairosa/recall
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerator.ts
More file actions
153 lines (130 loc) · 4.28 KB
/
Copy pathgenerator.ts
File metadata and controls
153 lines (130 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import Anthropic from '@anthropic-ai/sdk';
/** Module-level cache for the Anthropic SDK client; stays `null` until first use. */
let anthropicClient: Anthropic | null = null;

/**
 * Returns the shared Anthropic client, constructing it on the first call.
 *
 * @returns The cached client instance.
 * @throws Error when `ANTHROPIC_API_KEY` is unset or empty in the environment.
 */
function getAnthropicClient(): Anthropic {
  // Fast path: a client was already built by an earlier call.
  if (anthropicClient) {
    return anthropicClient;
  }

  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY environment variable is required');
  }

  const created = new Anthropic({ apiKey });
  anthropicClient = created;
  return created;
}
/**
 * Asks Claude for a short list of key concepts describing `text`.
 *
 * Claude has no native embeddings API, so this keyword list serves as a
 * lightweight semantic "fingerprint" instead.
 *
 * @param text - The text to summarise into keywords.
 * @returns Trimmed, lower-cased, non-empty keywords split on commas, or an
 *   empty array when the first content block of the response is not text.
 * @throws Rethrows any error raised while obtaining the client or calling the
 *   API, after logging it.
 */
async function generateSemanticFingerprint(text: string): Promise<string[]> {
  try {
    const response = await getAnthropicClient().messages.create({
      model: 'claude-3-5-haiku-20241022', // Fast, cheap model for this task
      max_tokens: 200,
      messages: [
        {
          role: 'user',
          content: `Extract 5-10 key concepts/keywords from this text. Return ONLY a comma-separated list, no explanations:
${text}`,
        },
      ],
    });

    // Only the first content block is inspected.
    const firstBlock = response.content[0];
    if (firstBlock.type !== 'text') {
      return [];
    }

    // Split the comma-separated reply and drop blank entries.
    const keywords: string[] = [];
    for (const piece of firstBlock.text.split(',')) {
      const cleaned = piece.trim().toLowerCase();
      if (cleaned.length > 0) {
        keywords.push(cleaned);
      }
    }
    return keywords;
  } catch (error) {
    console.error('Error generating semantic fingerprint:', error);
    throw error;
  }
}
/**
 * Builds a vector representation of `text`.
 *
 * The keywords extracted by Claude are combined with the text's character
 * trigrams by `createSimpleVector`.
 *
 * @param text - The text to embed.
 * @returns The vector produced by `createSimpleVector`.
 * @throws Rethrows any error from keyword extraction or vector construction,
 *   after logging it.
 */
export async function generateEmbedding(text: string): Promise<number[]> {
  try {
    // Semantic keywords first, then fold them together with the raw text.
    return createSimpleVector(text, await generateSemanticFingerprint(text));
  } catch (error) {
    console.error('Error generating embedding:', error);
    throw error;
  }
}
/**
 * Embeds several texts concurrently.
 *
 * @param texts - The texts to embed.
 * @returns One embedding per input, in the same order as `texts`.
 * @throws Rethrows the first rejection from `generateEmbedding`, after logging it.
 */
export async function generateEmbeddings(texts: string[]): Promise<number[][]> {
  try {
    // Start every request up front so they run in parallel.
    const pending = texts.map((text) => generateEmbedding(text));
    return await Promise.all(pending);
  } catch (error) {
    console.error('Error generating embeddings:', error);
    throw error;
  }
}
/**
 * Builds a 128-dimensional, L2-normalised vector from text and keywords.
 *
 * Layout:
 *  - indices 0..63: counts of hashed character trigrams of the lower-cased
 *    text (only the first 64 trigrams are counted);
 *  - indices 64..127: hashed keywords, each contributing a weight of 2.
 *
 * @param text - Source text; lower-cased before trigram extraction.
 * @param keywords - Keywords hashed into the upper half of the vector.
 * @returns The normalised vector, or an all-zero vector when nothing was counted.
 */
function createSimpleVector(text: string, keywords: string[]): number[] {
  const HALF = 64;
  const components: number[] = Array.from({ length: 2 * HALF }, () => 0);

  // Lower half: bucket the leading trigrams of the case-folded text.
  const leadingTrigrams = extractTrigrams(text.toLowerCase()).slice(0, HALF);
  leadingTrigrams.forEach((gram) => {
    components[simpleHash(gram) % HALF] += 1;
  });

  // Upper half: keywords weigh twice as much as a single trigram hit.
  keywords.forEach((keyword) => {
    components[HALF + (simpleHash(keyword) % HALF)] += 2;
  });

  // Scale to unit length unless the vector is entirely zero.
  let sumOfSquares = 0;
  for (const component of components) {
    sumOfSquares += component * component;
  }
  const magnitude = Math.sqrt(sumOfSquares);

  return magnitude > 0
    ? components.map((component) => component / magnitude)
    : components;
}
/**
 * Lists every run of three consecutive UTF-16 code units in `text`.
 *
 * @param text - The string to slide a three-character window over.
 * @returns The trigrams in order of position; empty when `text` has fewer
 *   than three characters.
 */
function extractTrigrams(text: string): string[] {
  const windowCount = Math.max(0, text.length - 2);
  return Array.from({ length: windowCount }, (_unused, start) =>
    text.slice(start, start + 3),
  );
}
/**
 * Computes a non-negative rolling hash of a string.
 *
 * Each UTF-16 code unit is folded in as `hash * 31 + codeUnit`, wrapped to a
 * signed 32-bit integer after every step; the absolute value is returned.
 *
 * @param str - The string to hash.
 * @returns A non-negative integer hash; 0 for the empty string.
 */
function simpleHash(str: string): number {
  let acc = 0;
  let position = 0;
  while (position < str.length) {
    // Math.imul gives the 32-bit wrapped product; `| 0` re-wraps after the add.
    acc = (Math.imul(31, acc) + str.charCodeAt(position)) | 0;
    position += 1;
  }
  return Math.abs(acc);
}
/**
 * Computes the cosine similarity of two equal-length vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns The dot product divided by the product of the two magnitudes.
 *   Returns 0 when either vector has zero magnitude (including two empty
 *   vectors), where the quotient would otherwise be NaN.
 * @throws Error when the vectors differ in length.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error('Vectors must have the same length');
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero-magnitude vector has no direction; report "no similarity" rather
  // than letting 0/0 yield NaN, which would poison comparisons and sorting.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}