Skip to content

Commit 902e583

Browse files
committed
add: chunking store page ui
1 parent d6c8783 commit 902e583

5 files changed

Lines changed: 881 additions & 13 deletions

File tree

src/renderer/services/chunking.ts

Lines changed: 390 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,390 @@
/**
 * Chunking Service
 * Handles document chunking with various strategies for RAG pipelines
 */
5+
6+
/**
 * Names of the supported chunking strategies.
 *
 * - `"fixed-size"`: fixed-length character windows.
 * - `"sentence"`: whole sentences grouped up to the configured chunk size.
 * - `"paragraph"`: whole paragraphs grouped up to the configured chunk size.
 * - `"recursive"`: hierarchical splitting over a list of separators.
 */
export type ChunkingStrategy =
  | "fixed-size"
  | "sentence"
  | "paragraph"
  | "recursive";
11+
12+
/**
 * Options controlling how a document is split into chunks.
 */
export interface ChunkingConfig {
  /** Which chunking strategy to apply. */
  strategy: ChunkingStrategy;
  /** Target chunk size, in characters (JavaScript string length units). */
  chunkSize: number;
  /** Amount of text shared between consecutive chunks, in characters. */
  overlap: number;
  /** Separators tried in order by the recursive strategy. */
  separators?: string[];
}
18+
19+
/**
 * One chunk produced from a source document.
 */
export interface DocumentChunk {
  /** Chunk identifier; the chunking service builds it as `<documentId>-chunk-<index>`. */
  id: string;
  /** Text of the chunk. */
  content: string;
  /** Zero-based position of the chunk within its document. */
  index: number;
  /** Character offset in the source document where the chunk starts. */
  startChar: number;
  /** Character offset in the source document where the chunk ends. */
  endChar: number;
  /** Identifies the document the chunk was cut from. */
  metadata?: {
    documentId: string;
    documentName: string;
  };
}
30+
31+
class ChunkingService {
  /**
   * Default chunking configuration; caller overrides are layered on top of it.
   */
  private readonly defaultConfig: ChunkingConfig = {
    strategy: "fixed-size",
    chunkSize: 1000,
    overlap: 100,
    separators: ["\n\n", "\n", ". ", " ", ""],
  };

  /**
   * Chunk a document using the specified strategy.
   *
   * @param content - Full text of the document.
   * @param documentId - Used to build chunk ids (`<documentId>-chunk-<index>`)
   *   and copied into each chunk's metadata.
   * @param documentName - Copied into each chunk's metadata.
   * @param config - Partial overrides of the default configuration. Missing,
   *   `undefined`, or invalid numeric values are normalized (see `resolveConfig`).
   * @returns The chunks in document order; empty when `content` is empty.
   */
  chunkDocument(
    content: string,
    documentId: string,
    documentName: string,
    config: Partial<ChunkingConfig> = {}
  ): DocumentChunk[] {
    const finalConfig = this.resolveConfig(config);

    switch (finalConfig.strategy) {
      case "sentence":
        return this.sentenceChunking(content, documentId, documentName, finalConfig);
      case "paragraph":
        return this.paragraphChunking(content, documentId, documentName, finalConfig);
      case "recursive":
        return this.recursiveChunking(content, documentId, documentName, finalConfig);
      case "fixed-size":
      default:
        // Unknown strategy values (possible from untyped callers) fall back to fixed-size.
        return this.fixedSizeChunking(content, documentId, documentName, finalConfig);
    }
  }

  /**
   * Merge caller overrides with the defaults and normalize the numeric fields
   * so that no strategy can loop forever.
   *
   * - `chunkSize` that is not a finite number >= 1 falls back to the default;
   *   fractional sizes are floored.
   * - `overlap` that is not a finite positive number becomes 0; it is floored
   *   and capped at `chunkSize - 1` so every step makes forward progress.
   * - Overrides explicitly set to `undefined` do not erase the defaults.
   */
  private resolveConfig(overrides: Partial<ChunkingConfig>): ChunkingConfig {
    const defaults = this.defaultConfig;

    const requestedSize = overrides.chunkSize ?? defaults.chunkSize;
    const chunkSize =
      Number.isFinite(requestedSize) && requestedSize >= 1
        ? Math.floor(requestedSize)
        : defaults.chunkSize;

    const requestedOverlap = overrides.overlap ?? defaults.overlap;
    const overlap =
      Number.isFinite(requestedOverlap) && requestedOverlap > 0
        ? Math.min(Math.floor(requestedOverlap), chunkSize - 1)
        : 0;

    return {
      strategy: overrides.strategy ?? defaults.strategy,
      chunkSize,
      overlap,
      separators: overrides.separators ?? defaults.separators,
    };
  }

  /**
   * Helper: assemble a DocumentChunk with the id and metadata layout shared by
   * every strategy.
   */
  private createChunk(
    documentId: string,
    documentName: string,
    index: number,
    content: string,
    startChar: number,
    endChar: number
  ): DocumentChunk {
    return {
      id: `${documentId}-chunk-${index}`,
      content,
      index,
      startChar,
      endChar,
      metadata: {
        documentId,
        documentName,
      },
    };
  }

  /**
   * Fixed-size chunking with overlap.
   *
   * Each chunk is `content.slice(startChar, endChar)`. Consecutive chunks start
   * `chunkSize - overlap` characters apart. Chunking stops as soon as a chunk
   * reaches the end of the content, so no trailing chunk is emitted that would
   * lie entirely inside the previous one.
   */
  private fixedSizeChunking(
    content: string,
    documentId: string,
    documentName: string,
    config: ChunkingConfig
  ): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    const { chunkSize, overlap } = config;
    // Always advance by at least one character, even if overlap >= chunkSize.
    const step = Math.max(1, chunkSize - overlap);
    let position = 0;

    while (position < content.length) {
      const end = Math.min(position + chunkSize, content.length);
      chunks.push(
        this.createChunk(
          documentId,
          documentName,
          chunks.length,
          content.slice(position, end),
          position,
          end
        )
      );

      // The whole document is covered; anything further would be pure overlap.
      if (end >= content.length) break;
      position += step;
    }

    return chunks;
  }

  /**
   * Sentence-level chunking.
   *
   * Sentences (text ending in `.`, `!` or `?` followed by whitespace) are
   * grouped until adding the next one would exceed `chunkSize`. A new chunk
   * begins with as many trailing sentences of the previous text as fit in
   * `overlap` characters. Chunk content is the sentences joined by single
   * spaces; `startChar`/`endChar` are the offsets in `content` of the first
   * and last sentence of the chunk. A single sentence longer than `chunkSize`
   * is kept whole.
   */
  private sentenceChunking(
    content: string,
    documentId: string,
    documentName: string,
    config: ChunkingConfig
  ): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    // Split by sentence boundaries (., !, ?); the punctuation stays with the sentence.
    const sentences = this.splitTrimmed(content, /[.!?]+\s+/g);

    let currentChunk = "";
    let chunkStartChar = 0;
    let chunkEndChar = 0;

    for (const [i, sentence] of sentences.entries()) {
      const overflows =
        currentChunk.length > 0 &&
        currentChunk.length + sentence.text.length > config.chunkSize;

      if (overflows) {
        chunks.push(
          this.createChunk(
            documentId,
            documentName,
            chunks.length,
            currentChunk.trim(),
            chunkStartChar,
            chunkEndChar
          )
        );

        // Seed the next chunk with the overlap sentences for context.
        const carried = this.overlapSpans(sentences, i, config.overlap);
        currentChunk = carried.map((span) => `${span.text} `).join("");
        chunkStartChar = (carried[0] ?? sentence).start;
      } else if (currentChunk.length === 0) {
        chunkStartChar = sentence.start;
      }

      currentChunk += `${sentence.text} `;
      chunkEndChar = sentence.end;
    }

    // Add final chunk
    if (currentChunk.trim().length > 0) {
      chunks.push(
        this.createChunk(
          documentId,
          documentName,
          chunks.length,
          currentChunk.trim(),
          chunkStartChar,
          chunkEndChar
        )
      );
    }

    return chunks;
  }

  /**
   * Paragraph-level chunking.
   *
   * Paragraphs (separated by one or more blank lines, LF or CRLF) are grouped
   * until adding the next one would exceed `chunkSize`. Chunk content is the
   * trimmed paragraphs joined by a blank line; `startChar`/`endChar` are the
   * offsets in `content` of the first and last paragraph of the chunk.
   * `overlap` is not used by this strategy.
   */
  private paragraphChunking(
    content: string,
    documentId: string,
    documentName: string,
    config: ChunkingConfig
  ): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    // Split on runs of two or more line breaks; blank paragraphs are dropped.
    const paragraphs = this.splitTrimmed(content, /(?:\r?\n){2,}/g);

    let currentChunk = "";
    let chunkStartChar = 0;
    let chunkEndChar = 0;

    for (const paragraph of paragraphs) {
      const overflows =
        currentChunk.length > 0 &&
        currentChunk.length + paragraph.text.length > config.chunkSize;

      if (overflows) {
        chunks.push(
          this.createChunk(
            documentId,
            documentName,
            chunks.length,
            currentChunk.trim(),
            chunkStartChar,
            chunkEndChar
          )
        );
        currentChunk = "";
      }

      if (currentChunk.length === 0) {
        chunkStartChar = paragraph.start;
      }
      currentChunk += `${paragraph.text}\n\n`;
      chunkEndChar = paragraph.end;
    }

    // Add final chunk
    if (currentChunk.trim().length > 0) {
      chunks.push(
        this.createChunk(
          documentId,
          documentName,
          chunks.length,
          currentChunk.trim(),
          chunkStartChar,
          chunkEndChar
        )
      );
    }

    return chunks;
  }

  /**
   * Recursive chunking with hierarchical separators.
   *
   * The text is split on the first separator; pieces are packed greedily up to
   * `chunkSize`, and any piece that is still too large is split again with the
   * next separator (falling back to a plain size split when separators run
   * out). Each emitted chunk is trimmed, and `content.slice(startChar, endChar)`
   * equals the chunk content. Whitespace-only pieces are skipped.
   */
  private recursiveChunking(
    content: string,
    documentId: string,
    documentName: string,
    config: ChunkingConfig
  ): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    const separators = config.separators ?? ["\n\n", "\n", ". ", " ", ""];
    const pieces = this.splitRecursive(content, config.chunkSize, separators, 0);

    // Pieces are contiguous and keep their separators, so a running position
    // gives each piece's offset in the original content.
    let position = 0;
    for (const piece of pieces) {
      const text = piece.trim();
      if (text.length > 0) {
        const start = position + (piece.length - piece.trimStart().length);
        chunks.push(
          this.createChunk(
            documentId,
            documentName,
            chunks.length,
            text,
            start,
            start + text.length
          )
        );
      }
      position += piece.length;
    }

    return chunks;
  }

  /**
   * Helper: split `text` into pieces of at most `chunkSize` characters using
   * `separators[sepIndex]`, recursing to the next separator for oversized
   * pieces. Concatenating the returned pieces reproduces `text`.
   */
  private splitRecursive(
    text: string,
    chunkSize: number,
    separators: readonly string[],
    sepIndex: number
  ): string[] {
    if (text.length <= chunkSize) {
      return [text];
    }

    const separator = separators[sepIndex];
    if (separator === undefined) {
      // Fallback to character-level split
      return this.splitBySize(text, chunkSize);
    }

    const parts = text.split(separator);
    const result: string[] = [];
    let currentChunk = "";

    parts.forEach((part, i) => {
      // Re-attach the separator to every part except the last.
      const piece = i < parts.length - 1 ? part + separator : part;

      if (currentChunk.length + piece.length <= chunkSize) {
        currentChunk += piece;
        return;
      }

      if (currentChunk.length > 0) {
        result.push(currentChunk);
      }

      if (piece.length > chunkSize) {
        // Piece is too large, recurse with next separator
        for (const sub of this.splitRecursive(piece, chunkSize, separators, sepIndex + 1)) {
          result.push(sub);
        }
        currentChunk = "";
      } else {
        currentChunk = piece;
      }
    });

    if (currentChunk.length > 0) {
      result.push(currentChunk);
    }

    return result;
  }

  /**
   * Helper: Split text by size (character-level fallback).
   * A size below 1 is treated as 1 so the loop always advances.
   */
  private splitBySize(text: string, size: number): string[] {
    const step = Math.max(1, Math.floor(size) || 1);
    const chunks: string[] = [];
    for (let i = 0; i < text.length; i += step) {
      chunks.push(text.slice(i, i + step));
    }
    return chunks;
  }

  /**
   * Helper: cut `content` after every match of `boundary` and return the
   * trimmed, non-empty segments together with their offsets in `content`.
   *
   * @param boundary - Must carry the `g` flag and never match the empty string.
   */
  private splitTrimmed(content: string, boundary: RegExp): TextSpan[] {
    const spans: TextSpan[] = [];

    const collect = (rawStart: number, rawEnd: number): void => {
      const raw = content.slice(rawStart, rawEnd);
      const text = raw.trim();
      if (text.length === 0) return;
      const start = rawStart + (raw.length - raw.trimStart().length);
      spans.push({ text, start, end: start + text.length });
    };

    let cursor = 0;
    let match: RegExpExecArray | null;
    while ((match = boundary.exec(content)) !== null) {
      const matchEnd = match.index + match[0].length;
      collect(cursor, matchEnd);
      cursor = matchEnd;
    }

    // Add remaining content
    if (cursor < content.length) {
      collect(cursor, content.length);
    }

    return spans;
  }

  /**
   * Helper: Get overlap sentences for context.
   *
   * Walks backwards from the sentence before `currentIndex`, taking sentences
   * while their combined length (plus one joining space each) stays within
   * `overlapSize`. Returned in document order; may be empty.
   */
  private overlapSpans(
    spans: readonly TextSpan[],
    currentIndex: number,
    overlapSize: number
  ): TextSpan[] {
    let first = currentIndex;
    let used = 0;

    for (let i = currentIndex - 1; i >= 0; i--) {
      const candidate = spans[i];
      if (candidate === undefined || used + candidate.text.length > overlapSize) break;
      used += candidate.text.length + 1;
      first = i;
    }

    return spans.slice(first, currentIndex);
  }

  /**
   * Get statistics about chunks.
   *
   * @returns Chunk count plus rounded average, minimum and maximum content
   *   length; all zeros for an empty list.
   */
  getChunkStats(chunks: DocumentChunk[]): {
    totalChunks: number;
    avgChunkSize: number;
    minChunkSize: number;
    maxChunkSize: number;
  } {
    if (chunks.length === 0) {
      return {
        totalChunks: 0,
        avgChunkSize: 0,
        minChunkSize: 0,
        maxChunkSize: 0,
      };
    }

    // Single pass instead of Math.min(...sizes): spreading a very large array
    // into call arguments can throw a RangeError.
    let total = 0;
    let smallest = Number.POSITIVE_INFINITY;
    let largest = Number.NEGATIVE_INFINITY;
    for (const chunk of chunks) {
      const size = chunk.content.length;
      total += size;
      if (size < smallest) smallest = size;
      if (size > largest) largest = size;
    }

    return {
      totalChunks: chunks.length,
      avgChunkSize: Math.round(total / chunks.length),
      minChunkSize: smallest,
      maxChunkSize: largest,
    };
  }
}

/** A trimmed segment of a document and its character offsets in that document. */
interface TextSpan {
  text: string;
  start: number;
  end: number;
}
388+
389+
/** Module-level singleton instance of ChunkingService. */
const chunkingService = new ChunkingService();

export { chunkingService };
390+

0 commit comments

Comments
 (0)