iBz-04
diff --git a/src/renderer/services/chunking.ts b/src/renderer/services/chunking.ts
Lines changed: 390 additions & 0 deletions
@@ -0,0 +1,390 @@
+/**
+ * Chunking Service
+ * Handles document chunking with various strategies for RAG pipelines
+ */
+
+/**
+ * Names of the chunking algorithms a caller can select.
+ */
+export type ChunkingStrategy = "fixed-size" | "sentence" | "paragraph" | "recursive";
+
+/**
+ * Options that control how a document is split into chunks.
+ */
+export interface ChunkingConfig {
+  /** Which chunking algorithm to apply. */
+  strategy: ChunkingStrategy;
+  /** Target chunk size, in characters. */
+  chunkSize: number;
+  /**
+   * Overlap between chunks, in characters. Read by the fixed-size and
+   * sentence strategies; the paragraph and recursive strategies do not use it.
+   */
+  overlap: number;
+  /**
+   * Separators for the recursive strategy, tried in array order. When omitted,
+   * that strategy falls back to its own built-in list.
+   */
+  separators?: string[];
+}
+
+/**
+ * One chunk produced from a document.
+ */
+export interface DocumentChunk {
+  /** Chunk identifier; the service builds it as `${documentId}-chunk-${index}`. */
+  id: string;
+  /** Text of the chunk. */
+  content: string;
+  /** Zero-based position of the chunk in the list produced for its document. */
+  index: number;
+  /** Start offset recorded for the chunk, in characters. */
+  startChar: number;
+  /**
+   * End offset recorded for the chunk, in characters. For the fixed-size
+   * strategy it is exclusive: `content` is the source sliced from `startChar`
+   * to `endChar`.
+   */
+  endChar: number;
+  /** Identifies the document the chunk came from. */
+  metadata?: {
+    /** The `documentId` passed to the chunking call. */
+    documentId: string;
+    /** The `documentName` passed to the chunking call. */
+    documentName: string;
+  };
+}
+
+/**
+ * A trimmed piece of the source document together with the offsets it
+ * occupies in that document.
+ */
+interface TextSpan {
+  /** The piece with leading and trailing whitespace removed. */
+  text: string;
+  /** Offset of the first character of `text` in the source document. */
+  start: number;
+  /** Offset just past the last character of `text` in the source document. */
+  end: number;
+}
+
+/**
+ * Splits documents into chunks using one of several strategies.
+ *
+ * Offsets are string indices into the source document. Every chunk satisfies
+ * `0 <= startChar <= endChar <= content.length`. For the fixed-size and
+ * recursive strategies `chunk.content === source.slice(startChar, endChar)`.
+ * The sentence and paragraph strategies join their pieces with a normalised
+ * separator, so there the offsets delimit the region of the source the chunk
+ * covers rather than a byte-identical substring.
+ */
+class ChunkingService {
+  /** Configuration used for every option the caller does not supply. */
+  private readonly defaultConfig: ChunkingConfig = {
+    strategy: "fixed-size",
+    chunkSize: 1000,
+    overlap: 100,
+    separators: ["\n\n", "\n", ". ", " ", ""],
+  };
+
+  /**
+   * Chunk a document using the specified strategy.
+   *
+   * @param content - Full text of the document.
+   * @param documentId - Identifier used to build chunk ids and metadata.
+   * @param documentName - Name copied into each chunk's metadata.
+   * @param config - Overrides for the default configuration. Options that are
+   *   missing, `undefined` or `null` keep their default value.
+   * @returns The chunks in document order; an empty array for empty input.
+   * @throws RangeError if `chunkSize` is not a number >= 1.
+   */
+  chunkDocument(
+    content: string,
+    documentId: string,
+    documentName: string,
+    config: Partial<ChunkingConfig> = {}
+  ): DocumentChunk[] {
+    const finalConfig = this.resolveConfig(config);
+
+    switch (finalConfig.strategy) {
+      case "sentence":
+        return this.sentenceChunking(content, documentId, documentName, finalConfig);
+      case "paragraph":
+        return this.paragraphChunking(content, documentId, documentName, finalConfig);
+      case "recursive":
+        return this.recursiveChunking(content, documentId, documentName, finalConfig);
+      case "fixed-size":
+      default:
+        // Unrecognised strategies (e.g. from untyped callers) fall back to fixed-size.
+        return this.fixedSizeChunking(content, documentId, documentName, finalConfig);
+    }
+  }
+
+  /**
+   * Merge caller overrides with the defaults and make the numbers safe to use.
+   *
+   * `chunkSize` is floored and must be >= 1. `overlap` is floored and clamped
+   * into `[0, chunkSize - 1]` so every strategy is guaranteed to make forward
+   * progress, even when the default overlap exceeds a small caller-supplied
+   * chunk size.
+   */
+  private resolveConfig(overrides: Partial<ChunkingConfig>): ChunkingConfig {
+    const merged: ChunkingConfig = {
+      strategy: overrides.strategy ?? this.defaultConfig.strategy,
+      chunkSize: overrides.chunkSize ?? this.defaultConfig.chunkSize,
+      overlap: overrides.overlap ?? this.defaultConfig.overlap,
+      separators: overrides.separators ?? this.defaultConfig.separators,
+    };
+
+    // Written as a negated comparison so that NaN is rejected as well.
+    if (!(merged.chunkSize >= 1)) {
+      throw new RangeError(`chunkSize must be a number >= 1, received ${merged.chunkSize}`);
+    }
+
+    const chunkSize = Math.floor(merged.chunkSize);
+    const largestOverlap = Number.isFinite(chunkSize) ? chunkSize - 1 : Number.MAX_SAFE_INTEGER;
+    const requestedOverlap = Number.isNaN(merged.overlap) ? 0 : Math.floor(merged.overlap);
+    const overlap = Math.min(Math.max(requestedOverlap, 0), largestOverlap);
+
+    return { ...merged, chunkSize, overlap };
+  }
+
+  /**
+   * Build a chunk record with the id and metadata layout shared by all strategies.
+   */
+  private makeChunk(
+    documentId: string,
+    documentName: string,
+    index: number,
+    content: string,
+    startChar: number,
+    endChar: number
+  ): DocumentChunk {
+    return {
+      id: `${documentId}-chunk-${index}`,
+      content,
+      index,
+      startChar,
+      endChar,
+      metadata: {
+        documentId,
+        documentName,
+      },
+    };
+  }
+
+  /**
+   * Fixed-size chunking with overlap.
+   *
+   * Consecutive chunks start `chunkSize - overlap` characters apart. Chunking
+   * stops as soon as a chunk reaches the end of the document, so no trailing
+   * chunk is emitted that lies entirely inside the previous chunk's overlap.
+   */
+  private fixedSizeChunking(
+    content: string,
+    documentId: string,
+    documentName: string,
+    config: ChunkingConfig
+  ): DocumentChunk[] {
+    const chunks: DocumentChunk[] = [];
+    const { chunkSize, overlap } = config;
+    // Never step by less than one character, otherwise the loop cannot end.
+    const step = Math.max(1, chunkSize - overlap);
+
+    for (let position = 0; position < content.length; position += step) {
+      const end = Math.min(position + chunkSize, content.length);
+      chunks.push(
+        this.makeChunk(
+          documentId,
+          documentName,
+          chunks.length,
+          content.slice(position, end),
+          position,
+          end
+        )
+      );
+
+      if (end >= content.length) break;
+    }
+
+    return chunks;
+  }
+
+  /**
+   * Sentence-level chunking.
+   *
+   * Sentences end at a run of `.`, `!` or `?` followed by whitespace. Whole
+   * sentences are packed into chunks of up to `chunkSize` characters, joined
+   * by a single space. Each new chunk starts with the trailing sentences of
+   * the text before it that fit into `overlap` characters.
+   */
+  private sentenceChunking(
+    content: string,
+    documentId: string,
+    documentName: string,
+    config: ChunkingConfig
+  ): DocumentChunk[] {
+    const sentences = this.splitIntoSpans(content, /[.!?]+\s+/g);
+    return this.groupSpans(
+      sentences,
+      " ",
+      config.chunkSize,
+      config.overlap,
+      documentId,
+      documentName
+    );
+  }
+
+  /**
+   * Paragraph-level chunking.
+   *
+   * Paragraphs are separated by two or more newlines. Whole paragraphs are
+   * packed into chunks of up to `chunkSize` characters, joined by a blank
+   * line. This strategy does not apply any overlap.
+   */
+  private paragraphChunking(
+    content: string,
+    documentId: string,
+    documentName: string,
+    config: ChunkingConfig
+  ): DocumentChunk[] {
+    const paragraphs = this.splitIntoSpans(content, /\n\n+/g);
+    return this.groupSpans(paragraphs, "\n\n", config.chunkSize, 0, documentId, documentName);
+  }
+
+  /**
+   * Recursive chunking with hierarchical separators.
+   *
+   * The text is split on the first separator; any piece still larger than
+   * `chunkSize` is split again with the next separator, and so on, ending
+   * with a plain character-level split. Pieces are trimmed, whitespace-only
+   * pieces are dropped, and offsets point at the trimmed text.
+   */
+  private recursiveChunking(
+    content: string,
+    documentId: string,
+    documentName: string,
+    config: ChunkingConfig
+  ): DocumentChunk[] {
+    const chunks: DocumentChunk[] = [];
+    const { chunkSize } = config;
+    const separators = config.separators ?? this.defaultConfig.separators ?? [];
+
+    // The returned pieces, concatenated in order, always reproduce `text`
+    // exactly. The offset bookkeeping below relies on that.
+    const splitRecursive = (text: string, sepIndex: number): string[] => {
+      if (text.length <= chunkSize) {
+        return [text];
+      }
+
+      if (sepIndex >= separators.length) {
+        // Fallback to character-level split
+        return this.splitBySize(text, chunkSize);
+      }
+
+      const separator = separators[sepIndex];
+      const parts = text.split(separator);
+      const result: string[] = [];
+      let current = "";
+
+      parts.forEach((part, i) => {
+        // Keep the separator attached to the piece it terminates.
+        const piece = i < parts.length - 1 ? part + separator : part;
+
+        if (current.length + piece.length <= chunkSize) {
+          current += piece;
+          return;
+        }
+
+        if (current.length > 0) {
+          result.push(current);
+        }
+
+        if (piece.length > chunkSize) {
+          // Piece is too large, recurse with next separator
+          splitRecursive(piece, sepIndex + 1).forEach((sub) => result.push(sub));
+          current = "";
+        } else {
+          current = piece;
+        }
+      });
+
+      if (current.length > 0) {
+        result.push(current);
+      }
+
+      return result;
+    };
+
+    let position = 0;
+
+    splitRecursive(content, 0).forEach((piece) => {
+      const leading = piece.search(/\S/);
+
+      // -1 means the piece is empty or whitespace only: nothing to emit.
+      if (leading !== -1) {
+        const text = piece.trim();
+        const startChar = position + leading;
+        chunks.push(
+          this.makeChunk(
+            documentId,
+            documentName,
+            chunks.length,
+            text,
+            startChar,
+            startChar + text.length
+          )
+        );
+      }
+
+      position += piece.length;
+    });
+
+    return chunks;
+  }
+
+  /**
+   * Helper: cut `content` at every match of `boundary` and return the
+   * non-empty trimmed pieces with their offsets in `content`.
+   *
+   * Each piece runs from the end of the previous match to the end of the
+   * current one, so boundary punctuation stays with the piece it terminates.
+   *
+   * @param boundary - Must carry the `g` flag and must not match the empty
+   *   string, otherwise the scan cannot advance.
+   */
+  private splitIntoSpans(content: string, boundary: RegExp): TextSpan[] {
+    const spans: TextSpan[] = [];
+
+    const addSpan = (rawStart: number, rawEnd: number): void => {
+      const raw = content.slice(rawStart, rawEnd);
+      const leading = raw.search(/\S/);
+      if (leading === -1) return; // empty or whitespace only
+
+      const text = raw.trim();
+      const start = rawStart + leading;
+      spans.push({ text, start, end: start + text.length });
+    };
+
+    boundary.lastIndex = 0;
+    let cursor = 0;
+    let match: RegExpExecArray | null;
+
+    while ((match = boundary.exec(content)) !== null) {
+      const boundaryEnd = match.index + match[0].length;
+      addSpan(cursor, boundaryEnd);
+      cursor = boundaryEnd;
+    }
+
+    // Add remaining content
+    if (cursor < content.length) {
+      addSpan(cursor, content.length);
+    }
+
+    return spans;
+  }
+
+  /**
+   * Helper: pack spans into chunks of up to `chunkSize` characters.
+   *
+   * Length is counted as each span's text plus one `joiner`. A span that does
+   * not fit closes the current chunk; a single span longer than `chunkSize`
+   * is never split and becomes an oversized chunk. `startChar`/`endChar` are
+   * the offsets of the first and last span in the chunk.
+   */
+  private groupSpans(
+    spans: readonly TextSpan[],
+    joiner: string,
+    chunkSize: number,
+    overlap: number,
+    documentId: string,
+    documentName: string
+  ): DocumentChunk[] {
+    const chunks: DocumentChunk[] = [];
+    let texts: string[] = [];
+    let length = 0;
+    let startChar = 0;
+    let endChar = 0;
+
+    const append = (span: TextSpan): void => {
+      if (texts.length === 0) startChar = span.start;
+      texts.push(span.text);
+      length += span.text.length + joiner.length;
+      endChar = span.end;
+    };
+
+    const flush = (): void => {
+      if (texts.length === 0) return;
+      chunks.push(
+        this.makeChunk(
+          documentId,
+          documentName,
+          chunks.length,
+          texts.join(joiner),
+          startChar,
+          endChar
+        )
+      );
+      texts = [];
+      length = 0;
+    };
+
+    spans.forEach((span, i) => {
+      if (length > 0 && length + span.text.length > chunkSize) {
+        flush();
+        // Start the new chunk with trailing context from the preceding text.
+        this.collectOverlap(spans, i, overlap, joiner.length).forEach((carried) =>
+          append(carried)
+        );
+      }
+      append(span);
+    });
+
+    flush();
+    return chunks;
+  }
+
+  /**
+   * Helper: collect the spans immediately before `currentIndex` that fit into
+   * `overlapSize` characters, in document order.
+   *
+   * Walks backwards and stops at the first span that would push the running
+   * total (text plus one joiner per span already taken) over the budget.
+   */
+  private collectOverlap(
+    spans: readonly TextSpan[],
+    currentIndex: number,
+    overlapSize: number,
+    joinerLength: number
+  ): TextSpan[] {
+    let from = currentIndex;
+    let used = 0;
+
+    while (from > 0) {
+      const candidate = spans[from - 1];
+      if (used + candidate.text.length > overlapSize) break;
+      used += candidate.text.length + joinerLength;
+      from--;
+    }
+
+    return spans.slice(from, currentIndex);
+  }
+
+  /**
+   * Helper: Split text by size (character-level fallback).
+   *
+   * A size below 1 is treated as 1 so the loop always advances.
+   */
+  private splitBySize(text: string, size: number): string[] {
+    const step = size >= 1 ? Math.floor(size) : 1;
+    const pieces: string[] = [];
+    for (let i = 0; i < text.length; i += step) {
+      pieces.push(text.slice(i, i + step));
+    }
+    return pieces;
+  }
+
+  /**
+   * Get statistics about chunks.
+   *
+   * Sizes are the lengths of each chunk's `content`. The average is rounded
+   * to the nearest integer. All values are 0 for an empty list.
+   */
+  getChunkStats(chunks: DocumentChunk[]): {
+    totalChunks: number;
+    avgChunkSize: number;
+    minChunkSize: number;
+    maxChunkSize: number;
+  } {
+    if (chunks.length === 0) {
+      return {
+        totalChunks: 0,
+        avgChunkSize: 0,
+        minChunkSize: 0,
+        maxChunkSize: 0,
+      };
+    }
+
+    // Single pass instead of Math.min(...sizes): spreading a very large array
+    // into a call can exceed the engine's argument limit.
+    let total = 0;
+    let smallest = Number.POSITIVE_INFINITY;
+    let largest = 0;
+
+    chunks.forEach((chunk) => {
+      const size = chunk.content.length;
+      total += size;
+      if (size < smallest) smallest = size;
+      if (size > largest) largest = size;
+    });
+
+    return {
+      totalChunks: chunks.length,
+      avgChunkSize: Math.round(total / chunks.length),
+      minChunkSize: smallest,
+      maxChunkSize: largest,
+    };
+  }
+}
+
+/** Module-level `ChunkingService` instance, created when this module is loaded and exported. */
+export const chunkingService = new ChunkingService();
+