Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/document-utils-integration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@voltagent/documents": minor
"@voltagent/core": minor
---

feat: implement document chunking and embedding utilities and integrate into core
3 changes: 2 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"@opentelemetry/sdk-trace-base": "^2.0.0",
"@opentelemetry/sdk-trace-node": "^2.0.0",
"@opentelemetry/semantic-conventions": "^1.28.0",
"@voltagent/documents": "workspace:*",
"@voltagent/internal": "^0.0.12",
"ts-pattern": "^5.7.1",
"type-fest": "^4.41.0",
Expand Down Expand Up @@ -73,4 +74,4 @@
"typecheck": "tsc --noEmit"
},
"types": "dist/index.d.ts"
}
}
73 changes: 73 additions & 0 deletions packages/core/src/retriever/document-retriever.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import { DocumentProcessor, type ProcessedDocument } from "@voltagent/documents";
import type { BaseMessage } from "../agent/providers";
import { BaseRetriever } from "./retriever";
import type { RetrieveOptions, RetrieverOptions } from "./types";

/**
 * Configuration for a document-ingesting retriever.
 * Extends the base retriever options with document-processing settings.
 */
export interface DocumentRetrieverOptions extends RetrieverOptions {
  /**
   * Optional custom document processor.
   * If not provided, a default one will be created.
   */
  processor?: DocumentProcessor;
}

/**
* Abstract base class for retrievers that handle document ingestion and vector search.
*/
export abstract class DocumentRetriever extends BaseRetriever {
protected processor: DocumentProcessor;

// Accepts an optional processor; otherwise builds a DocumentProcessor with its defaults.
constructor(options: DocumentRetrieverOptions = {}) {
  super(options);
  // Caller-supplied processor wins; fall back to default chunking/embedding.
  this.processor = options.processor || new DocumentProcessor();
}

/**
 * Ingests raw text: splits it into chunks via the processor, embeds each
 * chunk, and persists the result through {@link upsertDocuments}.
 *
 * @param text The raw text to ingest
 * @param metadata Optional metadata attached to every produced chunk
 */
async ingest(text: string, metadata?: Record<string, any>): Promise<void> {
  this.logger.debug("Ingesting document text", { length: text.length });
  const processed = await this.processor.process(text, metadata);
  await this.upsertDocuments(processed);
  this.logger.debug("Document ingestion complete", { chunks: processed.length });
}

/**
 * Stores processed documents in the underlying storage (e.g., a Vector DB).
 * @param documents The processed documents containing embeddings and metadata
 * @returns Resolves once all documents have been persisted
 */
abstract upsertDocuments(documents: ProcessedDocument[]): Promise<void>;

/**
 * Retrieves the stored documents nearest to a query vector.
 * Helper used by the default {@link retrieve} implementation.
 * @param vector The query vector
 * @param k Number of results to return
 * @returns The top-k matching documents, ordering defined by the implementation
 */
abstract queryVectors(vector: number[], k: number): Promise<ProcessedDocument[]>;

/**
* Default implementation of retrieve that embeds the query and searches vectors.
* Can be overridden if needed.
*/
async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
if (Array.isArray(input) && input.length === 0) {
return "";
}
const textQuery = typeof input === "string" ? input : input[input.length - 1].content;

// We assume the processor's embedder has an embedQuery method.
// Since DocumentProcessor exposes 'embedder', we can use it.
const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
Comment on lines +59 to +63
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Validate content type before embedding.

The content property of BaseMessage may not always be a string (e.g., structured content in multi-modal messages). The cast on line 63 could mask runtime issues if content is an object or undefined.

Suggested defensive handling
-    const textQuery = typeof input === "string" ? input : input[input.length - 1].content;
-
-    // We assume the processor's embedder has an embedQuery method.
-    // Since DocumentProcessor exposes 'embedder', we can use it.
-    const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
+    let textQuery: string;
+    if (typeof input === "string") {
+      textQuery = input;
+    } else {
+      const lastContent = input[input.length - 1].content;
+      textQuery = typeof lastContent === "string" ? lastContent : String(lastContent);
+    }
+
+    const queryVector = await this.processor.embedder.embedQuery(textQuery);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const textQuery = typeof input === "string" ? input : input[input.length - 1].content;
// We assume the processor's embedder has an embedQuery method.
// Since DocumentProcessor exposes 'embedder', we can use it.
const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
let textQuery: string;
if (typeof input === "string") {
textQuery = input;
} else {
const lastContent = input[input.length - 1].content;
textQuery = typeof lastContent === "string" ? lastContent : String(lastContent);
}
const queryVector = await this.processor.embedder.embedQuery(textQuery);
🤖 Prompt for AI Agents
In @packages/core/src/retriever/document-retriever.ts around lines 59 - 63, The
code assumes the selected BaseMessage content is a string and casts it before
calling processor.embedder.embedQuery; instead validate the content type from
input (and the last BaseMessage) before embedding: ensure textQuery is a string,
handle undefined or non-string content by extracting a text field if present
(e.g., content.text), falling back to a safe serialization like
JSON.stringify(content) or returning/logging an error and not calling
embedQuery; replace the direct cast on textQuery and guard the call to
this.processor.embedder.embedQuery accordingly to avoid runtime crashes.


// Default top-k to 4, can be customizable via options
const k = (options as any).k ?? 4;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid any cast; extend RetrieveOptions with proper typing.

Using (options as any).k bypasses type safety. Consider extending RetrieveOptions or creating a dedicated options type for vector retrieval.

Proposed fix

Define proper types for the k parameter:

+export interface DocumentRetrieveOptions extends RetrieveOptions {
+  /**
+   * Number of top results to return from vector search.
+   * @default 4
+   */
+  k?: number;
+}
+
 export abstract class DocumentRetriever extends BaseRetriever {
   // ...
-  async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
+  async retrieve(input: string | BaseMessage[], options: DocumentRetrieveOptions = {}): Promise<string> {
     // ...
-    const k = (options as any).k ?? 4;
+    const k = options.k ?? 4;
🤖 Prompt for AI Agents
In @packages/core/src/retriever/document-retriever.ts at line 66, The code uses
a cast to any to read options.k; instead extend the RetrieveOptions type (or
create a new VectorRetrieveOptions that extends RetrieveOptions) to include an
optional k?: number, update the method signature to accept that typed options,
and replace the cast line with a typed access (e.g., destructure or read
options.k with a default of 4) so type safety is preserved for the k parameter
in DocumentRetriever/document-retrieval logic.


const results = await this.queryVectors(queryVector, k);

// Join the text of the results
return results.map((doc) => doc.text).join("\n\n");
}
}
3 changes: 3 additions & 0 deletions packages/core/src/retriever/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
* @module retriever
*/

export type { ProcessedDocument } from "@voltagent/documents";

export { BaseRetriever } from "./retriever";
export { DocumentRetriever, type DocumentRetrieverOptions } from "./document-retriever";
export type { Retriever, RetrieverOptions, RetrieveOptions } from "./types";
export {
VoltAgentRagRetriever,
Expand Down
71 changes: 71 additions & 0 deletions packages/documents/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# @voltagent/documents

Utilities for document processing, chunking, and embedding generation.

## Installation

```bash
pnpm add @voltagent/documents
```

## Usage

### Text Splitting

Use `RecursiveCharacterTextSplitter` to split text into chunks while preserving context.

```typescript
import { RecursiveCharacterTextSplitter } from "@voltagent/documents";

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 200,
});

const text = "Your long text here...";
const chunks = await splitter.splitText(text);
```

### Embeddings

Use `OpenAIEmbeddingModel` to generate embeddings for your text.

```typescript
import { OpenAIEmbeddingModel } from "@voltagent/documents";

const embedder = new OpenAIEmbeddingModel({
apiKey: process.env.OPENAI_API_KEY, // Optional if set in env
model: "text-embedding-ada-002", // Default
});

const embedding = await embedder.embedQuery("Hello world");
```

### Document Processor

The `DocumentProcessor` combines splitting and embedding.

```typescript
import { DocumentProcessor } from "@voltagent/documents";

const processor = new DocumentProcessor();
// Or with custom splitter/embedder:
// const processor = new DocumentProcessor(customSplitter, customEmbedder);

const documents = await processor.process("Long text content...", {
source: "example.txt",
author: "Me",
});

/*
Returns:
[
{
text: "chunk 1...",
embedding: [0.123, ...],
metadata: { source: "example.txt", author: "Me", chunkIndex: 0, ... }
},
...
]
*/
```
38 changes: 38 additions & 0 deletions packages/documents/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"name": "@voltagent/documents",
"version": "0.0.1",
"description": "Document processing and embedding utilities for VoltAgent",
"main": "dist/index.js",
"module": "dist/index.mjs",
"types": "dist/index.d.ts",
"exports": {
".": {
"import": {
"types": "./dist/index.d.mts",
"default": "./dist/index.mjs"
},
"require": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
}
}
},
"files": [
"dist"
],
"scripts": {
"build": "tsup",
"dev": "tsup --watch",
"typecheck": "tsc --noEmit",
"test": "vitest run"
},
"dependencies": {
"openai": "^4.20.0"
},
Comment on lines +29 to +31
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# Check latest openai version and release info
curl -s https://registry.npmjs.org/openai | jq -r '.["dist-tags"] | to_entries[] | "\(.key): \(.value)"'

Repository: VoltAgent/voltagent

Length of output: 137


🌐 Web query:

openai npm package latest version 2025 security vulnerabilities

💡 Result:

Summary (as of Jan 13, 2026):

  • Latest official "openai" npm package (registry) — see package page on npm for the current published version. No direct vulnerabilities are listed for the package in Snyk's database. [1][2]

  • Snyk security scan: "No known security issues" for the latest openai release (Snyk package page). [1]

  • Official npm registry: package/version history and tags are on npmjs (check npm package page to confirm the exact version number you have/plan to install). [2]

Recommended actions:

  • Run npm audit and/or Snyk/Dependabot against your project before installing/upgrading.
  • Inspect your dependency tree for transitive vulnerabilities (audit covers those).
  • If you need, I can check the exact latest version number right now and scan advisories (CVE/GitHub/GitLab) for any security reports — shall I do that?

Sources:
[1] Snyk — openai package security page.
[2] npmjs — openai package page and version listings.


Update OpenAI SDK to a supported version.

The package depends on openai@^4.20.0, which is 2 major versions behind the latest release (6.16.0). While the current version has no known security vulnerabilities, the caret constraint prevents adoption of newer versions. Consider upgrading to a recent version within the 6.x line to benefit from API improvements and continued maintenance.

🤖 Prompt for AI Agents
In @packages/documents/package.json around lines 29 - 31, The package.json
currently pins the "openai" dependency to "^4.20.0"; update that dependency to a
supported 6.x release (e.g., "^6.16.0") in the "dependencies" entry for
"openai", then run your package manager (npm/yarn/pnpm) to install and update
lockfiles; after upgrading, run tests and fix any breaking API changes in code
that uses the OpenAI SDK (search for imports/usages of "openai" and update
client construction and method names per the 6.x migration guide).

"devDependencies": {
"tsup": "^8.5.0",
"typescript": "^5.8.2",
"vitest": "^3.2.4",
"@types/node": "^24.2.1"
}
}
44 changes: 44 additions & 0 deletions packages/documents/src/DocumentProcessor.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import { describe, expect, it } from "vitest";
import { DocumentProcessor } from "./DocumentProcessor";
import type { EmbeddingModel } from "./embeddings/EmbeddingModel";
import { TextSplitter } from "./text-splitters/TextSplitter";

// Minimal splitter stub: treats "|" as the chunk boundary so tests can
// control chunking deterministically.
class MockSplitter extends TextSplitter {
  async splitText(text: string): Promise<string[]> {
    return text.split("|");
  }
}

// Embedder stub: every input maps to the same fixed 2-dimensional vector,
// making embedding output fully predictable in tests.
class MockEmbedder implements EmbeddingModel {
  private static readonly FIXED_VECTOR: number[] = [0.1, 0.2];

  async embedQuery(_text: string): Promise<number[]> {
    return [...MockEmbedder.FIXED_VECTOR];
  }

  async embedDocuments(documents: string[]): Promise<number[][]> {
    return documents.map(() => [...MockEmbedder.FIXED_VECTOR]);
  }
}

describe("DocumentProcessor", () => {
  it("processes text into documents with embeddings", async () => {
    // MockSplitter splits on "|", so "part1|part2" yields exactly two chunks.
    const processor = new DocumentProcessor(new MockSplitter(), new MockEmbedder());
    const result = await processor.process("part1|part2", { file: "test.txt" });

    expect(result).toHaveLength(2);

    // Each chunk carries the caller metadata plus bookkeeping fields.
    expect(result[0].text).toBe("part1");
    expect(result[0].embedding).toEqual([0.1, 0.2]);
    expect(result[0].metadata).toEqual({
      file: "test.txt",
      chunkIndex: 0,
      chunkCount: 2,
    });

    expect(result[1].text).toBe("part2");
    expect(result[1].embedding).toEqual([0.1, 0.2]);
    expect(result[1].metadata).toEqual({
      file: "test.txt",
      chunkIndex: 1,
      chunkCount: 2,
    });
  });
});
35 changes: 35 additions & 0 deletions packages/documents/src/DocumentProcessor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import type { EmbeddingModel } from "./embeddings/EmbeddingModel";
import { OpenAIEmbeddingModel } from "./embeddings/OpenAIEmbeddingModel";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";
import type { TextSplitter } from "./text-splitters/TextSplitter";

/**
 * A single text chunk together with its embedding vector and
 * combined (caller-supplied + chunk bookkeeping) metadata.
 */
export interface ProcessedDocument {
  /** The chunk's raw text. */
  text: string;
  /** Embedding vector for `text`, produced by the configured embedder. */
  embedding: number[];
  /** Caller metadata merged with `chunkIndex` / `chunkCount`. */
  metadata?: Record<string, any>;
}

export class DocumentProcessor {
splitter: TextSplitter;
embedder: EmbeddingModel;

// Both collaborators are injectable for testing; defaults cover typical usage.
constructor(splitter?: TextSplitter, embedder?: EmbeddingModel) {
  this.splitter = splitter ?? new RecursiveCharacterTextSplitter();
  this.embedder = embedder ?? new OpenAIEmbeddingModel();
}

async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);

return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
Comment on lines +21 to +34
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Potential undefined embedding if array lengths mismatch.

If embedder.embedDocuments(chunks) returns fewer embeddings than chunks (due to an API error or implementation bug), embeddings[index] will be undefined, leading to corrupted ProcessedDocument objects.

🛡️ Proposed defensive check
 async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
   const chunks = await this.splitter.splitText(text);
   const embeddings = await this.embedder.embedDocuments(chunks);

+  if (embeddings.length !== chunks.length) {
+    throw new Error(
+      `Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
+    );
+  }
+
   return chunks.map((chunk, index) => ({
     text: chunk,
     embedding: embeddings[index],
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
if (embeddings.length !== chunks.length) {
throw new Error(
`Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
);
}
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
🤖 Prompt for AI Agents
In @packages/documents/src/DocumentProcessor.ts around lines 21 - 34, In
process, guard against embedder.embedDocuments returning fewer items than
chunks: after const embeddings = await this.embedder.embedDocuments(chunks);
check that embeddings is an array and embeddings.length === chunks.length (or at
least >= chunks.length); if not, either throw a clear error or fill missing
entries with a safe fallback (e.g., null vector or empty embedding) and log the
mismatch via the class logger; ensure the returned ProcessedDocument objects use
validated/fallback embeddings so embeddings[index] cannot be undefined.

}
76 changes: 76 additions & 0 deletions packages/documents/src/RecursiveCharacterTextSplitter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import { describe, expect, it } from "vitest";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";

describe("RecursiveCharacterTextSplitter", () => {
  it("splits text based on characters", async () => {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
    });
    const text = "abcdefghijklmnopqrstuvwxyz";
    const chunks = await splitter.splitText(text);
    // With no usable separator, text is cut into fixed-size 10-char windows;
    // the final chunk holds the 6-char remainder.
    expect(chunks).toEqual(["abcdefghij", "klmnopqrst", "uvwxyz"]);
  });

  it("splits text with simple separator", async () => {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
      separators: [" "],
    });
    const text = "hello world how are you";
    // Words are accumulated greedily (word + joining space) while the chunk
    // stays within chunkSize (10):
    //   "hello" + " world" would be 11 chars -> flush "hello"
    //   "world" + " how"   = 9  -> keep accumulating
    //   "world how" + " are" = 13 -> flush "world how"
    //   "are" + " you" = 7 -> final chunk "are you"
    const chunks = await splitter.splitText(text);

    expect(chunks).toEqual(["hello", "world how", "are you"]);
  });

  it("handles recursion with multiple separators", async () => {
    // Exercises the recursive fallback to finer-grained separators.
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      separators: ["\n", " "],
    });
    const text = "Para1 is longer than 20 chars\nPara2 is short";
    const chunks = await splitter.splitText(text);

    // Split on "\n" first:
    //   "Para1 is longer than 20 chars" (29 chars) exceeds 20 -> recurse on " "
    //   "Para2 is short" (14 chars) fits as-is.
    // The recursion accumulates words up to the limit:
    //   "Para1 is longer than" is exactly 20 chars -> flush
    //   "20 chars" forms the next chunk.

    expect(chunks).toEqual(["Para1 is longer than", "20 chars", "Para2 is short"]);
  });
});
4 changes: 4 additions & 0 deletions packages/documents/src/embeddings/EmbeddingModel.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/**
 * Contract for embedding backends used by the document pipeline.
 */
export interface EmbeddingModel {
  /** Embeds a single query string into one vector. */
  embedQuery(text: string): Promise<number[]>;
  /** Embeds a batch of documents; returns one vector per input, in order. */
  embedDocuments(documents: string[]): Promise<number[][]>;
}
Loading
Loading