Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/document-utils-integration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@voltagent/documents": minor
"@voltagent/core": minor
---

feat: implement document chunking and embedding utilities and integrate into core
3 changes: 2 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"@opentelemetry/sdk-trace-base": "^2.0.0",
"@opentelemetry/sdk-trace-node": "^2.0.0",
"@opentelemetry/semantic-conventions": "^1.28.0",
"@voltagent/documents": "workspace:*",
"@voltagent/internal": "^0.0.12",
"ts-pattern": "^5.7.1",
"type-fest": "^4.41.0",
Expand Down Expand Up @@ -73,4 +74,4 @@
"typecheck": "tsc --noEmit"
},
"types": "dist/index.d.ts"
}
}
73 changes: 73 additions & 0 deletions packages/core/src/retriever/document-retriever.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import { DocumentProcessor, type ProcessedDocument } from "@voltagent/documents";
import type { BaseMessage } from "../agent/providers";
import { BaseRetriever } from "./retriever";
import type { RetrieveOptions, RetrieverOptions } from "./types";

/**
 * Configuration for a document-ingesting retriever.
 * Extends the base retriever options with document-processing settings.
 */
export interface DocumentRetrieverOptions extends RetrieverOptions {
  /**
   * Optional custom document processor.
   * If not provided, a default one will be created.
   */
  processor?: DocumentProcessor;
}

/**
* Abstract base class for retrievers that handle document ingestion and vector search.
*/
export abstract class DocumentRetriever extends BaseRetriever {
protected processor: DocumentProcessor;

// Accepts an optional processor; otherwise builds a DocumentProcessor with its defaults.
constructor(options: DocumentRetrieverOptions = {}) {
  super(options);
  // Caller-supplied processor wins; fall back to default chunking/embedding.
  this.processor = options.processor || new DocumentProcessor();
}

/**
 * Ingests raw text: splits it into chunks via the processor, embeds each
 * chunk, and persists the result through {@link upsertDocuments}.
 *
 * @param text The raw text to ingest
 * @param metadata Optional metadata attached to every produced chunk
 */
async ingest(text: string, metadata?: Record<string, any>): Promise<void> {
  this.logger.debug("Ingesting document text", { length: text.length });
  const processed = await this.processor.process(text, metadata);
  await this.upsertDocuments(processed);
  this.logger.debug("Document ingestion complete", { chunks: processed.length });
}

/**
 * Stores processed documents in the underlying storage (e.g., a Vector DB).
 * @param documents The processed documents containing embeddings and metadata
 * @returns Resolves once all documents have been persisted
 */
abstract upsertDocuments(documents: ProcessedDocument[]): Promise<void>;

/**
 * Retrieves the stored documents nearest to a query vector.
 * Helper used by the default {@link retrieve} implementation.
 * @param vector The query vector
 * @param k Number of results to return
 * @returns The top-k matching documents, ordering defined by the implementation
 */
abstract queryVectors(vector: number[], k: number): Promise<ProcessedDocument[]>;

/**
* Default implementation of retrieve that embeds the query and searches vectors.
* Can be overridden if needed.
*/
async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
if (Array.isArray(input) && input.length === 0) {
return "";
}
const textQuery = typeof input === "string" ? input : input[input.length - 1].content;

// We assume the processor's embedder has an embedQuery method.
// Since DocumentProcessor exposes 'embedder', we can use it.
const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
Comment on lines +59 to +63
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Validate content type before embedding.

The content property of BaseMessage may not always be a string (e.g., structured content in multi-modal messages). The cast on line 63 could mask runtime issues if content is an object or undefined.

Suggested defensive handling
-    const textQuery = typeof input === "string" ? input : input[input.length - 1].content;
-
-    // We assume the processor's embedder has an embedQuery method.
-    // Since DocumentProcessor exposes 'embedder', we can use it.
-    const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
+    let textQuery: string;
+    if (typeof input === "string") {
+      textQuery = input;
+    } else {
+      const lastContent = input[input.length - 1].content;
+      textQuery = typeof lastContent === "string" ? lastContent : String(lastContent);
+    }
+
+    const queryVector = await this.processor.embedder.embedQuery(textQuery);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const textQuery = typeof input === "string" ? input : input[input.length - 1].content;
// We assume the processor's embedder has an embedQuery method.
// Since DocumentProcessor exposes 'embedder', we can use it.
const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
let textQuery: string;
if (typeof input === "string") {
textQuery = input;
} else {
const lastContent = input[input.length - 1].content;
textQuery = typeof lastContent === "string" ? lastContent : String(lastContent);
}
const queryVector = await this.processor.embedder.embedQuery(textQuery);
🤖 Prompt for AI Agents
In @packages/core/src/retriever/document-retriever.ts around lines 59 - 63, The
code assumes the selected BaseMessage content is a string and casts it before
calling processor.embedder.embedQuery; instead validate the content type from
input (and the last BaseMessage) before embedding: ensure textQuery is a string,
handle undefined or non-string content by extracting a text field if present
(e.g., content.text), falling back to a safe serialization like
JSON.stringify(content) or returning/logging an error and not calling
embedQuery; replace the direct cast on textQuery and guard the call to
this.processor.embedder.embedQuery accordingly to avoid runtime crashes.


// Default top-k to 4, can be customizable via options
const k = (options as any).k ?? 4;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid any cast; extend RetrieveOptions with proper typing.

Using (options as any).k bypasses type safety. Consider extending RetrieveOptions or creating a dedicated options type for vector retrieval.

Proposed fix

Define proper types for the k parameter:

+export interface DocumentRetrieveOptions extends RetrieveOptions {
+  /**
+   * Number of top results to return from vector search.
+   * @default 4
+   */
+  k?: number;
+}
+
 export abstract class DocumentRetriever extends BaseRetriever {
   // ...
-  async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
+  async retrieve(input: string | BaseMessage[], options: DocumentRetrieveOptions = {}): Promise<string> {
     // ...
-    const k = (options as any).k ?? 4;
+    const k = options.k ?? 4;
🤖 Prompt for AI Agents
In @packages/core/src/retriever/document-retriever.ts at line 66, The code uses
a cast to any to read options.k; instead extend the RetrieveOptions type (or
create a new VectorRetrieveOptions that extends RetrieveOptions) to include an
optional k?: number, update the method signature to accept that typed options,
and replace the cast line with a typed access (e.g., destructure or read
options.k with a default of 4) so type safety is preserved for the k parameter
in DocumentRetriever/document-retrieval logic.


const results = await this.queryVectors(queryVector, k);

// Join the text of the results
return results.map((doc) => doc.text).join("\n\n");
}
}
3 changes: 3 additions & 0 deletions packages/core/src/retriever/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
* @module retriever
*/

export type { ProcessedDocument } from "@voltagent/documents";

export { BaseRetriever } from "./retriever";
export { DocumentRetriever, type DocumentRetrieverOptions } from "./document-retriever";
export type { Retriever, RetrieverOptions, RetrieveOptions } from "./types";
export {
VoltAgentRagRetriever,
Expand Down
71 changes: 71 additions & 0 deletions packages/documents/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# @voltagent/documents

Utilities for document processing, chunking, and embedding generation.

## Installation

```bash
pnpm add @voltagent/documents
```

## Usage

### Text Splitting

Use `RecursiveCharacterTextSplitter` to split text into chunks while preserving context.

```typescript
import { RecursiveCharacterTextSplitter } from "@voltagent/documents";

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 200,
});

const text = "Your long text here...";
const chunks = await splitter.splitText(text);
```

### Embeddings

Use `OpenAIEmbeddingModel` to generate embeddings for your text.

```typescript
import { OpenAIEmbeddingModel } from "@voltagent/documents";

const embedder = new OpenAIEmbeddingModel({
apiKey: process.env.OPENAI_API_KEY, // Optional if set in env
model: "text-embedding-ada-002", // Default
});

const embedding = await embedder.embedQuery("Hello world");
```

### Document Processor

The `DocumentProcessor` combines splitting and embedding.

```typescript
import { DocumentProcessor } from "@voltagent/documents";

const processor = new DocumentProcessor();
// Or with custom splitter/embedder:
// const processor = new DocumentProcessor(customSplitter, customEmbedder);

const documents = await processor.process("Long text content...", {
source: "example.txt",
author: "Me",
});

/*
Returns:
[
{
text: "chunk 1...",
embedding: [0.123, ...],
metadata: { source: "example.txt", author: "Me", chunkIndex: 0, ... }
},
...
]
*/
```
38 changes: 38 additions & 0 deletions packages/documents/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"name": "@voltagent/documents",
"version": "0.0.1",
"description": "Document processing and embedding utilities for VoltAgent",
"main": "dist/index.js",
"module": "dist/index.mjs",
"types": "dist/index.d.ts",
"exports": {
".": {
"import": {
"types": "./dist/index.d.mts",
"default": "./dist/index.mjs"
},
"require": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
}
}
},
"files": [
"dist"
],
"scripts": {
"build": "tsup",
"dev": "tsup --watch",
"typecheck": "tsc --noEmit",
"test": "vitest run"
},
"dependencies": {
"openai": "^4.20.0"
},
Comment on lines +29 to +31
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# Check latest openai version and release info
curl -s https://registry.npmjs.org/openai | jq -r '.["dist-tags"] | to_entries[] | "\(.key): \(.value)"'

Repository: VoltAgent/voltagent

Length of output: 137


🌐 Web query:

openai npm package latest version 2025 security vulnerabilities

💡 Result:

Summary (as of Jan 13, 2026):

  • Latest official "openai" npm package (registry) — see package page on npm for the current published version. No direct vulnerabilities are listed for the package in Snyk's database. [1][2]

  • Snyk security scan: "No known security issues" for the latest openai release (Snyk package page). [1]

  • Official npm registry: package/version history and tags are on npmjs (check npm package page to confirm the exact version number you have/plan to install). [2]

Recommended actions:

  • Run npm audit and/or Snyk/Dependabot against your project before installing/upgrading.
  • Inspect your dependency tree for transitive vulnerabilities (audit covers those).
  • If you need, I can check the exact latest version number right now and scan advisories (CVE/GitHub/GitLab) for any security reports — shall I do that?

Sources:
[1] Snyk — openai package security page.
[2] npmjs — openai package page and version listings.


Update OpenAI SDK to a supported version.

The package depends on openai@^4.20.0, which is 2 major versions behind the latest release (6.16.0). While the current version has no known security vulnerabilities, the caret constraint prevents adoption of newer versions. Consider upgrading to a recent version within the 6.x line to benefit from API improvements and continued maintenance.

🤖 Prompt for AI Agents
In @packages/documents/package.json around lines 29 - 31, The package.json
currently pins the "openai" dependency to "^4.20.0"; update that dependency to a
supported 6.x release (e.g., "^6.16.0") in the "dependencies" entry for
"openai", then run your package manager (npm/yarn/pnpm) to install and update
lockfiles; after upgrading, run tests and fix any breaking API changes in code
that uses the OpenAI SDK (search for imports/usages of "openai" and update
client construction and method names per the 6.x migration guide).

"devDependencies": {
"tsup": "^8.5.0",
"typescript": "^5.8.2",
"vitest": "^3.2.4",
"@types/node": "^24.2.1"
}
}
44 changes: 44 additions & 0 deletions packages/documents/src/DocumentProcessor.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import { describe, expect, it } from "vitest";
import { DocumentProcessor } from "./DocumentProcessor";
import type { EmbeddingModel } from "./embeddings/EmbeddingModel";
import { TextSplitter } from "./text-splitters/TextSplitter";

// Minimal splitter stub: treats "|" as the chunk boundary so tests can
// control chunking deterministically.
class MockSplitter extends TextSplitter {
  async splitText(text: string): Promise<string[]> {
    return text.split("|");
  }
}

// Embedder stub: every input maps to the same fixed 2-dimensional vector,
// making embedding output fully predictable in tests.
class MockEmbedder implements EmbeddingModel {
  private static readonly FIXED_VECTOR: number[] = [0.1, 0.2];

  async embedQuery(_text: string): Promise<number[]> {
    return [...MockEmbedder.FIXED_VECTOR];
  }

  async embedDocuments(documents: string[]): Promise<number[][]> {
    return documents.map(() => [...MockEmbedder.FIXED_VECTOR]);
  }
}

describe("DocumentProcessor", () => {
  it("processes text into documents with embeddings", async () => {
    // MockSplitter splits on "|", so "part1|part2" yields exactly two chunks.
    const processor = new DocumentProcessor(new MockSplitter(), new MockEmbedder());
    const result = await processor.process("part1|part2", { file: "test.txt" });

    expect(result).toHaveLength(2);

    // Each chunk carries the caller metadata plus bookkeeping fields.
    expect(result[0].text).toBe("part1");
    expect(result[0].embedding).toEqual([0.1, 0.2]);
    expect(result[0].metadata).toEqual({
      file: "test.txt",
      chunkIndex: 0,
      chunkCount: 2,
    });

    expect(result[1].text).toBe("part2");
    expect(result[1].embedding).toEqual([0.1, 0.2]);
    expect(result[1].metadata).toEqual({
      file: "test.txt",
      chunkIndex: 1,
      chunkCount: 2,
    });
  });
});
35 changes: 35 additions & 0 deletions packages/documents/src/DocumentProcessor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import type { EmbeddingModel } from "./embeddings/EmbeddingModel";
import { OpenAIEmbeddingModel } from "./embeddings/OpenAIEmbeddingModel";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";
import type { TextSplitter } from "./text-splitters/TextSplitter";

/**
 * A single text chunk together with its embedding vector and
 * combined (caller-supplied + chunk bookkeeping) metadata.
 */
export interface ProcessedDocument {
  /** The chunk's raw text. */
  text: string;
  /** Embedding vector for `text`, produced by the configured embedder. */
  embedding: number[];
  /** Caller metadata merged with `chunkIndex` / `chunkCount`. */
  metadata?: Record<string, any>;
}

export class DocumentProcessor {
splitter: TextSplitter;
embedder: EmbeddingModel;

// Both collaborators are injectable for testing; defaults cover typical usage.
constructor(splitter?: TextSplitter, embedder?: EmbeddingModel) {
  this.splitter = splitter ?? new RecursiveCharacterTextSplitter();
  this.embedder = embedder ?? new OpenAIEmbeddingModel();
}

async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);

return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
Comment on lines +21 to +34
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Potential undefined embedding if array lengths mismatch.

If embedder.embedDocuments(chunks) returns fewer embeddings than chunks (due to an API error or implementation bug), embeddings[index] will be undefined, leading to corrupted ProcessedDocument objects.

🛡️ Proposed defensive check
 async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
   const chunks = await this.splitter.splitText(text);
   const embeddings = await this.embedder.embedDocuments(chunks);

+  if (embeddings.length !== chunks.length) {
+    throw new Error(
+      `Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
+    );
+  }
+
   return chunks.map((chunk, index) => ({
     text: chunk,
     embedding: embeddings[index],
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
if (embeddings.length !== chunks.length) {
throw new Error(
`Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
);
}
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
🤖 Prompt for AI Agents
In @packages/documents/src/DocumentProcessor.ts around lines 21 - 34, In
process, guard against embedder.embedDocuments returning fewer items than
chunks: after const embeddings = await this.embedder.embedDocuments(chunks);
check that embeddings is an array and embeddings.length === chunks.length (or at
least >= chunks.length); if not, either throw a clear error or fill missing
entries with a safe fallback (e.g., null vector or empty embedding) and log the
mismatch via the class logger; ensure the returned ProcessedDocument objects use
validated/fallback embeddings so embeddings[index] cannot be undefined.

}
76 changes: 76 additions & 0 deletions packages/documents/src/RecursiveCharacterTextSplitter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import { describe, expect, it } from "vitest";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";

describe("RecursiveCharacterTextSplitter", () => {
  it("splits text based on characters", async () => {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
    });
    const text = "abcdefghijklmnopqrstuvwxyz";
    const chunks = await splitter.splitText(text);
    // With no usable separator, text is cut into fixed-size 10-char windows;
    // the final chunk holds the 6-char remainder.
    expect(chunks).toEqual(["abcdefghij", "klmnopqrst", "uvwxyz"]);
  });

  it("splits text with simple separator", async () => {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
      separators: [" "],
    });
    const text = "hello world how are you";
    // Words are accumulated greedily (word + joining space) while the chunk
    // stays within chunkSize (10):
    //   "hello" + " world" would be 11 chars -> flush "hello"
    //   "world" + " how"   = 9  -> keep accumulating
    //   "world how" + " are" = 13 -> flush "world how"
    //   "are" + " you" = 7 -> final chunk "are you"
    const chunks = await splitter.splitText(text);

    expect(chunks).toEqual(["hello", "world how", "are you"]);
  });

  it("handles recursion with multiple separators", async () => {
    // Exercises the recursive fallback to finer-grained separators.
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      separators: ["\n", " "],
    });
    const text = "Para1 is longer than 20 chars\nPara2 is short";
    const chunks = await splitter.splitText(text);

    // Split on "\n" first:
    //   "Para1 is longer than 20 chars" (29 chars) exceeds 20 -> recurse on " "
    //   "Para2 is short" (14 chars) fits as-is.
    // The recursion accumulates words up to the limit:
    //   "Para1 is longer than" is exactly 20 chars -> flush
    //   "20 chars" forms the next chunk.

    expect(chunks).toEqual(["Para1 is longer than", "20 chars", "Para2 is short"]);
  });
});
4 changes: 4 additions & 0 deletions packages/documents/src/embeddings/EmbeddingModel.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/**
 * Contract for embedding backends used by the document pipeline.
 */
export interface EmbeddingModel {
  /** Embeds a single query string into one vector. */
  embedQuery(text: string): Promise<number[]>;
  /** Embeds a batch of documents; returns one vector per input, in order. */
  embedDocuments(documents: string[]): Promise<number[][]>;
}
Loading
Loading