-
-
Notifications
You must be signed in to change notification settings - Fork 737
Expand file tree
/
Copy pathRecursiveCharacterTextSplitter.test.ts
More file actions
76 lines (68 loc) · 2.75 KB
/
RecursiveCharacterTextSplitter.test.ts
File metadata and controls
76 lines (68 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import { describe, expect, it } from "vitest";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";
/**
 * Test suite for RecursiveCharacterTextSplitter.
 *
 * Every case builds a splitter with `chunkOverlap: 0`, feeds it a short
 * literal string and compares the chunks returned by `splitText` against a
 * hand-computed list. The arithmetic in the comments explains how each
 * expected list was derived; it describes what the assertions require, not
 * necessarily how the splitter is implemented internally.
 */
describe("RecursiveCharacterTextSplitter", () => {
  it("splits text based on characters", async () => {
    // 26 letters with no whitespace; no `separators` option is passed, so the
    // splitter runs with whatever defaults it ships with.
    const alphabet = "abcdefghijklmnopqrstuvwxyz";
    // Expected: hard cuts every 10 characters -> 10 + 10 + 6.
    const expectedChunks = ["abcdefghij", "klmnopqrst", "uvwxyz"];

    const charSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
    });

    expect(await charSplitter.splitText(alphabet)).toEqual(expectedChunks);
  });

  it("splits text with simple separator", async () => {
    const sentence = "hello world how are you";
    // Words: "hello"(5) "world"(5) "how"(3) "are"(3) "you"(3), limit 10.
    // Packing words left to right, re-joined with a single space:
    //   "hello" + " " + "world"   = 11 > 10  -> emit "hello"
    //   "world" + " " + "how"     =  9 <= 10 -> keep "world how"
    //   "world how" + " " + "are" = 13 > 10  -> emit "world how"
    //   "are" + " " + "you"       =  7 <= 10 -> keep "are you"
    //   end of input                         -> emit "are you"
    const expectedChunks = ["hello", "world how", "are you"];

    const wordSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
      separators: [" "],
    });

    expect(await wordSplitter.splitText(sentence)).toEqual(expectedChunks);
  });

  it("handles recursion with multiple separators", async () => {
    // Exercises the fallback from the first separator ("\n") to the second (" ").
    const twoParagraphs = "Para1 is longer than 20 chars\nPara2 is short";
    // Splitting on "\n" gives:
    //   "Para1 is longer than 20 chars" (29 chars) -> over the limit of 20,
    //                                                 so it is re-split on " "
    //   "Para2 is short"                (14 chars) -> fits as-is
    // Packing the words of the first paragraph:
    //   "Para1 is"                        =  8
    //   "Para1 is longer"                 = 15
    //   "Para1 is longer than"            = 20 -> exactly at the limit, still fits
    //   "Para1 is longer than" + " 20"    = 23 > 20 -> emit, restart with "20"
    //   "20 chars"                        =  8 -> emitted at end of paragraph
    const expectedChunks = ["Para1 is longer than", "20 chars", "Para2 is short"];

    const nestedSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      separators: ["\n", " "],
    });

    expect(await nestedSplitter.splitText(twoParagraphs)).toEqual(expectedChunks);
  });
});