Skip to content

Commit a9cffcf

Browse files
feat: closer token estimations to GPT tokenizer
1 parent 51cde0a commit a9cffcf

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

src/index.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ export function approximateTokenSize(input: string) {
7676
}
7777
else if (ALPHANUMERIC_RE.test(token)) {
7878
// Increase the average token length for alphanumeric strings
79-
tokenCount += Math.ceil(token.length / 4)
79+
tokenCount += Math.ceil(token.length / 5)
8080
}
8181
else if (PUNCTUATION_RE.test(token)) {
8282
// Punctuation is often a single token, but multiple punctuations are often split

test/index.test.ts

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,22 @@ import {
88

99
describe('token-related functions', () => {
1010
describe('approximateTokenSize', () => {
11-
it('should approximate the token size for English text', () => {
11+
it('should approximate the token size for short English text', () => {
1212
const input = 'Hello, world! This is a test.'
13-
expect(approximateTokenSize(input)).toMatchInlineSnapshot('11')
13+
// 9 tokens when using the GPT tokenizer
14+
expect(approximateTokenSize(input)).toMatchInlineSnapshot('9')
15+
})
16+
17+
it('should approximate the token size for Kafka text excerpt', () => {
18+
const input = 'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked. “What’s happened to me?” he thought. It wasn’t a dream. His room, a proper human room although a little too small, lay peacefully between its four familiar walls. A collection of textile samples lay spread out on the table – Samsa was a travelling salesman – and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame. It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm towards the viewer. Gregor then turned to look out the window at the dull weather. Drops of rain could be heard hitting the pane, which made him feel quite sad. “How about if I sleep a little bit longer and forget all this nonsense”, he thought, but that was something he was unable to do because he was used to sleeping on his right, and in his present state couldn’t get into that position.'
19+
// 304 tokens when using the GPT tokenizer
20+
expect(approximateTokenSize(input)).toMatchInlineSnapshot('364')
1421
})
1522

1623
it('should approximate the token size for German text with special characters', () => {
17-
const input = 'Guten Tag! Wie geht’s dir?'
18-
expect(approximateTokenSize(input)).toMatchInlineSnapshot('10')
24+
const input = 'Zwölf Boxkämpfer jagen Viktor quer über den großen Sylter Deich.'
25+
// 22 tokens when using the GPT tokenizer
26+
expect(approximateTokenSize(input)).toMatchInlineSnapshot('15')
1927
})
2028
})
2129

0 commit comments

Comments
 (0)