Commit 85af459

feat: add a new API sliceByTokens

1 parent d1c8823 · commit 85af459

File tree

3 files changed: +194 −8 lines changed

README.md

Lines changed: 45 additions & 0 deletions
````diff
@@ -138,6 +138,51 @@ function isWithinTokenLimit(
 ): boolean
 ```
 
+### `sliceByTokens`
+
+Extracts a portion of text based on token positions, similar to `Array.prototype.slice()`. Supports both positive and negative indices.
+
+**Usage:**
+
+```ts
+const text = 'Hello, world! This is a test sentence.'
+
+const firstThree = sliceByTokens(text, 0, 3)
+const fromSecond = sliceByTokens(text, 2)
+const lastTwo = sliceByTokens(text, -2)
+const middle = sliceByTokens(text, 1, -1)
+
+// With custom options
+const customSlice = sliceByTokens(text, 0, 5, {
+  defaultCharsPerToken: 4,
+  languageConfigs: [
+    { pattern: /[éèêëàâîï]/i, averageCharsPerToken: 3 }
+  ]
+})
+```
+
+**Type Declaration:**
+
+```ts
+function sliceByTokens(
+  text: string,
+  start?: number,
+  end?: number,
+  options?: TokenEstimationOptions
+): string
+```
+
+**Parameters:**
+
+- `text` - The input text to slice
+- `start` - The start token index (inclusive). If negative, treated as an offset from the end. Default: `0`
+- `end` - The end token index (exclusive). If negative, treated as an offset from the end. If omitted, slices to the end
+- `options` - Token estimation options (same as `estimateTokenCount`)
+
+**Returns:**
+
+The sliced text portion corresponding to the specified token range.
+
 ## License
 
 [MIT](./LICENSE) License © 2023-PRESENT [Johann Schopplich](https://github.com/johannschopplich)
````
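As a complement to the usage examples above, here is a hedged sketch of how negative and out-of-range indices resolve against the estimated token count. The import specifier is an assumption for illustration; the equivalences themselves follow from the normalization logic in `src/index.ts` below and from the edge-case tests in this commit.

```ts
// Sketch only: the import specifier is assumed for illustration.
import { estimateTokenCount, sliceByTokens } from 'tokenx'

const text = 'Hello, world! This is a test sentence.'
const total = estimateTokenCount(text)

// A negative start counts back from the estimated total,
// so these two calls produce the same slice.
console.log(sliceByTokens(text, -2) === sliceByTokens(text, total - 2)) // true

// An end index past the estimated total is effectively clamped,
// so the whole text is returned.
console.log(sliceByTokens(text, 0, total + 100) === text) // true
```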

src/index.ts

Lines changed: 79 additions & 0 deletions
```diff
@@ -103,3 +103,82 @@ function getLanguageSpecificCharsPerToken(segment: string, languageConfigs: Lang
 function getCharacterCount(text: string): number {
   return Array.from(text).length
 }
+
+/**
+ * Extracts a portion of text based on token positions, similar to Array.prototype.slice().
+ */
+export function sliceByTokens(
+  text: string,
+  start: number = 0,
+  end?: number,
+  options: TokenEstimationOptions = {},
+): string {
+  if (!text)
+    return ''
+
+  const { defaultCharsPerToken = DEFAULT_CHARS_PER_TOKEN, languageConfigs = DEFAULT_LANGUAGE_CONFIGS } = options
+
+  // Handle negative indices
+  let totalTokens = 0
+  if (start < 0 || (end !== undefined && end < 0)) {
+    totalTokens = estimateTokenCount(text, options)
+  }
+
+  // Normalize indices
+  const normalizedStart = start < 0 ? Math.max(0, totalTokens + start) : Math.max(0, start)
+  const normalizedEnd = end === undefined
+    ? Infinity
+    : end < 0
+      ? Math.max(0, totalTokens + end)
+      : end
+
+  if (normalizedStart >= normalizedEnd)
+    return ''
+
+  // Use same splitting logic as estimateTokenCount for consistency
+  const segments = text.split(TOKEN_SPLIT_PATTERN).filter(Boolean)
+  const parts: string[] = []
+  let currentTokenPos = 0
+
+  for (const segment of segments) {
+    if (currentTokenPos >= normalizedEnd)
+      break
+
+    const tokenCount = estimateSegmentTokens(segment, languageConfigs, defaultCharsPerToken)
+    const extracted = extractSegmentPart(segment, currentTokenPos, tokenCount, normalizedStart, normalizedEnd)
+    if (extracted)
+      parts.push(extracted)
+    currentTokenPos += tokenCount
+  }
+
+  return parts.join('')
+}
+
+/**
+ * Process segment overlap with target token range
+ */
+function extractSegmentPart(
+  segment: string,
+  segmentTokenStart: number,
+  segmentTokenCount: number,
+  targetStart: number,
+  targetEnd: number,
+): string {
+  if (segmentTokenCount === 0) {
+    return segmentTokenStart >= targetStart && segmentTokenStart < targetEnd ? segment : ''
+  }
+
+  const segmentTokenEnd = segmentTokenStart + segmentTokenCount
+  if (segmentTokenStart >= targetEnd || segmentTokenEnd <= targetStart)
+    return ''
+
+  const overlapStart = Math.max(0, targetStart - segmentTokenStart)
+  const overlapEnd = Math.min(segmentTokenCount, targetEnd - segmentTokenStart)
+
+  if (overlapStart === 0 && overlapEnd === segmentTokenCount)
+    return segment
+
+  const charStart = Math.floor((overlapStart / segmentTokenCount) * segment.length)
+  const charEnd = Math.ceil((overlapEnd / segmentTokenCount) * segment.length)
+  return segment.slice(charStart, charEnd)
+}
```
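The proportional character mapping at the end of `extractSegmentPart` is what decides where a partial slice cuts into a segment. The following standalone sketch walks through that arithmetic; all values are invented for illustration and are not taken from the diff.

```ts
// Standalone sketch of the proportional mapping in extractSegmentPart.
// All values below are invented for illustration only.
const segment = 'abcdefghijkl' // 12 characters, estimated at 4 tokens
const segmentTokenStart = 2    // the segment occupies token positions [2, 6)
const segmentTokenCount = 4
const targetStart = 3          // requested token range: [3, 5)
const targetEnd = 5

// Clamp the requested range to the segment's own token span.
const overlapStart = Math.max(0, targetStart - segmentTokenStart)                 // 1
const overlapEnd = Math.min(segmentTokenCount, targetEnd - segmentTokenStart)     // 3

// Map the token overlap proportionally onto character positions.
const charStart = Math.floor((overlapStart / segmentTokenCount) * segment.length) // 3
const charEnd = Math.ceil((overlapEnd / segmentTokenCount) * segment.length)      // 9

console.log(segment.slice(charStart, charEnd)) // 'defghi'
```

Because the mapping is proportional rather than tokenizer-exact, a slice can begin or end mid-word, which is why the `"Die pünktl"` snapshot in the tests below cuts into `pünktlich`.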

test/index.test.ts

Lines changed: 70 additions & 8 deletions
```diff
@@ -6,20 +6,22 @@ import {
   approximateTokenSize,
   estimateTokenCount,
   isWithinTokenLimit,
+  sliceByTokens,
 } from '../src/index'
 
 const fixturesDir = fileURLToPath(new URL('fixtures', import.meta.url))
 
 describe('token-related functions', () => {
+  const ENGLISH_TEXT = 'Hello, world! This is a short sentence.'
+  const GERMAN_TEXT = 'Die pünktlich gewünschte Trüffelfüllung im übergestülpten Würzkümmel-Würfel ist kümmerlich und dürfte fürderhin zu Rüffeln in Hülle und Fülle führen'
+
   describe('approximateTokenSize (legacy)', () => {
     it('should approximate the token size for short English text', () => {
-      const input = 'Hello, world! This is a short sentence.'
-      expect(approximateTokenSize(input)).toMatchInlineSnapshot('11')
+      expect(approximateTokenSize(ENGLISH_TEXT)).toMatchInlineSnapshot('11')
     })
 
     it('should approximate the token size for short German text with umlauts', () => {
-      const input = 'Die pünktlich gewünschte Trüffelfüllung im übergestülpten Würzkümmel-Würfel ist kümmerlich und dürfte fürderhin zu Rüffeln in Hülle und Fülle führen'
-      expect(approximateTokenSize(input)).toMatchInlineSnapshot('49')
+      expect(approximateTokenSize(GERMAN_TEXT)).toMatchInlineSnapshot('49')
     })
 
     it('should approximate the token size for English ebook', async () => {
@@ -40,13 +42,11 @@ describe('token-related functions', () => {
 
   describe('estimateTokenCount', () => {
     it('should estimate tokens for short English text', () => {
-      const input = 'Hello, world! This is a short sentence.'
-      expect(estimateTokenCount(input)).toMatchInlineSnapshot('11')
+      expect(estimateTokenCount(ENGLISH_TEXT)).toMatchInlineSnapshot('11')
    })
 
     it('should estimate tokens for German text with umlauts', () => {
-      const input = 'Die pünktlich gewünschte Trüffelfüllung im übergestülpten Würzkümmel-Würfel ist kümmerlich und dürfte fürderhin zu Rüffeln in Hülle und Fülle führen'
-      expect(estimateTokenCount(input)).toMatchInlineSnapshot('49')
+      expect(estimateTokenCount(GERMAN_TEXT)).toMatchInlineSnapshot('49')
     })
 
     it('should handle empty input', () => {
@@ -88,4 +88,66 @@ describe('token-related functions', () => {
       expect(isWithinTokenLimit(input, tokenLimit, customOptions)).toBe(false)
     })
   })
+
+  describe('sliceByTokens', () => {
+    it('should handle empty input and return entire text without bounds', () => {
+      // Empty input
+      expect(sliceByTokens('')).toBe('')
+      expect(sliceByTokens('', 0, 5)).toBe('')
+
+      // No bounds - return entire text
+      expect(sliceByTokens(ENGLISH_TEXT)).toBe(ENGLISH_TEXT)
+    })
+
+    it('should slice English text with positive indices', () => {
+      // Test specific slice behavior with known English text
+      const firstTwoTokens = sliceByTokens(ENGLISH_TEXT, 0, 2)
+      const fromThirdToken = sliceByTokens(ENGLISH_TEXT, 2)
+
+      expect(firstTwoTokens).toMatchInlineSnapshot('"Hello,"')
+      expect(fromThirdToken).toMatchInlineSnapshot('" world! This is a short sentence."')
+
+      // Verify they combine to cover most of the original
+      expect(firstTwoTokens.length + fromThirdToken.length).toBeGreaterThan(ENGLISH_TEXT.length * 0.8)
+    })
+
+    it('should slice German text with positive indices', () => {
+      // First 3 tokens
+      const firstThree = sliceByTokens(GERMAN_TEXT, 0, 3)
+      expect(firstThree).toMatchInlineSnapshot('"Die pünktl"')
+
+      // Middle section
+      const middle = sliceByTokens(GERMAN_TEXT, 5, 10)
+      expect(middle.length).toBeGreaterThan(0)
+      expect(middle.length).toBeLessThan(GERMAN_TEXT.length)
+    })
+
+    it('should slice German text with negative indices', () => {
+      // Last 3 tokens
+      const lastThree = sliceByTokens(GERMAN_TEXT, -3)
+      expect(lastThree).toMatchInlineSnapshot('"lle führen"')
+
+      // Exclude last 2 tokens
+      const withoutLastTwo = sliceByTokens(GERMAN_TEXT, 0, -2)
+      expect(withoutLastTwo.endsWith('Fülle')).toBe(true)
+
+      // Both negative indices
+      const middleNegative = sliceByTokens(GERMAN_TEXT, -8, -3)
+      expect(middleNegative.length).toBeGreaterThan(0)
+      expect(middleNegative.includes('Hülle')).toBe(true)
+    })
+
+    it('should handle edge cases', () => {
+      const totalTokens = estimateTokenCount(GERMAN_TEXT)
+
+      // Invalid ranges
+      expect(sliceByTokens(GERMAN_TEXT, 10, 5)).toBe('')
+      expect(sliceByTokens(GERMAN_TEXT, 5, 5)).toBe('')
+
+      // Out of bounds
+      expect(sliceByTokens(GERMAN_TEXT, totalTokens + 10)).toBe('')
+      expect(sliceByTokens(GERMAN_TEXT, 0, totalTokens + 10)).toBe(GERMAN_TEXT)
+      expect(sliceByTokens(GERMAN_TEXT, -1000)).toBe(GERMAN_TEXT)
+    })
+  })
 })
```
