Commit 85af459

feat: add a new API sliceByTokens

1 parent d1c8823 · commit 85af459

File tree

3 files changed: +194 −8 lines changed

README.md

Lines changed: 45 additions & 0 deletions
````diff
@@ -138,6 +138,51 @@ function isWithinTokenLimit(
 ): boolean
 ```
 
+### `sliceByTokens`
+
+Extracts a portion of text based on token positions, similar to `Array.prototype.slice()`. Supports both positive and negative indices.
+
+**Usage:**
+
+```ts
+const text = 'Hello, world! This is a test sentence.'
+
+const firstThree = sliceByTokens(text, 0, 3)
+const fromSecond = sliceByTokens(text, 2)
+const lastTwo = sliceByTokens(text, -2)
+const middle = sliceByTokens(text, 1, -1)
+
+// With custom options
+const customSlice = sliceByTokens(text, 0, 5, {
+  defaultCharsPerToken: 4,
+  languageConfigs: [
+    { pattern: /[éèêëàâîï]/i, averageCharsPerToken: 3 }
+  ]
+})
+```
+
+**Type Declaration:**
+
+```ts
+function sliceByTokens(
+  text: string,
+  start?: number,
+  end?: number,
+  options?: TokenEstimationOptions
+): string
+```
+
+**Parameters:**
+
+- `text` - The input text to slice
+- `start` - The start token index (inclusive). If negative, treated as an offset from the end. Default: `0`
+- `end` - The end token index (exclusive). If negative, treated as an offset from the end. If omitted, slices to the end
+- `options` - Token estimation options (same as `estimateTokenCount`)
+
+**Returns:**
+
+The sliced text portion corresponding to the specified token range.
+
 ## License
 
 [MIT](./LICENSE) License © 2023-PRESENT [Johann Schopplich](https://github.com/johannschopplich)
````
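As a complement to the usage examples above, here is a hedged sketch of how negative and out-of-range indices resolve against the estimated token count. The import specifier is an assumption for illustration; the equivalences themselves follow from the normalization logic in `src/index.ts` below and from the edge-case tests in this commit.

```ts
// Sketch only: the import specifier is assumed for illustration.
import { estimateTokenCount, sliceByTokens } from 'tokenx'

const text = 'Hello, world! This is a test sentence.'
const total = estimateTokenCount(text)

// A negative start counts back from the estimated total,
// so these two calls produce the same slice.
console.log(sliceByTokens(text, -2) === sliceByTokens(text, total - 2)) // true

// An end index past the estimated total is effectively clamped,
// so the whole text is returned.
console.log(sliceByTokens(text, 0, total + 100) === text) // true
```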

src/index.ts

Lines changed: 79 additions & 0 deletions
```diff
@@ -103,3 +103,82 @@ function getLanguageSpecificCharsPerToken(segment: string, languageConfigs: Lang
 function getCharacterCount(text: string): number {
   return Array.from(text).length
 }
+
+/**
+ * Extracts a portion of text based on token positions, similar to Array.prototype.slice().
+ */
+export function sliceByTokens(
+  text: string,
+  start: number = 0,
+  end?: number,
+  options: TokenEstimationOptions = {},
+): string {
+  if (!text)
+    return ''
+
+  const { defaultCharsPerToken = DEFAULT_CHARS_PER_TOKEN, languageConfigs = DEFAULT_LANGUAGE_CONFIGS } = options
+
+  // Handle negative indices
+  let totalTokens = 0
+  if (start < 0 || (end !== undefined && end < 0)) {
+    totalTokens = estimateTokenCount(text, options)
+  }
+
+  // Normalize indices
+  const normalizedStart = start < 0 ? Math.max(0, totalTokens + start) : Math.max(0, start)
+  const normalizedEnd = end === undefined
+    ? Infinity
+    : end < 0
+      ? Math.max(0, totalTokens + end)
+      : end
+
+  if (normalizedStart >= normalizedEnd)
+    return ''
+
+  // Use same splitting logic as estimateTokenCount for consistency
+  const segments = text.split(TOKEN_SPLIT_PATTERN).filter(Boolean)
+  const parts: string[] = []
+  let currentTokenPos = 0
+
+  for (const segment of segments) {
+    if (currentTokenPos >= normalizedEnd)
+      break
+
+    const tokenCount = estimateSegmentTokens(segment, languageConfigs, defaultCharsPerToken)
+    const extracted = extractSegmentPart(segment, currentTokenPos, tokenCount, normalizedStart, normalizedEnd)
+    if (extracted)
+      parts.push(extracted)
+    currentTokenPos += tokenCount
+  }
+
+  return parts.join('')
+}
+
+/**
+ * Process segment overlap with target token range
+ */
+function extractSegmentPart(
+  segment: string,
+  segmentTokenStart: number,
+  segmentTokenCount: number,
+  targetStart: number,
+  targetEnd: number,
+): string {
+  if (segmentTokenCount === 0) {
+    return segmentTokenStart >= targetStart && segmentTokenStart < targetEnd ? segment : ''
+  }
+
+  const segmentTokenEnd = segmentTokenStart + segmentTokenCount
+  if (segmentTokenStart >= targetEnd || segmentTokenEnd <= targetStart)
+    return ''
+
+  const overlapStart = Math.max(0, targetStart - segmentTokenStart)
+  const overlapEnd = Math.min(segmentTokenCount, targetEnd - segmentTokenStart)
+
+  if (overlapStart === 0 && overlapEnd === segmentTokenCount)
+    return segment
+
+  const charStart = Math.floor((overlapStart / segmentTokenCount) * segment.length)
+  const charEnd = Math.ceil((overlapEnd / segmentTokenCount) * segment.length)
+  return segment.slice(charStart, charEnd)
+}
```
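The proportional character mapping at the end of `extractSegmentPart` is what decides where a partial slice cuts into a segment. The following standalone sketch walks through that arithmetic; all values are invented for illustration and are not taken from the diff.

```ts
// Standalone sketch of the proportional mapping in extractSegmentPart.
// All values below are invented for illustration only.
const segment = 'abcdefghijkl' // 12 characters, estimated at 4 tokens
const segmentTokenStart = 2    // the segment occupies token positions [2, 6)
const segmentTokenCount = 4
const targetStart = 3          // requested token range: [3, 5)
const targetEnd = 5

// Clamp the requested range to the segment's own token span.
const overlapStart = Math.max(0, targetStart - segmentTokenStart)                 // 1
const overlapEnd = Math.min(segmentTokenCount, targetEnd - segmentTokenStart)     // 3

// Map the token overlap proportionally onto character positions.
const charStart = Math.floor((overlapStart / segmentTokenCount) * segment.length) // 3
const charEnd = Math.ceil((overlapEnd / segmentTokenCount) * segment.length)      // 9

console.log(segment.slice(charStart, charEnd)) // 'defghi'
```

Because the mapping is proportional rather than tokenizer-exact, a slice can begin or end mid-word, which is why the `"Die pünktl"` snapshot in the tests below cuts into `pünktlich`.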

test/index.test.ts

Lines changed: 70 additions & 8 deletions
```diff
@@ -6,20 +6,22 @@ import {
   approximateTokenSize,
   estimateTokenCount,
   isWithinTokenLimit,
+  sliceByTokens,
 } from '../src/index'
 
 const fixturesDir = fileURLToPath(new URL('fixtures', import.meta.url))
 
 describe('token-related functions', () => {
+  const ENGLISH_TEXT = 'Hello, world! This is a short sentence.'
+  const GERMAN_TEXT = 'Die pünktlich gewünschte Trüffelfüllung im übergestülpten Würzkümmel-Würfel ist kümmerlich und dürfte fürderhin zu Rüffeln in Hülle und Fülle führen'
+
   describe('approximateTokenSize (legacy)', () => {
     it('should approximate the token size for short English text', () => {
-      const input = 'Hello, world! This is a short sentence.'
-      expect(approximateTokenSize(input)).toMatchInlineSnapshot('11')
+      expect(approximateTokenSize(ENGLISH_TEXT)).toMatchInlineSnapshot('11')
     })
 
     it('should approximate the token size for short German text with umlauts', () => {
-      const input = 'Die pünktlich gewünschte Trüffelfüllung im übergestülpten Würzkümmel-Würfel ist kümmerlich und dürfte fürderhin zu Rüffeln in Hülle und Fülle führen'
-      expect(approximateTokenSize(input)).toMatchInlineSnapshot('49')
+      expect(approximateTokenSize(GERMAN_TEXT)).toMatchInlineSnapshot('49')
     })
 
     it('should approximate the token size for English ebook', async () => {
@@ -40,13 +42,11 @@ describe('token-related functions', () => {
 
   describe('estimateTokenCount', () => {
     it('should estimate tokens for short English text', () => {
-      const input = 'Hello, world! This is a short sentence.'
-      expect(estimateTokenCount(input)).toMatchInlineSnapshot('11')
+      expect(estimateTokenCount(ENGLISH_TEXT)).toMatchInlineSnapshot('11')
    })
 
     it('should estimate tokens for German text with umlauts', () => {
-      const input = 'Die pünktlich gewünschte Trüffelfüllung im übergestülpten Würzkümmel-Würfel ist kümmerlich und dürfte fürderhin zu Rüffeln in Hülle und Fülle führen'
-      expect(estimateTokenCount(input)).toMatchInlineSnapshot('49')
+      expect(estimateTokenCount(GERMAN_TEXT)).toMatchInlineSnapshot('49')
     })
 
     it('should handle empty input', () => {
@@ -88,4 +88,66 @@ describe('token-related functions', () => {
       expect(isWithinTokenLimit(input, tokenLimit, customOptions)).toBe(false)
     })
   })
+
+  describe('sliceByTokens', () => {
+    it('should handle empty input and return entire text without bounds', () => {
+      // Empty input
+      expect(sliceByTokens('')).toBe('')
+      expect(sliceByTokens('', 0, 5)).toBe('')
+
+      // No bounds - return entire text
+      expect(sliceByTokens(ENGLISH_TEXT)).toBe(ENGLISH_TEXT)
+    })
+
+    it('should slice English text with positive indices', () => {
+      // Test specific slice behavior with known English text
+      const firstTwoTokens = sliceByTokens(ENGLISH_TEXT, 0, 2)
+      const fromThirdToken = sliceByTokens(ENGLISH_TEXT, 2)
+
+      expect(firstTwoTokens).toMatchInlineSnapshot('"Hello,"')
+      expect(fromThirdToken).toMatchInlineSnapshot('" world! This is a short sentence."')
+
+      // Verify they combine to cover most of the original
+      expect(firstTwoTokens.length + fromThirdToken.length).toBeGreaterThan(ENGLISH_TEXT.length * 0.8)
+    })
+
+    it('should slice German text with positive indices', () => {
+      // First 3 tokens
+      const firstThree = sliceByTokens(GERMAN_TEXT, 0, 3)
+      expect(firstThree).toMatchInlineSnapshot('"Die pünktl"')
+
+      // Middle section
+      const middle = sliceByTokens(GERMAN_TEXT, 5, 10)
+      expect(middle.length).toBeGreaterThan(0)
+      expect(middle.length).toBeLessThan(GERMAN_TEXT.length)
+    })
+
+    it('should slice German text with negative indices', () => {
+      // Last 3 tokens
+      const lastThree = sliceByTokens(GERMAN_TEXT, -3)
+      expect(lastThree).toMatchInlineSnapshot('"lle führen"')
+
+      // Exclude last 2 tokens
+      const withoutLastTwo = sliceByTokens(GERMAN_TEXT, 0, -2)
+      expect(withoutLastTwo.endsWith('Fülle')).toBe(true)
+
+      // Both negative indices
+      const middleNegative = sliceByTokens(GERMAN_TEXT, -8, -3)
+      expect(middleNegative.length).toBeGreaterThan(0)
+      expect(middleNegative.includes('Hülle')).toBe(true)
+    })
+
+    it('should handle edge cases', () => {
+      const totalTokens = estimateTokenCount(GERMAN_TEXT)
+
+      // Invalid ranges
+      expect(sliceByTokens(GERMAN_TEXT, 10, 5)).toBe('')
+      expect(sliceByTokens(GERMAN_TEXT, 5, 5)).toBe('')
+
+      // Out of bounds
+      expect(sliceByTokens(GERMAN_TEXT, totalTokens + 10)).toBe('')
+      expect(sliceByTokens(GERMAN_TEXT, 0, totalTokens + 10)).toBe(GERMAN_TEXT)
+      expect(sliceByTokens(GERMAN_TEXT, -1000)).toBe(GERMAN_TEXT)
+    })
+  })
 })
```
