Added estimated token length controls

travis-bauer · travis-bauer · commit 2a9a7f166060 · 2026-06-04T06:52:22.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,7 +9,9 @@
   `error`): `truncate` shortens by 20% per retry (`truncate_side`), or `chunk_pool` splits into
   `num_chunks` segments and mean-pools. Applied reactively after embed failure; size text upstream
   with `splitText` / `processDocuments` when possible. Batch failures fall back per item so the
-  offending chunk can be identified.
+  offending chunk can be identified. `max_estimated_tokens` can now pre-truncate inputs with a
+  lightweight estimate before embedding; `truncate_side` controls both estimated pre-truncation and
+  reactive `on_token_overflow="truncate"` retries.
 - Added model2vec embedding support via `Model2VecEmbeddingAdapter` and `llmEmbed`
   source `model2vec`, using in-process static embeddings with offline-ready HF cache
   precaching. Install with `pip install talkpipe[model2vec]` or `talkpipe[all]`. Added
diff --git a/docs/guides/model-and-source-configuration.md b/docs/guides/model-and-source-configuration.md
@@ -290,6 +290,12 @@ still too long for the model—not a substitute for upstream chunking.
 contiguous parts and mean-pools to one vector per stream item. If a batch embed fails, TalkPipe
 retries **per item** so you can see which chunk failed.
 
+**Estimated pre-truncation:** set `max_estimated_tokens` on `llmEmbed` to truncate input before
+calling the embedding provider. This uses a lightweight estimate, not the provider tokenizer, so
+`on_token_overflow` remains the fallback if the estimate undercounts the real token count. `truncate_side` is shared
+by both truncation paths: estimated pre-truncation before the provider call, and
+`on_token_overflow="truncate"` after the provider reports a token overflow.
+
 **Batching:** set `batch_size` greater than `1` on `llmEmbed` to call the provider with multiple
 texts per request. The stream still has **one input item and one output item per document**;
 batching is internal only. `llmEmbed` does **not** accept list-shaped stream items (flatten or
@@ -306,6 +312,7 @@ INPUT FROM echo[data="Hello world"]
 ```chatterlang
 | llmEmbed[on_token_overflow="truncate", truncate_side="tail"]
 | llmEmbed[on_token_overflow="chunk_pool", num_chunks=4]
+| llmEmbed[max_estimated_tokens=8192, truncate_side="tail"]
 ```
 
 ### RAG and vector pipelines
diff --git a/src/talkpipe/llm/embedding.py b/src/talkpipe/llm/embedding.py
@@ -1,7 +1,8 @@
 """Module for embedding text using different models"""
 
-from typing import Optional, Annotated, Iterator, Any, List, Literal
 import logging
+import re
+from typing import Optional, Annotated, Iterator, Any, List, Literal
 
 import numpy as np
 
@@ -26,6 +27,13 @@
 _MAX_TRUNCATE_ATTEMPTS = 8
 
 
+def estimate_tokens(text: str) -> int:
+    """Estimate token count without using a provider-specific tokenizer.
+
+    The estimate is the larger of two heuristics, truncated to an integer:
+    1.3 tokens per run of non-whitespace characters ("word"), and one token
+    per four characters. An empty string yields 0.
+    """
+    chars = len(text)
+    words = len(re.findall(r"\S+", text))
+    return int(max(words * 1.3, chars / 4))
+
+
 class EmbeddingTokenOverflowError(RuntimeError):
     """Raised when embedding fails due to input length and on_token_overflow is error."""
 
@@ -44,6 +52,10 @@ class LLMEmbed(AbstractFieldSegment):
     ``error`` (default), ``truncate`` (shrink and retry), or ``chunk_pool`` (split into
     ``num_chunks`` segments, embed, and mean-pool). Size text before this segment with
     upstream chunking when possible.
+
+    ``max_estimated_tokens`` optionally truncates text before the provider call using
+    a lightweight estimate, not a tokenizer. ``truncate_side`` controls both that
+    proactive truncation and reactive ``on_token_overflow="truncate"`` retry behavior.
     """
 
     def __init__(
@@ -66,6 +78,10 @@ def __init__(
             int,
             "For chunk_pool: number of contiguous segments to split overflow text into",
         ] = 2,
+        max_estimated_tokens: Annotated[
+            Optional[int],
+            "If set, pre-truncate text to this estimated token budget before embedding",
+        ] = None,
     ):
         """Initialize the embedding segment with the specified parameters.
 
@@ -96,18 +112,22 @@ def __init__(
             )
         if num_chunks < 2:
             raise ValueError("num_chunks must be at least 2")
+        if max_estimated_tokens is not None and max_estimated_tokens < 1:
+            raise ValueError("max_estimated_tokens must be a positive integer")
         self.embedder = getEmbeddingAdapter(source)(model=model)
         self.fail_on_error = fail_on_error
         self.batch_size = batch_size
         self.on_token_overflow = on_token_overflow
         self.truncate_side = truncate_side
         self.num_chunks = num_chunks
+        self.max_estimated_tokens = max_estimated_tokens
         self._embedding_source = source
         self._embedding_model = model
 
     def process_value(self, value: Any) -> List[float]:
         """Embed one extracted field value (AbstractFieldSegment hook)."""
-        return self._embed_one_with_overflow_policy(None, str(value))
+        # Pre-truncate to the estimated token budget first (returns the text
+        # unchanged when max_estimated_tokens is None), then apply the overflow policy.
+        text = self._truncate_to_estimated_token_budget(str(value))
+        return self._embed_one_with_overflow_policy(None, text)
 
     def _input_value(self, item: Any) -> Any:
         """Extract the value to embed (same rule as AbstractFieldSegment)."""
@@ -137,6 +157,23 @@ def _slice_text(text: str, length: int, side: str) -> str:
             return text[start : start + length]
         raise ValueError(f"Unknown truncate_side: {side!r}")
 
+    def _truncate_to_estimated_token_budget(self, text: str) -> str:
+        """Shorten ``text`` until ``estimate_tokens`` fits ``max_estimated_tokens``.
+
+        Returns ``text`` unchanged when no budget is set or the estimate already
+        fits. Otherwise returns a ``_slice_text`` slice for ``truncate_side``
+        whose estimate is within budget, found by binary search on slice length.
+        """
+        if self.max_estimated_tokens is None:
+            return text
+        if estimate_tokens(text) <= self.max_estimated_tokens:
+            return text
+
+        # Binary search on the slice length in characters. ``low`` only ever
+        # holds a length accepted by the budget check (it starts at 0, and
+        # ``estimate_tokens("")`` is 0).
+        # NOTE(review): landing on the *longest* fitting slice assumes the
+        # estimate never decreases as the slice grows -- confirm for every
+        # truncate_side that ``_slice_text`` supports.
+        low = 0
+        high = len(text)
+        while low < high:
+            # Round the midpoint up so that ``low = mid`` always makes progress.
+            mid = (low + high + 1) // 2
+            candidate = self._slice_text(text, mid, self.truncate_side)
+            if estimate_tokens(candidate) <= self.max_estimated_tokens:
+                low = mid
+            else:
+                high = mid - 1
+        return self._slice_text(text, low, self.truncate_side)
+
     @staticmethod
     def _split_num_chunks(text: str, num_chunks: int) -> List[str]:
         n = len(text)
@@ -302,7 +339,7 @@ def flush_buffer() -> Iterator[Any]:
 
             self._ensure_scalar_item(item)
             logging.debug(f"Processing input item: {item}")
-            text = str(self._input_value(item))
+            text = self._truncate_to_estimated_token_budget(str(self._input_value(item)))
             logging.debug(f"Embedding text: {text}")
 
             if self.batch_size <= 1:
diff --git a/tests/talkpipe/llm/test_embedding_token_overflow.py b/tests/talkpipe/llm/test_embedding_token_overflow.py
@@ -3,7 +3,7 @@
 import pytest
 from unittest.mock import Mock
 
-from talkpipe.llm.embedding import LLMEmbed, EmbeddingTokenOverflowError
+from talkpipe.llm.embedding import LLMEmbed, EmbeddingTokenOverflowError, estimate_tokens
 from talkpipe.llm.embedding_errors import is_token_overflow_error
 
 TOKEN_OVERFLOW = RuntimeError("maximum context length exceeded")
@@ -15,6 +15,12 @@ def test_is_token_overflow_error_recognizes_common_messages():
     assert not is_token_overflow_error(RuntimeError("connection reset"))
 
 
+def test_estimate_tokens_uses_word_and_character_heuristics():
+    """Cover the empty, word-dominated, and character-dominated estimates."""
+    assert estimate_tokens("") == 0
+    # 4 words * 1.3 = 5.2 beats 18 chars / 4 = 4.5; int() truncates to 5.
+    assert estimate_tokens("one two three four") == 5
+    # 100 chars / 4 = 25 beats 1 word * 1.3 = 1.3.
+    assert estimate_tokens("x" * 100) == 25
+
+
 def _overflow_if_long(max_len: int = 10):
     """Embed succeeds when len(text) <= max_len; otherwise token overflow."""
 
@@ -88,6 +94,29 @@ def test_on_token_overflow_truncate_single_item():
     assert not last_call.startswith("START")
 
 
+def test_max_estimated_tokens_preemptively_truncates_single_item():
+    """A single over-budget item is shortened before the one provider call."""
+    mock = Mock()
+    # The fake embedding is the length of the text the provider received.
+    mock.execute_one = Mock(side_effect=lambda text: [float(len(text))])
+    mock.execute_batch = Mock()
+    embedder = LLMEmbed(
+        model="test-model",
+        source="ollama",
+        max_estimated_tokens=3,
+        truncate_side="tail",
+    )
+    # Swap in the mock so no real embedding backend is called.
+    embedder.embedder = mock
+    long_text = "one two three four five"
+
+    result = list(embedder([long_text]))
+
+    assert result == [[float(len(mock.execute_one.call_args[0][0]))]]
+    sent_text = mock.execute_one.call_args[0][0]
+    # The provider saw a shortened text that is the end of the input
+    # and whose estimate is within the budget of 3.
+    assert sent_text != long_text
+    assert long_text.endswith(sent_text)
+    assert estimate_tokens(sent_text) <= 3
+    # Exactly one call: the text was cut up front, with no overflow retry.
+    mock.execute_one.assert_called_once()
+
+
 def test_on_token_overflow_chunk_pool_single_item():
     batch_calls = []
 
@@ -158,6 +187,37 @@ def test_on_token_overflow_truncate_batch_recovers_middle_item():
     mock.execute_batch.assert_called_once()
 
 
+def test_max_estimated_tokens_preemptively_truncates_batch_items():
+    """With batch_size=2, an over-budget item is shortened before the batch call."""
+    batch_calls = []
+
+    def execute_batch(texts):
+        # Record each batch so the test can inspect what the provider received.
+        texts = list(texts)
+        batch_calls.append(texts)
+        return [[float(len(t))] for t in texts]
+
+    mock = Mock()
+    mock.execute_one = Mock()
+    mock.execute_batch = Mock(side_effect=execute_batch)
+    embedder = LLMEmbed(
+        model="test-model",
+        source="ollama",
+        batch_size=2,
+        max_estimated_tokens=2,
+        truncate_side="tail",
+    )
+    # Swap in the mock so no real embedding backend is called.
+    embedder.embedder = mock
+    long_text = "alpha beta gamma delta"
+
+    result = list(embedder([long_text, "ok"]))
+
+    assert len(result) == 2
+    # First batch entry is a shortened end-of-input slice within the budget of 2.
+    assert batch_calls[0][0] != long_text
+    assert long_text.endswith(batch_calls[0][0])
+    assert estimate_tokens(batch_calls[0][0]) <= 2
+    # The short item is passed through untouched.
+    assert batch_calls[0][1] == "ok"
+    # No per-item fallback was needed.
+    mock.execute_one.assert_not_called()
+
+
 def test_on_token_overflow_chunk_pool_batch_recovers_middle_item():
     batch_calls = []
 
@@ -197,3 +257,8 @@ def execute_batch(texts):
 def test_num_chunks_must_be_at_least_two():
     with pytest.raises(ValueError, match="num_chunks"):
         LLMEmbed(model="test-model", source="ollama", num_chunks=1)
+
+
+def test_max_estimated_tokens_must_be_positive():
+    """A budget of 0 is rejected with ValueError when LLMEmbed is constructed."""
+    with pytest.raises(ValueError, match="max_estimated_tokens"):
+        LLMEmbed(model="test-model", source="ollama", max_estimated_tokens=0)