Skip to content

Commit a43e229

Browse files
ravwojdyla and claude committed
levanter: log per-outlier encode time in BatchTokenizer
Wrap _encode_long_string in rigging.timing.log_time so wedged or unexpectedly slow outliers surface in worker logs without needing external profiling. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 41a56fe commit a43e229

1 file changed

Lines changed: 21 additions & 19 deletions

File tree

lib/levanter/src/levanter/data/text/_batch_tokenizer.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import Sequence, Any
55

66
import regex
7+
from rigging.timing import log_time
78

89
from levanter.data import BatchProcessor
910
from levanter.tokenizers import MarinTokenizer
@@ -103,25 +104,26 @@ def _encode_long_string(self, text: str) -> list[int]:
103104
tokens, regardless of how long the original text is.
104105
"""
105106
ids: list[int] = []
106-
pieces: list[str] = []
107-
remaining = text
108-
while True:
109-
if len(remaining) > self._workaround_len:
110-
match = ws.search(remaining, self._workaround_len)
111-
split = match.start() if match is not None else len(remaining)
112-
pieces.append(remaining[:split])
113-
remaining = remaining[split:]
114-
else:
115-
pieces.append(remaining)
116-
remaining = ""
117-
118-
if len(pieces) >= _LONG_STRING_BATCH_SIZE or not remaining:
119-
for encoded_piece in self.tokenizer.encode_batch(pieces, add_special_tokens=False):
120-
ids.extend(encoded_piece)
121-
pieces.clear()
122-
123-
if not remaining:
124-
break
107+
with log_time(f"BatchTokenizer encoded {len(text):,}-char outlier record"):
108+
pieces: list[str] = []
109+
remaining = text
110+
while True:
111+
if len(remaining) > self._workaround_len:
112+
match = ws.search(remaining, self._workaround_len)
113+
split = match.start() if match is not None else len(remaining)
114+
pieces.append(remaining[:split])
115+
remaining = remaining[split:]
116+
else:
117+
pieces.append(remaining)
118+
remaining = ""
119+
120+
if len(pieces) >= _LONG_STRING_BATCH_SIZE or not remaining:
121+
for encoded_piece in self.tokenizer.encode_batch(pieces, add_special_tokens=False):
122+
ids.extend(encoded_piece)
123+
pieces.clear()
124+
125+
if not remaining:
126+
break
125127

126128
return ids
127129

0 commit comments

Comments (0)