Skip to content

Commit a202a30

Browse files
committed
Revert "levanter: log per-outlier encode time in BatchTokenizer"
This reverts commit fd2be14.
1 parent: fd2be14 · commit: a202a30

1 file changed

Lines changed: 19 additions & 21 deletions

File tree

lib/levanter/src/levanter/data/text/_batch_tokenizer.py

Lines changed: 19 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
from typing import Sequence, Any
55

66
import regex
7-
from rigging.timing import log_time
87

98
from levanter.data import BatchProcessor
109
from levanter.tokenizers import MarinTokenizer
@@ -104,26 +103,25 @@ def _encode_long_string(self, text: str) -> list[int]:
104103
tokens, regardless of how long the original text is.
105104
"""
106105
ids: list[int] = []
107-
with log_time(f"BatchTokenizer encoded {len(text):,}-char outlier record"):
108-
pieces: list[str] = []
109-
remaining = text
110-
while True:
111-
if len(remaining) > self._workaround_len:
112-
match = ws.search(remaining, self._workaround_len)
113-
split = match.start() if match is not None else len(remaining)
114-
pieces.append(remaining[:split])
115-
remaining = remaining[split:]
116-
else:
117-
pieces.append(remaining)
118-
remaining = ""
119-
120-
if len(pieces) >= _LONG_STRING_BATCH_SIZE or not remaining:
121-
for encoded_piece in self.tokenizer.encode_batch(pieces, add_special_tokens=False):
122-
ids.extend(encoded_piece)
123-
pieces.clear()
124-
125-
if not remaining:
126-
break
106+
pieces: list[str] = []
107+
remaining = text
108+
while True:
109+
if len(remaining) > self._workaround_len:
110+
match = ws.search(remaining, self._workaround_len)
111+
split = match.start() if match is not None else len(remaining)
112+
pieces.append(remaining[:split])
113+
remaining = remaining[split:]
114+
else:
115+
pieces.append(remaining)
116+
remaining = ""
117+
118+
if len(pieces) >= _LONG_STRING_BATCH_SIZE or not remaining:
119+
for encoded_piece in self.tokenizer.encode_batch(pieces, add_special_tokens=False):
120+
ids.extend(encoded_piece)
121+
pieces.clear()
122+
123+
if not remaining:
124+
break
127125

128126
return ids
129127

0 commit comments

Comments
 (0)