|
4 | 4 | from typing import Sequence, Any |
5 | 5 |
|
6 | 6 | import regex |
| 7 | +from rigging.timing import log_time |
7 | 8 |
|
8 | 9 | from levanter.data import BatchProcessor |
9 | 10 | from levanter.tokenizers import MarinTokenizer |
@@ -103,25 +104,26 @@ def _encode_long_string(self, text: str) -> list[int]: |
103 | 104 | tokens, regardless of how long the original text is. |
104 | 105 | """ |
105 | 106 | ids: list[int] = [] |
106 | | - pieces: list[str] = [] |
107 | | - remaining = text |
108 | | - while True: |
109 | | - if len(remaining) > self._workaround_len: |
110 | | - match = ws.search(remaining, self._workaround_len) |
111 | | - split = match.start() if match is not None else len(remaining) |
112 | | - pieces.append(remaining[:split]) |
113 | | - remaining = remaining[split:] |
114 | | - else: |
115 | | - pieces.append(remaining) |
116 | | - remaining = "" |
117 | | - |
118 | | - if len(pieces) >= _LONG_STRING_BATCH_SIZE or not remaining: |
119 | | - for encoded_piece in self.tokenizer.encode_batch(pieces, add_special_tokens=False): |
120 | | - ids.extend(encoded_piece) |
121 | | - pieces.clear() |
122 | | - |
123 | | - if not remaining: |
124 | | - break |
| 107 | + with log_time(f"BatchTokenizer encoded {len(text):,}-char outlier record"): |
| 108 | + pieces: list[str] = [] |
| 109 | + remaining = text |
| 110 | + while True: |
| 111 | + if len(remaining) > self._workaround_len: |
| 112 | + match = ws.search(remaining, self._workaround_len) |
| 113 | + split = match.start() if match is not None else len(remaining) |
| 114 | + pieces.append(remaining[:split]) |
| 115 | + remaining = remaining[split:] |
| 116 | + else: |
| 117 | + pieces.append(remaining) |
| 118 | + remaining = "" |
| 119 | + |
| 120 | + if len(pieces) >= _LONG_STRING_BATCH_SIZE or not remaining: |
| 121 | + for encoded_piece in self.tokenizer.encode_batch(pieces, add_special_tokens=False): |
| 122 | + ids.extend(encoded_piece) |
| 123 | + pieces.clear() |
| 124 | + |
| 125 | + if not remaining: |
| 126 | + break |
125 | 127 |
|
126 | 128 | return ids |
127 | 129 |
|
|
0 commit comments