fix(data): correct length filtering from character to token level (THUDM#548)

yuzhu-cai · yuzhu-cai · web-flow · commit 79fd101bdeab · 2025-10-24T00:17:18.000+08:00
Co-authored-by: yuzhu-cai <caiyuzhu@gmail.com>
diff --git a/slime/utils/data.py b/slime/utils/data.py
@@ -72,8 +72,9 @@ def __init__(
 
             # TODO: this is slow.
             if max_length is not None:
+                raw_prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
                 if not multimodal_keys:
-                    if len(prompt) > max_length:
+                    if len(raw_prompt_ids) > max_length:
                         continue
 
             self.origin_samples.append(