
Commit 6d72619

krammnic, felipemello1, Mark Obozov, and ebsmothers authored
[RFC] truncation and skipping (#2419)
Co-authored-by: Felipe Mello <[email protected]>
Co-authored-by: Mark Obozov <[email protected]>
Co-authored-by: ebsmothers <[email protected]>
1 parent b4d7fbb commit 6d72619

25 files changed: +168 -36 lines

recipes/knowledge_distillation_single_device.py (-1)

@@ -31,7 +31,6 @@
 )
 from torchtune.recipe_interfaces import FTRecipeInterface
 from torchtune.training import DummyProfiler, PROFILER_KEY
-
 from tqdm import tqdm

 log = utils.get_logger("DEBUG")

recipes/lora_dpo_single_device.py (-1)

@@ -533,7 +533,6 @@ def train(self) -> None:
                     == self.max_steps_per_epoch
                 ):
                     break
-
             # batch is input_ids, labels
             num_tokens += batch[0].numel()
             policy_chosen_rejected_outputs = self.concatenated_forward(

recipes/lora_finetune_distributed.py (-1)

@@ -36,7 +36,6 @@
 )
 from torchtune.recipe_interfaces import FTRecipeInterface
 from torchtune.training import DummyProfiler, PROFILER_KEY
-
 from tqdm import tqdm

 log = utils.get_logger("DEBUG")

recipes/ppo_full_finetune_single_device.py (-1)

@@ -922,7 +922,6 @@ def train(self) -> None:
             self._sampler.set_epoch(curr_epoch)

             for idx, batch in enumerate(self._dataloader):
-
                 # Start tracking CUDA memory for active steps for just the first epoch
                 if (
                     curr_epoch == 0

tests/torchtune/data/test_data_utils.py (+12, -6)

@@ -18,18 +18,24 @@ def test_truncate():
     tokens = [1, 2, 3, 4, -1]

     # Test no truncation
-    truncated_tokens = truncate(
-        tokens=tokens,
-        max_seq_len=5,
-        eos_id=-1,
-    )
+    truncated_tokens = truncate(tokens=tokens, max_seq_len=5, eos_id=-1)
     assert truncated_tokens == tokens

     masks = [True, True, False, True, False]
     # Test truncated mask
-    truncated_masks = truncate(tokens=masks, max_seq_len=4, eos_id=False)
+    truncated_masks = truncate(
+        tokens=masks, max_seq_len=4, eos_id=False, truncation_type="right"
+    )
+
     assert truncated_masks == [True, True, False, False]

+    # Test left truncation
+    truncated_masks = truncate(
+        tokens=masks, max_seq_len=4, eos_id=False, truncation_type="left"
+    )
+
+    assert truncated_masks == [True, False, True, False]
+

 def test_format_content_with_images():
     test_image_1 = Image.new(mode="RGB", size=(4, 4))

tests/torchtune/models/llama2/test_llama2_tokenizer.py (+2)

@@ -61,6 +61,8 @@ def test_tokenize_messages(self, messages, expected_tokens):
         tokens, mask = tokenizer.tokenize_messages(messages)
         # Mask user, unmask assistant, add EOS token
         expected_mask = [True] * 75 + [False] * 125
+
+        assert len(tokens) == len(mask)
         assert expected_tokens == tokens
         assert expected_mask == mask

torchtune/data/_utils.py (+19, -2)

@@ -26,6 +26,7 @@ def truncate(
     tokens: List[Any],
     max_seq_len: int,
     eos_id: Optional[Any] = None,
+    truncation_type: str = "right",
 ) -> List[Any]:
     """
     Truncate a list of tokens to a maximum length. If eos_id is provided, the last
@@ -36,13 +37,29 @@ def truncate(
         max_seq_len (int): maximum length of the list
         eos_id (Optional[Any]): token to replace the last token with. If None, the
             last token will not be replaced. Default is None.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Returns:
         List[Any]: truncated list of tokens
+
+    Raises:
+        ValueError: if truncation_type is not "left" or "right"
     """
-    tokens_truncated = tokens[:max_seq_len]
-    if eos_id is not None and tokens_truncated[-1] != eos_id:
+
+    if truncation_type == "left":
+        tokens_truncated = tokens[-max_seq_len:]  # Take the last max_seq_len tokens
+    elif truncation_type == "right":
+        tokens_truncated = tokens[:max_seq_len]  # Take the first max_seq_len tokens
+    else:
+        raise ValueError(
+            f"truncation_type must be 'left' or 'right', got {truncation_type}"
+        )
+
+    # Replace the last token with eos_id if necessary
+    if eos_id is not None and tokens_truncated and tokens_truncated[-1] != eos_id:
         tokens_truncated[-1] = eos_id
+
     return tokens_truncated
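To make the new behavior concrete, here is a small usage sketch of the updated truncate (the token ids are invented; assumes the helper is importable from torchtune.data, as its tests exercise it):

# Illustrative sketch of the updated truncate(); the token ids are made up.
from torchtune.data import truncate

tokens = [1, 2, 3, 4, 5]

# "right" (the default) keeps the first max_seq_len tokens.
assert truncate(tokens=tokens, max_seq_len=3, eos_id=None) == [1, 2, 3]

# "left" keeps the last max_seq_len tokens instead.
assert truncate(
    tokens=tokens, max_seq_len=3, eos_id=None, truncation_type="left"
) == [3, 4, 5]

# With eos_id set, the final kept token is replaced when it is not already eos.
assert truncate(
    tokens=tokens, max_seq_len=3, eos_id=99, truncation_type="left"
) == [3, 4, 99]

# The new `tokens_truncated and ...` guard means an empty input now returns
# empty instead of raising an IndexError when eos_id is provided.
assert truncate(tokens=[], max_seq_len=3, eos_id=99) == []

Left truncation is the natural choice for chat-style data, where the most recent turns matter more than the oldest ones once a conversation exceeds max_seq_len.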
torchtune/datasets/_preference.py (+1)

@@ -136,6 +136,7 @@ def _prepare_sample(self, sample: Mapping[str, Any]) -> Dict[str, List[int]]:
         chosen_input_ids, chosen_masks = self._tokenizer.tokenize_messages(
             transformed_sample["chosen"],
         )
+
         chosen_labels = list(
             np.where(chosen_masks, CROSS_ENTROPY_IGNORE_IDX, chosen_input_ids)
         )

torchtune/models/gemma/_model_builders.py (+4, -3)

@@ -43,7 +43,7 @@ def gemma_2b() -> TransformerDecoder:
     )


-def gemma_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = None) -> GemmaTokenizer:
+def gemma_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = None, truncation_type: str = "right") -> GemmaTokenizer:
     """
     Tokenizer for Gemma.

@@ -55,12 +55,13 @@ def gemma_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_templat
         If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface`
         class. If a dictionary, it is assumed to be a custom prompt template mapping role to the
         prepend/append tags.
-
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Returns:
         GemmaTokenizer: Instantiation of the Gemma tokenizer
     """
-    return GemmaTokenizer(path=path, max_seq_len=max_seq_len, prompt_template=_get_prompt_template(prompt_template) if prompt_template is not None else None)
+    return GemmaTokenizer(path=path, max_seq_len=max_seq_len, prompt_template=_get_prompt_template(prompt_template) if prompt_template is not None else None, truncation_type=truncation_type)


 def lora_gemma_2b(
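For the model builders the change is pure plumbing: the new keyword is threaded through to the tokenizer class. A minimal sketch of how a user would opt in (the tokenizer path is a placeholder, assuming a local SentencePiece model file):

# Sketch of the new builder argument; the path is hypothetical.
from torchtune.models.gemma import gemma_tokenizer

tokenizer = gemma_tokenizer(
    path="/path/to/tokenizer.model",  # hypothetical path
    max_seq_len=4096,
    truncation_type="left",  # keep the newest tokens when a sample is too long
)

The same keyword is added to the llama2, llama3, mistral, and phi3 builders below.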

torchtune/models/gemma/_tokenizer.py (+5)

@@ -34,6 +34,8 @@ class GemmaTokenizer(ModelTokenizer, Transform):
             - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

             The extra text will still get tokenized as normal text, not as special tokens. Default is None.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Examples:
         >>> tokenizer = GemmaTokenizer("/path/to/spm_model")
@@ -47,6 +49,7 @@ def __init__(
         path: str,
         max_seq_len: Optional[int] = None,
         prompt_template: Optional[PromptTemplate] = None,
+        truncation_type: str = "right",
     ):
         self._spm_model = SentencePieceBaseTokenizer(path)

@@ -59,6 +62,7 @@ def __init__(
         self.max_seq_len = max_seq_len

         self.prompt_template = prompt_template
+        self.truncation_type = truncation_type

     @property
     def eos_id(self):
@@ -142,6 +146,7 @@ def tokenize_messages(
             messages=templated_messages,
             bos_id=self.bos_id,
             eos_id=self.eos_id if add_eos else None,
+            truncation_type=self.truncation_type,
         )

     def __call__(

torchtune/models/llama2/_model_builders.py (+4, -3)

@@ -42,7 +42,7 @@ def llama2_7b() -> TransformerDecoder:
     )


-def llama2_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = "torchtune.models.llama2.Llama2ChatTemplate") -> Llama2Tokenizer:
+def llama2_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = "torchtune.models.llama2.Llama2ChatTemplate", truncation_type: str = "right") -> Llama2Tokenizer:
     """
     Tokenizer for Llama2.

@@ -54,11 +54,12 @@ def llama2_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_templa
         If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface`
         class. If a dictionary, it is assumed to be a custom prompt template mapping role to the
         prepend/append tags. Default is :class:`~torchtune.models.llama2.Llama2ChatTemplate`.
-
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".
     Returns:
         Llama2Tokenizer: Instantiation of the Llama2 tokenizer
     """
-    return Llama2Tokenizer(path=path, max_seq_len=max_seq_len, prompt_template=_get_prompt_template(prompt_template) if prompt_template is not None else None)
+    return Llama2Tokenizer(path=path, max_seq_len=max_seq_len, prompt_template=_get_prompt_template(prompt_template) if prompt_template is not None else None, truncation_type=truncation_type)


 def lora_llama2_7b(

torchtune/models/llama2/_tokenizer.py (+5)

@@ -44,6 +44,8 @@ class Llama2Tokenizer(ModelTokenizer, Transform):

             The extra text will still get tokenized as normal text, not as special tokens.
             Default is :class:`~torchtune.models.llama2.Llama2ChatTemplate`.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Examples:
         >>> tokenizer = Llama2Tokenizer("/path/to/spm_model")
@@ -57,6 +59,7 @@ def __init__(
         path: str,
         max_seq_len: Optional[int] = None,
         prompt_template: Optional[PromptTemplate] = Llama2ChatTemplate(),
+        truncation_type: str = "right",
     ):
         self._spm_model = SentencePieceBaseTokenizer(path)

@@ -69,6 +72,7 @@ def __init__(
         self.max_seq_len = max_seq_len

         self.prompt_template = prompt_template
+        self.truncation_type = truncation_type

     @property
     def eos_id(self):
@@ -159,6 +163,7 @@ def tokenize_messages(
             messages=templated_messages,
             bos_id=self.bos_id if add_start_tokens else None,
             eos_id=self.eos_id if add_end_tokens else None,
+            truncation_type=self.truncation_type,
         )

     def __call__(

torchtune/models/llama3/_model_builders.py (+4)

@@ -70,6 +70,7 @@ def llama3_tokenizer(
     special_tokens_path: Optional[str] = None,
     max_seq_len: Optional[int] = None,
     prompt_template: Optional[_TemplateType] = None,
+    truncation_type: str = "right",
 ) -> Llama3Tokenizer:
     """
     Tokenizer for Llama3.
@@ -85,6 +86,8 @@ def llama3_tokenizer(
         If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface`
         class. If a dictionary, it is assumed to be a custom prompt template mapping role to the
         prepend/append tags.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Returns:
         Llama3Tokenizer: Instantiation of the Llama3 tokenizer
@@ -102,6 +105,7 @@ def llama3_tokenizer(
         special_tokens=special_tokens,
         max_seq_len=max_seq_len,
         prompt_template=template,
+        truncation_type=truncation_type,
     )
torchtune/models/llama3/_tokenizer.py (+15, -2)

@@ -65,6 +65,8 @@ class Llama3Tokenizer(ModelTokenizer, Transform):
             - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

             The extra text will still get tokenized as normal text, not as special tokens. Default is None.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Examples:
         >>> tokenizer = Llama3Tokenizer("/path/to/tt_model")
@@ -79,6 +81,7 @@ def __init__(
         special_tokens: Optional[Dict[str, int]] = None,
         max_seq_len: Optional[int] = None,
         prompt_template: Optional[PromptTemplate] = None,
+        truncation_type: str = "right",
     ):
         self.special_tokens = (
             special_tokens if special_tokens is not None else LLAMA3_SPECIAL_TOKENS
@@ -124,6 +127,8 @@ def __init__(
             r"<\|start_header_id\|>.*?<\|end_header_id\|>\n\n"
         )

+        self.truncation_type = truncation_type
+
     def _validate_special_tokens(
         self,
     ):
@@ -324,9 +329,17 @@ def tokenize_messages(

         if self.max_seq_len:
             tokens = truncate(
-                tokens, self.max_seq_len, self.eos_id if add_end_tokens else None
+                tokens=tokens,
+                max_seq_len=self.max_seq_len,
+                eos_id=self.eos_id if add_end_tokens else None,
+                truncation_type=self.truncation_type,
+            )
+            mask = truncate(
+                tokens=mask,
+                max_seq_len=self.max_seq_len,
+                eos_id=True if add_end_tokens else None,
+                truncation_type=self.truncation_type,
             )
-            mask = truncate(mask, self.max_seq_len, True if add_end_tokens else None)

         return tokens, mask
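Note that tokens and mask are now truncated with the same max_seq_len and the same truncation_type, which is what keeps them aligned (and what the new len(tokens) == len(mask) assertion in the llama2 tokenizer test checks). An illustrative sketch with invented values:

# Why the two truncate() calls must agree: cutting tokens and mask from the
# same side with the same length preserves their one-to-one alignment.
from torchtune.data import truncate

tokens = [7, 8, 9, 10, 11]  # hypothetical token ids
mask = [True, True, False, False, False]

kwargs = dict(max_seq_len=3, eos_id=None, truncation_type="left")
tokens = truncate(tokens=tokens, **kwargs)
mask = truncate(tokens=mask, **kwargs)

assert len(tokens) == len(mask)  # still aligned after truncation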

torchtune/models/mistral/_model_builders.py (+4, -2)

@@ -48,7 +48,7 @@ def mistral_7b() -> TransformerDecoder:
     )


-def mistral_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = "torchtune.models.mistral.MistralChatTemplate") -> MistralTokenizer:
+def mistral_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = "torchtune.models.mistral.MistralChatTemplate", truncation_type: str = "right",) -> MistralTokenizer:
     """
     Tokenizer for Mistral models.

@@ -60,11 +60,13 @@ def mistral_tokenizer(path: str, max_seq_len: Optional[int] = None, prompt_templ
         If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface`
         class. If a dictionary, it is assumed to be a custom prompt template mapping role to the
         prepend/append tags. Default is :class:`~torchtune.models.mistral.MistralChatTemplate`.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Returns:
         MistralTokenizer: Instantiation of the Mistral tokenizer
     """
-    return MistralTokenizer(path=path, max_seq_len=max_seq_len, prompt_template=_get_prompt_template(prompt_template) if prompt_template is not None else None)
+    return MistralTokenizer(path=path, max_seq_len=max_seq_len, prompt_template=_get_prompt_template(prompt_template) if prompt_template is not None else None, truncation_type=truncation_type)


 def lora_mistral_7b(

torchtune/models/mistral/_tokenizer.py (+5)

@@ -36,6 +36,8 @@ class MistralTokenizer(ModelTokenizer, Transform):

             The extra text will still get tokenized as normal text, not as special tokens.
             Default is :class:`~torchtune.models.mistral.MistralChatTemplate`.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Examples:
         >>> tokenizer = MistralTokenizer("/path/to/spm_model")
@@ -49,6 +51,7 @@ def __init__(
         path: str,
         max_seq_len: Optional[int] = None,
         prompt_template: Optional[PromptTemplate] = MistralChatTemplate(),
+        truncation_type: str = "right",
     ):
         self._spm_model = SentencePieceBaseTokenizer(path)

@@ -61,6 +64,7 @@ def __init__(
         self.max_seq_len = max_seq_len

         self.prompt_template = prompt_template
+        self.truncation_type = truncation_type

     @property
     def eos_id(self):
@@ -172,6 +176,7 @@ def tokenize_messages(
             messages=templated_messages,
             bos_id=self.bos_id,
             eos_id=self.eos_id if add_eos else None,
+            truncation_type=self.truncation_type,
         )

     def __call__(

torchtune/models/phi3/_model_builders.py (+4, -2)

@@ -41,7 +41,7 @@ def phi3_mini() -> TransformerDecoder:
         norm_eps=1e-5,
     )

-def phi3_mini_tokenizer(path: str, special_tokens_path: Optional[str] = None, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = None) -> Phi3MiniTokenizer:
+def phi3_mini_tokenizer(path: str, special_tokens_path: Optional[str] = None, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = None, truncation_type: str = "right",) -> Phi3MiniTokenizer:
     """Phi-3 Mini tokenizer.
     Ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/tokenizer_config.json

@@ -56,6 +56,8 @@ def phi3_mini_tokenizer(path: str, special_tokens_path: Optional[str] = None, ma
         If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface`
         class. If a dictionary, it is assumed to be a custom prompt template mapping role to the
         prepend/append tags.
+        truncation_type (str): type of truncation to apply, either "left" or "right".
+            Default is "right".

     Note:
         This tokenizer includes typical LM EOS and BOS tokens like
@@ -68,7 +70,7 @@ def phi3_mini_tokenizer(path: str, special_tokens_path: Optional[str] = None, ma
     """
     special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None
     template = _get_prompt_template(prompt_template) if prompt_template is not None else None
-    return Phi3MiniTokenizer(path=path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template)
+    return Phi3MiniTokenizer(path=path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template, truncation_type=truncation_type)


 def lora_phi3_mini(
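Since the builders expose truncation_type as a plain keyword, it can also be set from a recipe config. A hedged sketch using torchtune's usual config-instantiation pattern (the tokenizer path is a placeholder):

# Sketch: enabling left truncation from a recipe-style config.
from omegaconf import OmegaConf

from torchtune import config

cfg = OmegaConf.create(
    {
        "tokenizer": {
            "_component_": "torchtune.models.phi3.phi3_mini_tokenizer",
            "path": "/path/to/tokenizer.model",  # hypothetical path
            "max_seq_len": 4096,
            "truncation_type": "left",
        }
    }
)
tokenizer = config.instantiate(cfg.tokenizer)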
