phi4 config and style fixes

pbontrager · pbontrager · commit c3e42da5afcd · 2025-03-13T08:42:07.000-07:00
diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
@@ -299,6 +299,26 @@ To download the Qwen2 1.5B model, for example:
     qwen2.lora_qwen2_7b
     qwen2.qwen2_tokenizer
 
+phi-4
+-----
+
+Models from the `Phi-4 family <https://arxiv.org/abs/2412.08905>`_.
+
+To download the Phi-4 instruct model:
+
+.. code-block:: bash
+
+    tune download microsoft/phi-4 --hf-token <HF_TOKEN>
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    phi4.phi4_14b
+    phi4.lora_phi4_14b
+    phi4.qlora_phi4_14b
+    phi4.phi4_tokenizer
+
 phi-3
 -----
 
diff --git a/recipes/configs/phi4/evaluation.yaml b/recipes/configs/phi4/evaluation.yaml
@@ -14,17 +14,21 @@ checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
   checkpoint_dir: /tmp/phi-4
   checkpoint_files: [
-    model-00001-of-00002.safetensors,
-    model-00002-of-00002.safetensors
+    model-00001-of-00006.safetensors,
+    model-00002-of-00006.safetensors,
+    model-00003-of-00006.safetensors,
+    model-00004-of-00006.safetensors,
+    model-00005-of-00006.safetensors,
+    model-00006-of-00006.safetensors,
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
-  model_type: PHI3_MINI
+  model_type: PHI4
 resume_from_checkpoint: False
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.phi4.phi4_14b_tokenizer
+  _component_: torchtune.models.phi4.phi4_tokenizer
   vocab_path: /tmp/phi-4/vocab.json
   merges_path: /tmp/phi-4/merges.txt
   max_seq_len: null
diff --git a/recipes/configs/phi4/full.yaml b/recipes/configs/phi4/full.yaml
@@ -25,7 +25,7 @@ model:
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.phi4.phi4_14b_tokenizer
+  _component_: torchtune.models.phi4.phi4_tokenizer
   vocab_path: /tmp/phi-4/vocab.json
   merges_path: /tmp/phi-4/merges.txt
   max_seq_len: null
@@ -44,7 +44,7 @@ checkpointer:
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
-  model_type: PHI3_MINI
+  model_type: PHI4
 resume_from_checkpoint: False
 
 # Dataset
diff --git a/recipes/configs/phi4/full_low_memory.yaml b/recipes/configs/phi4/full_low_memory.yaml
@@ -27,7 +27,7 @@ model:
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.phi4.phi4_14b_tokenizer
+  _component_: torchtune.models.phi4.phi4_tokenizer
   vocab_path: /tmp/phi-4/vocab.json
   merges_path: /tmp/phi-4/merges.txt
   max_seq_len: null
@@ -46,7 +46,7 @@ checkpointer:
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
-  model_type: PHI3_MINI
+  model_type: PHI4
 resume_from_checkpoint: False
 
 # Dataset
diff --git a/recipes/configs/phi4/lora.yaml b/recipes/configs/phi4/lora.yaml
@@ -31,7 +31,7 @@ model:
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.phi4.phi4_14b_tokenizer
+  _component_: torchtune.models.phi4.phi4_tokenizer
   vocab_path: /tmp/phi-4/vocab.json
   merges_path: /tmp/phi-4/merges.txt
   max_seq_len: null
@@ -50,7 +50,7 @@ checkpointer:
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
-  model_type: PHI3_MINI
+  model_type: PHI4
 resume_from_checkpoint: False
 save_adapter_weights_only: False
 
diff --git a/recipes/configs/phi4/lora_single_device.yaml b/recipes/configs/phi4/lora_single_device.yaml
@@ -29,7 +29,7 @@ model:
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.phi4.phi4_14b_tokenizer
+  _component_: torchtune.models.phi4.phi4_tokenizer
   vocab_path: /tmp/phi-4/vocab.json
   merges_path: /tmp/phi-4/merges.txt
   max_seq_len: null
@@ -48,7 +48,7 @@ checkpointer:
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
-  model_type: PHI3_MINI
+  model_type: PHI4
 resume_from_checkpoint: False
 save_adapter_weights_only: False
 
diff --git a/recipes/configs/phi4/qlora_single_device.yaml b/recipes/configs/phi4/qlora_single_device.yaml
@@ -29,7 +29,7 @@ model:
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.phi4.phi4_14b_tokenizer
+  _component_: torchtune.models.phi4.phi4_tokenizer
   vocab_path: /tmp/phi-4/vocab.json
   merges_path: /tmp/phi-4/merges.txt
   max_seq_len: null
@@ -48,7 +48,7 @@ checkpointer:
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
-  model_type: PHI3_MINI
+  model_type: PHI4
 resume_from_checkpoint: False
 save_adapter_weights_only: False
 
diff --git a/tests/torchtune/models/phi4/test_phi4_tokenizer.py b/tests/torchtune/models/phi4/test_phi4_tokenizer.py
@@ -10,14 +10,14 @@
 
 from tests.common import ASSETS
 from torchtune.data import Message
-from torchtune.models.phi4 import phi4_14b_tokenizer
+from torchtune.models.phi4 import phi4_tokenizer
 
 
-class TestPhi4MiniTokenizer:
+class TestPhi4Tokenizer:
     @pytest.fixture
     def tokenizer(self):
         # GPT2BaseTokenizer
-        return phi4_14b_tokenizer(
+        return phi4_tokenizer(
             vocab_path=(ASSETS / "vocab.json"),
             merges_path=(ASSETS / "merges.txt"),
         )
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
@@ -522,6 +522,10 @@ class Recipe:
                 name="gemma/evaluation",
                 file_path="gemma/evaluation.yaml",
             ),
+            Config(
+                name="phi4/evaluation",
+                file_path="phi4/evaluation.yaml",
+            ),
             Config(
                 name="phi3/evaluation",
                 file_path="phi3/evaluation.yaml",
diff --git a/torchtune/models/phi4/__init__.py b/torchtune/models/phi4/__init__.py
@@ -4,10 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._model_builders import lora_phi4_14b, phi4_14b, phi4_14b_tokenizer  # noqa
+from ._model_builders import lora_phi4_14b, phi4_14b, phi4_tokenizer  # noqa
 
 __all__ = [
     "phi4_14b",
-    "phi4_14b_tokenizer",
+    "phi4_tokenizer",
     "lora_phi4_14b",
 ]
diff --git a/torchtune/models/phi4/_model_builders.py b/torchtune/models/phi4/_model_builders.py
@@ -1,14 +1,14 @@
+from functools import partial
 from typing import List, Optional
 
-from torchtune.models.phi3._component_builders import phi3, lora_phi3
-from torchtune.models.phi4._tokenizer import Phi4MiniTokenizer
+from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType
+
+from torchtune.models.phi3._component_builders import lora_phi3, phi3
+from torchtune.models.phi4._tokenizer import Phi4Tokenizer
 
 from torchtune.modules import TransformerDecoder
 from torchtune.modules.peft import LORA_ATTN_MODULES
-from functools import partial
 from torchtune.modules.tokenizers import parse_hf_tokenizer_json
-from torchtune.data._prompt_templates import _TemplateType
-from torchtune.data._prompt_templates import _get_prompt_template
 
 
 """
@@ -36,13 +36,21 @@ def phi4_14b() -> TransformerDecoder:
         norm_eps=1e-5,
     )
 
-def phi4_14b_tokenizer(vocab_path: str = None, merges_path: str = None, special_tokens_path: Optional[str] = None, max_seq_len: Optional[int] = None, prompt_template: Optional[_TemplateType] = None, truncation_type: str = "right") -> Phi4MiniTokenizer:
-    """Phi4 (14B) tokenizer.
+
+def phi4_tokenizer(
+    vocab_path: str = None,
+    merges_path: str = None,
+    special_tokens_path: Optional[str] = None,
+    max_seq_len: Optional[int] = None,
+    prompt_template: Optional[_TemplateType] = None,
+    truncation_type: str = "right",
+) -> Phi4Tokenizer:
+    """Phi4 tokenizer.
     Args:
         vocab_path (str): Path to vocab.json.
         merges_path (str): Path to merges.txt.
         special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face
-            model files that contains all registered special tokens, or a local json file 
+            model files that contains all registered special tokens, or a local json file
             structured similarly. Default is None to use the canonical Phi4 special tokens.
         max_seq_len (Optional[int]): maximum sequence length for tokenizing a single list of messages,
             after which the input will be truncated. Default is None.
@@ -54,11 +62,24 @@ def phi4_14b_tokenizer(vocab_path: str = None, merges_path: str = None, special_
             Default is "right".
 
     Returns:
-        Phi4MiniTokenizer: Instantiation of the Phi-4 (14B) tokenizer.
+        Phi4Tokenizer: Instantiation of the Phi-4 tokenizer.
     """
-    special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None
-    template = _get_prompt_template(prompt_template) if prompt_template is not None else None
-    return Phi4MiniTokenizer(vocab_path=vocab_path, merges_path=merges_path, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template, truncation_type=truncation_type)
+    special_tokens = (
+        parse_hf_tokenizer_json(special_tokens_path)
+        if special_tokens_path is not None
+        else None
+    )
+    template = (
+        _get_prompt_template(prompt_template) if prompt_template is not None else None
+    )
+    return Phi4Tokenizer(
+        vocab_path=vocab_path,
+        merges_path=merges_path,
+        special_tokens=special_tokens,
+        max_seq_len=max_seq_len,
+        prompt_template=template,
+        truncation_type=truncation_type,
+    )
 
 
 def lora_phi4_14b(
diff --git a/torchtune/models/phi4/_tokenizer.py b/torchtune/models/phi4/_tokenizer.py
@@ -37,7 +37,7 @@
 CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""  # noqa
 
 
-class Phi4MiniTokenizer(ModelTokenizer, Transform):
+class Phi4Tokenizer(ModelTokenizer, Transform):
     """
     TikToken tokenizer configured with Phi4 (14B) special tokens.
 
diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py
@@ -530,10 +530,10 @@ def load_checkpoint(self) -> Dict[str, Any]:
             # delete the state_dict to free up memory; TODO check if this del is needed
             del state_dict
             gc.collect()
-        if self._model_type == ModelType.PHI3_MINI:
+        if self._model_type in (ModelType.PHI3_MINI, ModelType.PHI4):
             log_rank_zero(
                 logger=logger,
-                msg="Converting Phi-3 Mini weights from HF format."
+                msg="Converting Phi weights from HF format."
                 "Note that conversion of adapter weights into PEFT format is not supported.",
             )
             from torchtune.models.phi3._convert_weights import phi3_hf_to_tune
@@ -661,7 +661,7 @@ def save_checkpoint(
         """
         # convert the state_dict back to hf format; do this inplace
         if not adapter_only:
-            if self._model_type == ModelType.PHI3_MINI:
+            if self._model_type in (ModelType.PHI3_MINI, ModelType.PHI4):
                 from torchtune.models.phi3._convert_weights import phi3_tune_to_hf
 
                 state_dict[training.MODEL_KEY] = phi3_tune_to_hf(
@@ -817,9 +817,9 @@ def save_checkpoint(
                 f"saved to {output_path}"
             )
 
-            if self._model_type == ModelType.PHI3_MINI:
+            if self._model_type in (ModelType.PHI3_MINI, ModelType.PHI4):
                 logger.warning(
-                    "Saving Phi-3 Mini adapter weights to PEFT format is not supported, saving to torchtune format instead"
+                    "Saving Phi adapter weights to PEFT format is not supported, saving to torchtune format instead"
                 )
             elif self._model_type == ModelType.LLAMA3_VISION:
                 logger.warning(
@@ -860,9 +860,9 @@ def save_checkpoint(
             )
 
         if training.ADAPTER_CONFIG in state_dict:
-            if self._model_type == ModelType.PHI3_MINI:
+            if self._model_type in (ModelType.PHI3_MINI, ModelType.PHI4):
                 logger.warning(
-                    "PEFT integration for Phi-3 Mini is not supported, skipping adapter config save"
+                    "PEFT integration for Phi is not supported, skipping adapter config save"
                 )
             elif self._model_type == ModelType.LLAMA3_VISION:
                 logger.warning(
diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py
@@ -90,6 +90,7 @@ class ModelType(Enum):
         LLAMA3_VISION (str): LLama3 vision family of models. See :func:`~torchtune.models.llama3_2_vision.llama3_2_vision_decoder`
         MISTRAL (str): Mistral family of models. See :func:`~torchtune.models.mistral.mistral`
         PHI3_MINI (str): Phi-3 family of models. See :func:`~torchtune.models.phi3.phi3`
+        PHI4 (str): Phi-4 family of models. See :func:`~torchtune.models.phi4.phi4_14b`
         REWARD (str): A Llama2, Llama3, or Mistral model with a classification head projecting
             to a single class for reward modelling.
             See :func:`~torchtune.models.mistral.mistral_reward_7b` or :func:`~torchtune.models.llama2.llama2_reward_7b`
@@ -113,6 +114,7 @@ class ModelType(Enum):
     LLAMA3_VISION: str = "llama3_vision"
     MISTRAL: str = "mistral"
     PHI3_MINI: str = "phi3_mini"
+    PHI4: str = "phi4"
     REWARD: str = "reward"
     QWEN2: str = "qwen2"
     CLIP_TEXT: str = "clip_text"