1
+ from functools import partial
1
2
from typing import List , Optional
2
3
3
- from torchtune .models .phi3 ._component_builders import phi3 , lora_phi3
4
- from torchtune .models .phi4 ._tokenizer import Phi4MiniTokenizer
4
+ from torchtune .data ._prompt_templates import _get_prompt_template , _TemplateType
5
+
6
+ from torchtune .models .phi3 ._component_builders import lora_phi3 , phi3
7
+ from torchtune .models .phi4 ._tokenizer import Phi4Tokenizer
5
8
6
9
from torchtune .modules import TransformerDecoder
7
10
from torchtune .modules .peft import LORA_ATTN_MODULES
8
- from functools import partial
9
11
from torchtune .modules .tokenizers import parse_hf_tokenizer_json
10
- from torchtune .data ._prompt_templates import _TemplateType
11
- from torchtune .data ._prompt_templates import _get_prompt_template
12
12
13
13
14
14
"""
@@ -36,13 +36,21 @@ def phi4_14b() -> TransformerDecoder:
36
36
norm_eps = 1e-5 ,
37
37
)
38
38
39
def phi4_tokenizer(
    vocab_path: Optional[str] = None,
    merges_path: Optional[str] = None,
    special_tokens_path: Optional[str] = None,
    max_seq_len: Optional[int] = None,
    prompt_template: Optional[_TemplateType] = None,
    truncation_type: str = "right",
) -> Phi4Tokenizer:
    """Phi4 tokenizer.

    Args:
        vocab_path (Optional[str]): Path to vocab.json.
        merges_path (Optional[str]): Path to merges.txt.
        special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face
            model files that contains all registered special tokens, or a local json file
            structured similarly. Default is None to use the canonical Phi4 special tokens.
        max_seq_len (Optional[int]): maximum sequence length for tokenizing a single list of messages,
            after which the input will be truncated. Default is None.
        prompt_template (Optional[_TemplateType]): optional specified prompt template to
            resolve via ``_get_prompt_template``. Default is None.
        truncation_type (str): type of truncation to apply to the tokenized sequence.
            Default is "right".

    Returns:
        Phi4Tokenizer: Instantiation of the Phi-4 (14B) tokenizer.
    """
    # No special_tokens_path means "use the canonical Phi4 special tokens"
    # baked into the tokenizer class (passing None defers to its defaults).
    special_tokens = (
        parse_hf_tokenizer_json(special_tokens_path)
        if special_tokens_path is not None
        else None
    )
    # Resolve a dotpath string / mapping into a concrete template instance.
    template = (
        _get_prompt_template(prompt_template) if prompt_template is not None else None
    )
    return Phi4Tokenizer(
        vocab_path=vocab_path,
        merges_path=merges_path,
        special_tokens=special_tokens,
        max_seq_len=max_seq_len,
        prompt_template=template,
        truncation_type=truncation_type,
    )
62
83
63
84
64
85
def lora_phi4_14b (
0 commit comments