@@ -1013,6 +1013,15 @@ public override OperationStatus Decode(IEnumerable<int> ids, Span<char> destinat
10131013 private const string IMStart = "<|im_start|>" ;
10141014 private const string IMEnd = "<|im_end|>" ;
10151015 private const string IMSep = "<|im_sep|>" ;
// Special-token literals that CreateHarmonyEncodingSpecialTokens maps to fixed ids
// for the O200kHarmony encoding.
1016+ private const string StartOfText = "<|startoftext|>" ;
1017+ private const string Return = "<|return|>" ;
1018+ private const string Constrain = "<|constrain|>" ;
1019+ private const string Channel = "<|channel|>" ;
1020+ private const string Start = "<|start|>" ;
1021+ private const string End = "<|end|>" ;
1022+ private const string Message = "<|message|>" ;
1023+ private const string Call = "<|call|>" ;
// Leading part of a reserved-token name; CreateHarmonyEncodingSpecialTokens appends
// the numeric id and a closing "|>" to form the full token string.
1024+ private const string ReservedPrefix = "<|reserved_" ;
10161025
10171026 private enum ModelEncoding
10181027 {
@@ -1022,40 +1031,69 @@ private enum ModelEncoding
10221031 P50kEdit ,
10231032 R50kBase ,
10241033 GPT2 ,
1025- O200kBase
1034+ O200kBase ,
1035+ O200kHarmony
10261036 }
10271037
// Model name used as the key of the phi-4 entry in _modelToEncoding (mapped to Cl100kBase).
10281038 private const string Phi4ModelName = "phi-4" ;
10291039
/// <summary>
/// Maps model-name prefixes to the encoding used by every model whose name starts with that prefix.
/// </summary>
/// <remarks>
/// Order is significant when one prefix is itself a prefix of another: the more specific entry
/// (for example "ft:gpt-4o" or "ft:gpt-4.1") must appear before the more general one ("ft:gpt-4").
/// NOTE(review): this assumes the lookup returns the first matching entry in array order — confirm
/// against GetModelEncoding.
/// </remarks>
private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding =
[
    // reasoning
    ("o1-", ModelEncoding.O200kBase),               // e.g., o1-mini
    ("o3-", ModelEncoding.O200kBase),               // e.g., o3-mini
    ("o4-mini-", ModelEncoding.O200kBase),          // e.g., o4-mini-2025-04-16

    // chat
    ("gpt-5-", ModelEncoding.O200kBase),            // e.g., gpt-5-mini
    ("gpt-4.1-", ModelEncoding.O200kBase),          // e.g., gpt-4.1-mini
    ("gpt-4.5-", ModelEncoding.O200kBase),          // e.g., gpt-4.5-preview
    ("gpt-4o-", ModelEncoding.O200kBase),           // e.g., gpt-4o-2024-05-13
    ("chatgpt-4o-", ModelEncoding.O200kBase),       // e.g., chatgpt-4o-latest
    ("gpt-4-", ModelEncoding.Cl100kBase),           // e.g., gpt-4-0314, etc., plus gpt-4-32k
    ("gpt-3.5-", ModelEncoding.Cl100kBase),         // e.g., gpt-3.5-turbo-0301, -0401, etc.
    ("gpt-35-", ModelEncoding.Cl100kBase),          // Azure deployment name
    ("gpt-oss-", ModelEncoding.O200kHarmony),       // e.g., gpt-oss-120b

    // fine-tuned
    ("ft:gpt-4o", ModelEncoding.O200kBase),
    // Fine-tuned gpt-4.1 models share the base model's O200kBase encoding; without this entry
    // they would fall through to the "ft:gpt-4" prefix below and resolve to Cl100kBase.
    ("ft:gpt-4.1", ModelEncoding.O200kBase),
    ("ft:gpt-4", ModelEncoding.Cl100kBase),
    ("ft:gpt-3.5-turbo", ModelEncoding.Cl100kBase),
    ("ft:davinci-002", ModelEncoding.Cl100kBase),
    ("ft:babbage-002", ModelEncoding.Cl100kBase),
];
10411064
10421065 private static readonly Dictionary < string , ModelEncoding > _modelToEncoding =
10431066 new Dictionary < string , ModelEncoding > ( StringComparer . OrdinalIgnoreCase )
10441067 {
1045- // chat
1046- { "gpt-4o" , ModelEncoding . O200kBase } ,
1068+ // reasoning
10471069 { "o1" , ModelEncoding . O200kBase } ,
10481070 { "o3" , ModelEncoding . O200kBase } ,
10491071 { "o4-mini" , ModelEncoding . O200kBase } ,
1072+
1073+ // chat
1074+ { "gpt-5" , ModelEncoding . O200kBase } ,
10501075 { "gpt-4.1" , ModelEncoding . O200kBase } ,
1076+ { "gpt-4o" , ModelEncoding . O200kBase } ,
10511077 { "gpt-4" , ModelEncoding . Cl100kBase } ,
10521078 { "gpt-3.5-turbo" , ModelEncoding . Cl100kBase } ,
1079+ { "gpt-3.5" , ModelEncoding . Cl100kBase } ,
10531080 { "gpt-3.5-turbo-16k" , ModelEncoding . Cl100kBase } ,
10541081 { "gpt-35" , ModelEncoding . Cl100kBase } , // Azure deployment name
10551082 { "gpt-35-turbo" , ModelEncoding . Cl100kBase } , // Azure deployment name
10561083 { "gpt-35-turbo-16k" , ModelEncoding . Cl100kBase } , // Azure deployment name
10571084
1058- // text
1085+ // Base
1086+ { "davinci-002" , ModelEncoding . Cl100kBase } ,
1087+ { "babbage-002" , ModelEncoding . Cl100kBase } ,
1088+
1089+ // embeddings
1090+ // https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
1091+ { "text-embedding-ada-002" , ModelEncoding . Cl100kBase } ,
1092+ { "text-embedding-3-small" , ModelEncoding . Cl100kBase } ,
1093+ { "text-embedding-3-large" , ModelEncoding . Cl100kBase } ,
1094+
1095+ // DEPRECATED MODELS
1096+ // text (DEPRECATED)
10591097 { "text-davinci-003" , ModelEncoding . P50kBase } ,
10601098 { "text-davinci-002" , ModelEncoding . P50kBase } ,
10611099 { "text-davinci-001" , ModelEncoding . R50kBase } ,
@@ -1067,25 +1105,20 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
10671105 { "babbage" , ModelEncoding . R50kBase } ,
10681106 { "ada" , ModelEncoding . R50kBase } ,
10691107
1070- // code
1108+ // code (DEPRECATED)
10711109 { "code-davinci-002" , ModelEncoding . P50kBase } ,
10721110 { "code-davinci-001" , ModelEncoding . P50kBase } ,
10731111 { "code-cushman-002" , ModelEncoding . P50kBase } ,
10741112 { "code-cushman-001" , ModelEncoding . P50kBase } ,
10751113 { "davinci-codex" , ModelEncoding . P50kBase } ,
10761114 { "cushman-codex" , ModelEncoding . P50kBase } ,
10771115
1078- // edit
1116+ // edit (DEPRECATED)
10791117 { "text-davinci-edit-001" , ModelEncoding . P50kEdit } ,
10801118 { "code-davinci-edit-001" , ModelEncoding . P50kEdit } ,
10811119
1082- // embeddings
1083- // https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
1084- { "text-embedding-ada-002" , ModelEncoding . Cl100kBase } ,
1085- { "text-embedding-3-small" , ModelEncoding . Cl100kBase } ,
1086- { "text-embedding-3-large" , ModelEncoding . Cl100kBase } ,
10871120
1088- // old embeddings
1121+ // old embeddings (DEPRECATED)
10891122 { "text-similarity-davinci-001" , ModelEncoding . R50kBase } ,
10901123 { "text-similarity-curie-001" , ModelEncoding . R50kBase } ,
10911124 { "text-similarity-babbage-001" , ModelEncoding . R50kBase } ,
@@ -1099,6 +1132,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
10991132
11001133 // open source
11011134 { "gpt2" , ModelEncoding . GPT2 } ,
1135+ { "gpt-2" , ModelEncoding . GPT2 } ,
11021136
11031137 // phi-4
11041138 { Phi4ModelName , ModelEncoding . Cl100kBase } ,
@@ -1126,6 +1160,32 @@ private static ModelEncoding GetModelEncoding(string modelName)
11261160 return encoder ;
11271161 }
11281162
/// <summary>
/// Builds the special-token table returned for <see cref="ModelEncoding.O200kHarmony"/>.
/// </summary>
/// <returns>
/// A newly allocated dictionary mapping each special-token string to its id. The ids cover
/// 199998 through 200018 with no gaps; slots without a dedicated name use the reserved-token form
/// (<c>ReservedPrefix</c> + id + "|>").
/// </returns>
private static Dictionary<string, int> CreateHarmonyEncodingSpecialTokens()
{
    const string reservedSuffix = "|>";
    var harmonyTokens = new Dictionary<string, int>();

    harmonyTokens.Add(StartOfText, 199998);
    harmonyTokens.Add(EndOfText, 199999);
    harmonyTokens.Add(ReservedPrefix + "200000" + reservedSuffix, 200000);
    harmonyTokens.Add(ReservedPrefix + "200001" + reservedSuffix, 200001);
    harmonyTokens.Add(Return, 200002);
    harmonyTokens.Add(Constrain, 200003);
    harmonyTokens.Add(ReservedPrefix + "200004" + reservedSuffix, 200004);
    harmonyTokens.Add(Channel, 200005);
    harmonyTokens.Add(Start, 200006);
    harmonyTokens.Add(End, 200007);
    harmonyTokens.Add(Message, 200008);
    harmonyTokens.Add(ReservedPrefix + "200009" + reservedSuffix, 200009);
    harmonyTokens.Add(ReservedPrefix + "200010" + reservedSuffix, 200010);
    harmonyTokens.Add(ReservedPrefix + "200011" + reservedSuffix, 200011);
    harmonyTokens.Add(Call, 200012);
    harmonyTokens.Add(ReservedPrefix + "200013" + reservedSuffix, 200013);
    harmonyTokens.Add(ReservedPrefix + "200014" + reservedSuffix, 200014);
    harmonyTokens.Add(ReservedPrefix + "200015" + reservedSuffix, 200015);
    harmonyTokens.Add(ReservedPrefix + "200016" + reservedSuffix, 200016);
    harmonyTokens.Add(ReservedPrefix + "200017" + reservedSuffix, 200017);
    harmonyTokens.Add(EndOfPrompt, 200018);

    return harmonyTokens;
}
1188+
/// <summary>
/// Resolves the tiktoken configuration for a model identified by name.
/// </summary>
/// <param name="modelName">Model name; passed to GetModelEncoding to determine the encoding.</param>
/// <returns>
/// The configuration tuple produced by the encoding-based overload for the resolved encoding.
/// </returns>
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName)
{
    // Resolve the encoding first, then delegate; the name is forwarded so the overload can
    // include it in its "not supported" error message.
    ModelEncoding resolvedEncoding = GetModelEncoding(modelName);
    return GetTiktokenConfigurations(resolvedEncoding, modelName);
}
11301190
11311191 private static ( Dictionary < string , int > SpecialTokens , Regex Regex , string VocabFile , Type ? DataType , string PackageName ) GetTiktokenConfigurations ( ModelEncoding modelEncoding , string ? modelName = null )
@@ -1157,6 +1217,9 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
11571217 case ModelEncoding . R50kBase :
11581218 return ( new Dictionary < string , int > { { EndOfText , 50256 } } , P50kBaseRegex ( ) , R50RanksFile , Type . GetType ( R50kBaseTypeName ) , R50kBasePackageName ) ;
11591219
1220+ case ModelEncoding . O200kHarmony :
1221+ return ( CreateHarmonyEncodingSpecialTokens ( ) , O200kBaseRegex ( ) , O200kBaseFile , Type . GetType ( O200kBaseTypeName ) , O200kBasePackageName ) ;
1222+
11601223 default :
11611224 throw new NotSupportedException ( $ "The model '{ modelName ?? modelEncoding . ToString ( ) } ' is not supported.") ;
11621225 }
@@ -1179,6 +1242,7 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
// Encoding-name strings. CreateForEncoding compares its encodingName argument against these
// using StringComparison.OrdinalIgnoreCase (visible here for the o200k_harmony and p50k_base names).
11791242 internal const string P50kEditEncodingName = "p50k_edit" ;
11801243 internal const string R50kBaseEncodingName = "r50k_base" ;
11811244 internal const string O200kBaseEncodingName = "o200k_base" ;
1245+ internal const string O200kHarmonyEncodingName = "o200k_harmony" ;
11821246
11831247 internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase" ;
11841248 internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2" ;
@@ -1474,6 +1538,10 @@ public static TiktokenTokenizer CreateForEncoding(string encodingName, IReadOnly
14741538 {
14751539 modelEncoding = ModelEncoding . O200kBase ;
14761540 }
1541+ else if ( encodingName . Equals ( O200kHarmonyEncodingName , StringComparison . OrdinalIgnoreCase ) )
1542+ {
1543+ modelEncoding = ModelEncoding . O200kHarmony ;
1544+ }
14771545 else if ( encodingName . Equals ( P50kBaseEncodingName , StringComparison . OrdinalIgnoreCase ) )
14781546 {
14791547 modelEncoding = ModelEncoding . P50kBase ;
0 commit comments