Skip to content

Commit de4eba2

Browse files
authored
Support gpt-5.1 model in Tiktoken tokenizer (#7556)
* Support gpt-5.1 model in Tiktoken tokenizer
* Fix spaces
1 parent 8f9674f commit de4eba2

File tree

2 files changed

+14
-8
lines changed

2 files changed

+14
-8
lines changed

src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1044,13 +1044,14 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
10441044
( "o4-mini-", ModelEncoding.O200kBase ), // e.g. o4-mini
10451045

10461046
// chat
1047-
( "gpt-5-", ModelEncoding.O200kBase),
1048-
( "gpt-4.1-", ModelEncoding.O200kBase), // e.g., gpt-4.1-mini
1049-
( "gpt-4.5-", ModelEncoding.O200kBase), // e.g., gpt-4.5
1050-
( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13
1051-
( "chatgpt-4o-", ModelEncoding.O200kBase),
1052-
( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
1053-
( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc.
1047+
( "gpt-5.1-", ModelEncoding.O200kBase ),
1048+
( "gpt-5-", ModelEncoding.O200kBase ),
1049+
( "gpt-4.1-", ModelEncoding.O200kBase ), // e.g., gpt-4.1-mini
1050+
( "gpt-4.5-", ModelEncoding.O200kBase ), // e.g., gpt-4.5
1051+
( "gpt-4o-", ModelEncoding.O200kBase ), // e.g., gpt-4o-2024-05-13
1052+
( "chatgpt-4o-", ModelEncoding.O200kBase ),
1053+
( "gpt-4-", ModelEncoding.Cl100kBase ), // e.g., gpt-4-0314, etc., plus gpt-4-32k
1054+
( "gpt-3.5-", ModelEncoding.Cl100kBase ), // e.g, gpt-3.5-turbo-0301, -0401, etc.
10541055
( "gpt-35-", ModelEncoding.Cl100kBase ), // Azure deployment name
10551056
( "gpt-oss-", ModelEncoding.O200kHarmony ),
10561057

@@ -1071,6 +1072,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
10711072
{ "o4-mini", ModelEncoding.O200kBase },
10721073

10731074
// chat
1075+
{ "gpt-5.1", ModelEncoding.O200kBase },
10741076
{ "gpt-5", ModelEncoding.O200kBase },
10751077
{ "gpt-4.1", ModelEncoding.O200kBase },
10761078
{ "gpt-4o", ModelEncoding.O200kBase },

test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ public class TiktokenTests
3636
public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
3737
public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
3838
public static Tokenizer GPT5 { get; } = TiktokenTokenizer.CreateForModel("gpt-5");
39+
public static Tokenizer GPT5_1 { get; } = TiktokenTokenizer.CreateForModel("gpt-5.1");
3940
public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4");
4041
public static TiktokenTokenizer GptOss { get; } = TiktokenTokenizer.CreateForModel("gpt-oss-20b");
4142

@@ -286,7 +287,7 @@ public void TestEncode5()
286287
[Fact]
287288
public void TestEncodeO200kBaseEncoding()
288289
{
289-
foreach (TiktokenTokenizer tokenizer in new[] { GPT4o, GptOss, GPT5 })
290+
foreach (TiktokenTokenizer tokenizer in new[] { GPT4o, GptOss, GPT5, GPT5_1 })
290291
{
291292
string text = ReadAndSanitizeFile("./Data/lib.rs.txt");
292293
IReadOnlyList<int> encoded = tokenizer.EncodeToIds(text);
@@ -415,6 +416,8 @@ public void TestEncodeR50kBase()
415416
[InlineData("gpt-4o-")]
416417
[InlineData("gpt-5")]
417418
[InlineData("gpt-5-chat")]
419+
[InlineData("gpt-5.1")]
420+
[InlineData("gpt-5.1-mini")]
418421
[InlineData("chatgpt-4o-")]
419422
[InlineData("gpt-4")]
420423
[InlineData("gpt-4-")]
@@ -533,6 +536,7 @@ public void TestEncodingNamesNegativeCases()
533536
[InlineData("gpt-4.1")]
534537
[InlineData("gpt-4o")]
535538
[InlineData("gpt-5")]
539+
[InlineData("gpt-5.1")]
536540
[InlineData("o1")]
537541
[InlineData("o3")]
538542
[InlineData("o4-mini")]

0 commit comments

Comments
 (0)