Skip to content

Commit 3834de5

Browse files
nt-7 and shengyfu authored
Refactor: Migrate from deprecated WebClient to async HttpClient (#6)
Signed-off-by: Shengyu Fu <[email protected]> Co-authored-by: Shengyu Fu <[email protected]>
1 parent 8f45ca9 commit 3834de5

File tree

5 files changed

+37
-27
lines changed

5 files changed

+37
-27
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ var specialTokens = new Dictionary<string, int>{
2020
{ IM_START, 100264},
2121
{ IM_END, 100265},
2222
};
23-
tokenizer = TokenizerBuilder.CreateByModelName("gpt-4", specialTokens);
23+
tokenizer = await TokenizerBuilder.CreateByModelNameAsync("gpt-4", specialTokens);
2424

2525
var text = "<|im_start|>Hello World<|im_end|>";
2626
var encoded = tokenizer.Encode(text, new HashSet<string>(specialTokens.Keys));

Tokenizer_C#/PerfBenchmark/Program.cs

+6-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ public class Tokenization
99
{
1010
private List<String> Words;
1111
private readonly string data;
12-
private readonly ITokenizer Tokenizer = TokenizerBuilder.CreateByModelName("gpt-4");
12+
private ITokenizer Tokenizer;
1313

1414
public Tokenization()
1515
{
@@ -31,6 +31,11 @@ public Tokenization()
3131
[Benchmark]
3232
public List<int> Encode() => Tokenizer.Encode(data, new HashSet<string>());
3333

34+
[GlobalSetup]
35+
public async Task GlobalSetup()
36+
{
37+
this.Tokenizer = await TokenizerBuilder.CreateByModelNameAsync("gpt-4");
38+
}
3439
}
3540

3641
public class Program

Tokenizer_C#/Tokenizer/Program.cs

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System;
22
using System.Collections.Generic;
3+
using System.Threading.Tasks;
34

45
namespace Microsoft.DeepDev
56
{
@@ -10,11 +11,11 @@ internal class Program
1011
/// Example usage: Tokenizer.exe "gpt-3.5-turbo" "hello, world"
1112
/// </summary>
1213
/// <param name="args">args[0] -- model name, args[1] -- string to be encoded</param>
13-
static void Main(string[] args)
14+
static async Task Main(string[] args)
1415
{
1516
try
1617
{
17-
var tokenizer = TokenizerBuilder.CreateByModelName(args[0]);
18+
var tokenizer = await TokenizerBuilder.CreateByModelNameAsync(args[0]);
1819
Console.WriteLine($"Tokenizing: [{args[1]}]");
1920
var encoded = tokenizer.Encode(args[1], new List<string>());
2021
for (var i = 0; i < encoded.Count; i++)

Tokenizer_C#/TokenizerLib/TokenizerBuilder.cs

+14-11
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
using System.IO;
77
using System.Net;
88
using System.Reflection;
9+
using System.Net.Http;
10+
using System.Threading.Tasks;
911

1012
namespace Microsoft.DeepDev
1113
{
@@ -69,13 +71,15 @@ public static class TokenizerBuilder
6971
private const string FIM_SUFFIX = "<|fim_suffix|>";
7072
private const string ENDOFPROMPT = "<|endofprompt|>";
7173

74+
private static readonly HttpClient _httpClient = new HttpClient();
75+
7276
/// <summary>
7377
/// Create tokenizer based on model name and extra special tokens
7478
/// </summary>
7579
/// <param name="modelName">Model name</param>
7680
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the model</param>
7781
/// <returns>The tokenizer</returns>
78-
public static ITokenizer CreateByModelName(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null)
82+
public static async Task<ITokenizer> CreateByModelNameAsync(string modelName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null)
7983
{
8084
var encoder = "";
8185
if (!MODEL_TO_ENCODING.TryGetValue(modelName, out encoder))
@@ -89,7 +93,7 @@ public static ITokenizer CreateByModelName(string modelName, IReadOnlyDictionary
8993
}
9094
}
9195
}
92-
return CreateByEncoderName(encoder, extraSpecialTokens);
96+
return await CreateByEncoderNameAsync(encoder, extraSpecialTokens);
9397

9498
}
9599

@@ -100,7 +104,7 @@ public static ITokenizer CreateByModelName(string modelName, IReadOnlyDictionary
100104
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the encoder</param>
101105
/// <returns>The tokenizer</returns>
102106
/// <exception cref="NotImplementedException">Throws if the encoder is not supported</exception>
103-
public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null)
107+
public static async Task<ITokenizer> CreateByEncoderNameAsync(string encoderName, IReadOnlyDictionary<string, int>? extraSpecialTokens = null)
104108
{
105109
switch (encoderName)
106110
{
@@ -119,7 +123,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
119123
specialTokens = specialTokens.Concat(extraSpecialTokens)
120124
.ToDictionary(pair => pair.Key, pair => pair.Value);
121125
}
122-
return CreateTokenizer(regexPatternStr, mergeableRanksFileUrl, specialTokens);
126+
return await CreateTokenizerAsync(regexPatternStr, mergeableRanksFileUrl, specialTokens);
123127
case "p50k_base":
124128
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
125129
mergeableRanksFileUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
@@ -131,7 +135,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
131135
specialTokens = specialTokens.Concat(extraSpecialTokens)
132136
.ToDictionary(pair => pair.Key, pair => pair.Value);
133137
}
134-
return CreateTokenizer(regexPatternStr, mergeableRanksFileUrl, specialTokens);
138+
return await CreateTokenizerAsync(regexPatternStr, mergeableRanksFileUrl, specialTokens);
135139
case "p50k_edit":
136140
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
137141
mergeableRanksFileUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
@@ -146,7 +150,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
146150
specialTokens = specialTokens.Concat(extraSpecialTokens)
147151
.ToDictionary(pair => pair.Key, pair => pair.Value);
148152
}
149-
return CreateTokenizer(regexPatternStr, mergeableRanksFileUrl, specialTokens);
153+
return await CreateTokenizerAsync(regexPatternStr, mergeableRanksFileUrl, specialTokens);
150154
case "r50k_base":
151155
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
152156
mergeableRanksFileUrl = @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken";
@@ -158,7 +162,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
158162
specialTokens = specialTokens.Concat(extraSpecialTokens)
159163
.ToDictionary(pair => pair.Key, pair => pair.Value);
160164
}
161-
return CreateTokenizer(regexPatternStr, mergeableRanksFileUrl, specialTokens);
165+
return await CreateTokenizerAsync(regexPatternStr, mergeableRanksFileUrl, specialTokens);
162166
case "gpt2":
163167
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
164168
mergeableRanksFileUrl = @"https://pythia.blob.core.windows.net/public/encoding/gpt2.tiktoken";
@@ -170,7 +174,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
170174
specialTokens = specialTokens.Concat(extraSpecialTokens)
171175
.ToDictionary(pair => pair.Key, pair => pair.Value);
172176
}
173-
return CreateTokenizer(regexPatternStr, mergeableRanksFileUrl, specialTokens);
177+
return await CreateTokenizerAsync(regexPatternStr, mergeableRanksFileUrl, specialTokens);
174178
default:
175179
throw new NotImplementedException($"Doesn't support this encoder [{encoderName}]");
176180

@@ -186,10 +190,9 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
186190
/// <param name="mergeableRanksFileUrl">BPE rank file</param>
187191
/// <param name="specialTokens">Special tokens mapping</param>
188192
/// <returns>The tokenizer</returns>
189-
private static ITokenizer CreateTokenizer(string regexPatternStr, string mergeableRanksFileUrl, Dictionary<string, int> specialTokens)
193+
private static async Task<ITokenizer> CreateTokenizerAsync(string regexPatternStr, string mergeableRanksFileUrl, Dictionary<string, int> specialTokens)
190194
{
191-
using (WebClient client = new WebClient())
192-
using (Stream stream = client.OpenRead(mergeableRanksFileUrl))
195+
using (Stream stream = await _httpClient.GetStreamAsync(mergeableRanksFileUrl))
193196
{
194197
return CreateTokenizer(stream, specialTokens, regexPatternStr);
195198
}

Tokenizer_C#/TokenizerTest/TikTokenizerUnitTest.cs

+13-12
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,18 @@
55
using Newtonsoft.Json;
66
using System.Collections.Generic;
77
using System.IO;
8-
8+
using System.Threading.Tasks;
99

1010
namespace TokenizerTest
1111
{
1212
[TestClass]
1313
public class TikTokenizerUnitTest
1414
{
15-
private readonly ITokenizer Tokenizer;
16-
private readonly ITokenizer Tokenizer_gpt2;
17-
private readonly ITokenizer Tokenizer_p50k_base;
18-
private readonly ITokenizer Tokenizer_r50k_base;
19-
private readonly ITokenizer Tokenizer_p50k_edit;
15+
private ITokenizer Tokenizer;
16+
private ITokenizer Tokenizer_gpt2;
17+
private ITokenizer Tokenizer_p50k_base;
18+
private ITokenizer Tokenizer_r50k_base;
19+
private ITokenizer Tokenizer_p50k_edit;
2020

2121
const string IM_START = "<|im_start|>";
2222
const string IM_END = "<|im_end|>";
@@ -26,13 +26,14 @@ public class TikTokenizerUnitTest
2626
{ IM_END, 100265},
2727
};
2828

29-
public TikTokenizerUnitTest()
29+
[TestInitialize]
30+
public async Task TikTokenizerUnitTestInitialize()
3031
{
31-
Tokenizer = TokenizerBuilder.CreateByModelName("gpt-4", SpecialTokens);
32-
Tokenizer_gpt2 = TokenizerBuilder.CreateByEncoderName("gpt2");
33-
Tokenizer_p50k_base = TokenizerBuilder.CreateByEncoderName("p50k_base");
34-
Tokenizer_r50k_base = TokenizerBuilder.CreateByEncoderName("r50k_base");
35-
Tokenizer_p50k_edit = TokenizerBuilder.CreateByEncoderName("p50k_edit");
32+
Tokenizer = await TokenizerBuilder.CreateByModelNameAsync("gpt-4", SpecialTokens);
33+
Tokenizer_gpt2 = await TokenizerBuilder.CreateByEncoderNameAsync("gpt2");
34+
Tokenizer_p50k_base = await TokenizerBuilder.CreateByEncoderNameAsync("p50k_base");
35+
Tokenizer_r50k_base = await TokenizerBuilder.CreateByEncoderNameAsync("r50k_base");
36+
Tokenizer_p50k_edit = await TokenizerBuilder.CreateByEncoderNameAsync("p50k_edit");
3637
}
3738

3839
[TestMethod]

0 commit comments

Comments (0)