6
6
using System . IO ;
7
7
using System . Net ;
8
8
using System . Reflection ;
9
+ using System . Net . Http ;
10
+ using System . Threading . Tasks ;
9
11
10
12
namespace Microsoft . DeepDev
11
13
{
@@ -69,13 +71,15 @@ public static class TokenizerBuilder
69
71
private const string FIM_SUFFIX = "<|fim_suffix|>" ;
70
72
private const string ENDOFPROMPT = "<|endofprompt|>" ;
71
73
74
/// <summary>Single static HttpClient instance; CreateTokenizerAsync uses it to open the stream for the mergeable ranks file URL.</summary>
+ private static readonly HttpClient _httpClient = new HttpClient ( ) ;
75
+
72
76
/// <summary>
73
77
/// Create tokenizer based on model name and extra special tokens
74
78
/// </summary>
75
79
/// <param name="modelName">Model name</param>
76
80
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the model</param>
77
81
/// <returns>A task whose result is the tokenizer</returns>
78
- public static ITokenizer CreateByModelName ( string modelName , IReadOnlyDictionary < string , int > ? extraSpecialTokens = null )
82
+ public static async Task < ITokenizer > CreateByModelNameAsync ( string modelName , IReadOnlyDictionary < string , int > ? extraSpecialTokens = null )
79
83
{
80
84
var encoder = "" ;
81
85
if ( ! MODEL_TO_ENCODING . TryGetValue ( modelName , out encoder ) )
@@ -89,7 +93,7 @@ public static ITokenizer CreateByModelName(string modelName, IReadOnlyDictionary
89
93
}
90
94
}
91
95
}
92
- return CreateByEncoderName ( encoder , extraSpecialTokens ) ;
96
+ return await CreateByEncoderNameAsync ( encoder , extraSpecialTokens ) ;
93
97
94
98
}
95
99
@@ -100,7 +104,7 @@ public static ITokenizer CreateByModelName(string modelName, IReadOnlyDictionary
100
104
/// <param name="extraSpecialTokens">Extra special tokens other than the built-in ones for the encoder</param>
101
105
/// <returns>A task whose result is the tokenizer</returns>
102
106
/// <exception cref="NotImplementedException">Thrown if the encoder is not supported</exception>
103
- public static ITokenizer CreateByEncoderName ( string encoderName , IReadOnlyDictionary < string , int > ? extraSpecialTokens = null )
107
+ public static async Task < ITokenizer > CreateByEncoderNameAsync ( string encoderName , IReadOnlyDictionary < string , int > ? extraSpecialTokens = null )
104
108
{
105
109
switch ( encoderName )
106
110
{
@@ -119,7 +123,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
119
123
specialTokens = specialTokens . Concat ( extraSpecialTokens )
120
124
. ToDictionary ( pair => pair . Key , pair => pair . Value ) ;
121
125
}
122
- return CreateTokenizer ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
126
+ return await CreateTokenizerAsync ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
123
127
case "p50k_base" :
124
128
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
125
129
mergeableRanksFileUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" ;
@@ -131,7 +135,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
131
135
specialTokens = specialTokens . Concat ( extraSpecialTokens )
132
136
. ToDictionary ( pair => pair . Key , pair => pair . Value ) ;
133
137
}
134
- return CreateTokenizer ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
138
+ return await CreateTokenizerAsync ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
135
139
case "p50k_edit" :
136
140
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
137
141
mergeableRanksFileUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" ;
@@ -146,7 +150,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
146
150
specialTokens = specialTokens . Concat ( extraSpecialTokens )
147
151
. ToDictionary ( pair => pair . Key , pair => pair . Value ) ;
148
152
}
149
- return CreateTokenizer ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
153
+ return await CreateTokenizerAsync ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
150
154
case "r50k_base" :
151
155
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
152
156
mergeableRanksFileUrl = @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" ;
@@ -158,7 +162,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
158
162
specialTokens = specialTokens . Concat ( extraSpecialTokens )
159
163
. ToDictionary ( pair => pair . Key , pair => pair . Value ) ;
160
164
}
161
- return CreateTokenizer ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
165
+ return await CreateTokenizerAsync ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
162
166
case "gpt2" :
163
167
regexPatternStr = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
164
168
mergeableRanksFileUrl = @"https://pythia.blob.core.windows.net/public/encoding/gpt2.tiktoken" ;
@@ -170,7 +174,7 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
170
174
specialTokens = specialTokens . Concat ( extraSpecialTokens )
171
175
. ToDictionary ( pair => pair . Key , pair => pair . Value ) ;
172
176
}
173
- return CreateTokenizer ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
177
+ return await CreateTokenizerAsync ( regexPatternStr , mergeableRanksFileUrl , specialTokens ) ;
174
178
default :
175
179
throw new NotImplementedException ( $ "Doesn't support this encoder [{ encoderName } ]") ;
176
180
@@ -186,10 +190,9 @@ public static ITokenizer CreateByEncoderName(string encoderName, IReadOnlyDictio
186
190
/// <param name="mergeableRanksFileUrl">BPE rank file</param>
187
191
/// <param name="specialTokens">Special tokens mapping</param>
188
192
/// <returns>A task whose result is the tokenizer</returns>
189
- private static ITokenizer CreateTokenizer ( string regexPatternStr , string mergeableRanksFileUrl , Dictionary < string , int > specialTokens )
193
+ private static async Task < ITokenizer > CreateTokenizerAsync ( string regexPatternStr , string mergeableRanksFileUrl , Dictionary < string , int > specialTokens )
190
194
{
191
- using ( WebClient client = new WebClient ( ) )
192
- using ( Stream stream = client . OpenRead ( mergeableRanksFileUrl ) )
195
+ using ( Stream stream = await _httpClient . GetStreamAsync ( mergeableRanksFileUrl ) )
193
196
{
194
197
return CreateTokenizer ( stream , specialTokens , regexPatternStr ) ;
195
198
}
0 commit comments