 using System.Diagnostics;
 using System.IO;
 using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Text;
 using System.Text.Json;
-using System.Text.Json.Serialization;

 namespace Microsoft.ML.Tokenizers
 {
@@ -27,7 +24,7 @@ public sealed class EnglishRoberta : Model
         private readonly IReadOnlyDictionary<char, char> _byteToUnicode;
         private readonly IReadOnlyDictionary<char, char> _unicodeToByte;
         private readonly string[] _charToString;
-        private readonly Cache<string, IReadOnlyList<Token>> _cache;
+        private readonly Cache<string, List<Token>> _cache;

         /// <summary>
         /// Construct tokenizer object to use with the English Robert model.
@@ -72,7 +69,7 @@ public EnglishRoberta(string vocabularyPath, string mergePath, string highestOcc
             }

             _unicodeToByte = _byteToUnicode.Reverse();
-            _cache = new Cache<string, IReadOnlyList<Token>>();
+            _cache = new Cache<string, List<Token>>();
         }

         /// <summary>
@@ -110,7 +107,7 @@ public EnglishRoberta(Stream vocabularyStream, Stream mergeStream, Stream highes
             }

             _unicodeToByte = _byteToUnicode.Reverse();
-            _cache = new Cache<string, IReadOnlyList<Token>>();
+            _cache = new Cache<string, List<Token>>();
         }

         //
@@ -226,17 +223,17 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
             {
                 ArrayPool<char>.Shared.Return(token);
                 ArrayPool<int>.Shared.Return(indexMapping);
-                return Bpe.EmptyTokensList;
+                return Array.Empty<Token>();
             }

-            if (_cache.TryGet(sequence, out IReadOnlyList<Token>? hit))
+            if (_cache.TryGet(sequence, out List<Token>? hit))
             {
                 ArrayPool<char>.Shared.Return(token);
                 ArrayPool<int>.Shared.Return(indexMapping);
                 return ModifyTokenListOffsets(hit, indexMapping);
             }

-            IReadOnlyList<Token> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
+            List<Token> result = EncodeToTokens(token.AsSpan().Slice(0, newTokenIndex), indexMapping);
             _cache.Set(sequence, result);
             ArrayPool<char>.Shared.Return(token);
             ArrayPool<int>.Shared.Return(indexMapping);
@@ -261,7 +258,7 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok

         private int TokenizeToIds(string sequence, IList<int>? accumulatedIds)
         {
-            if (_cache.TryGet(sequence, out IReadOnlyList<Token>? hit))
+            if (_cache.TryGet(sequence, out List<Token>? hit))
             {
                 if (accumulatedIds is not null)
                 {
@@ -299,7 +296,7 @@ private int TokenizeToIds(string sequence, IList<int>? accumulatedIds)
                 return 0;
             }

-            IReadOnlyList<Token> result = EncodeToTokens(token.Slice(0, newTokenIndex), indexMapping);
+            List<Token> result = EncodeToTokens(token.Slice(0, newTokenIndex), indexMapping);
             _cache.Set(sequence, result);
             return result.Count;
         }
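The diff above narrows the cache's value type from `IReadOnlyList<Token>` to `List<Token>` and keeps memoizing the token list produced for each input sequence. For readers following along, here is a minimal, self-contained sketch of that caching pattern. The `SequenceCache` class, its `ConcurrentDictionary` backing store, and the simplified `Token` record are illustrative assumptions for this sketch only, not the actual `Cache` or `Token` types in Microsoft.ML.Tokenizers.

```csharp
// Sketch only: a string-keyed memo of tokenization results, assuming a
// ConcurrentDictionary backing store and a simplified Token shape.
using System.Collections.Concurrent;
using System.Collections.Generic;

public sealed record Token(int Id, string Value, (int Index, int Length) Offset);

public sealed class SequenceCache
{
    private readonly ConcurrentDictionary<string, List<Token>> _entries = new();

    // Returns the token list previously stored for this sequence, if any.
    public bool TryGet(string sequence, out List<Token>? tokens) =>
        _entries.TryGetValue(sequence, out tokens);

    // Stores the encoder's output so repeated sequences skip the BPE work.
    public void Set(string sequence, List<Token> tokens) =>
        _entries[sequence] = tokens;
}
```

Returning the mutable `List<Token>` from the cache is what lets the calling code (e.g. the offset-adjusting path in `Tokenize`) work with a concrete list without an extra copy; callers that expose the result publicly can still surface it as `IReadOnlyList<Token>`.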