@@ -28,30 +28,6 @@ public protocol PreTokenizer {
2828 /// - Returns: An array of pre-tokenized text chunks
2929 func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ]
3030
31- /// Pre-tokenizes multiple text strings.
32- ///
33- /// - Parameters:
34- /// - texts: The input texts to pre-tokenize
35- /// - options: Options controlling the pre-tokenization behavior
36- /// - Returns: An array of pre-tokenized text chunks from all inputs
37- func preTokenize( texts: [ String ] , options: PreTokenizerOptions ) -> [ String ]
38-
39- /// Function call syntax for pre-tokenizing multiple texts.
40- ///
41- /// - Parameters:
42- /// - texts: The input texts to pre-tokenize
43- /// - options: Options controlling the pre-tokenization behavior
44- /// - Returns: An array of pre-tokenized text chunks
45- func callAsFunction( texts: [ String ] , options: PreTokenizerOptions ) -> [ String ]
46-
47- /// Function call syntax for pre-tokenizing a single text.
48- ///
49- /// - Parameters:
50- /// - text: The input text to pre-tokenize
51- /// - options: Options controlling the pre-tokenization behavior
52- /// - Returns: An array of pre-tokenized text chunks
53- func callAsFunction( text: String , options: PreTokenizerOptions ) -> [ String ]
54-
5531 /// Initializes the pre-tokenizer from configuration.
5632 ///
5733 /// - Parameter config: The configuration for this pre-tokenizer
@@ -60,6 +36,11 @@ public protocol PreTokenizer {
6036}
6137
6238extension PreTokenizer {
39+ /// Convenience overload applying the default `[.firstSection]` options; kept distinct from the requirement so conformers must still implement `preTokenize(text:options:)`.
40+ func preTokenize(text: String) -> [String] {
41+ preTokenize(text: text, options: [.firstSection])
42+ }
43+
6344 func preTokenize( texts: [ String ] , options: PreTokenizerOptions = [ . firstSection] ) -> [ String ] {
6445 texts. flatMap { preTokenize ( text: $0, options: options) }
6546 }
@@ -114,7 +95,7 @@ class BertPreTokenizer: PreTokenizer {
11495 re = " [^ \\ s \( punctuationRegex) ]+|[ \( punctuationRegex) ] "
11596 }
11697
117- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
98+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
11899 text. ranges ( of: re) . map { String ( text [ $0] ) }
119100 }
120101}
@@ -129,7 +110,7 @@ class PreTokenizerSequence: PreTokenizer {
129110 preTokenizers = try configs. compactMap { try PreTokenizerFactory . fromConfig ( config: $0) }
130111 }
131112
132- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
113+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
133114 preTokenizers. reduce ( [ text] ) { current, preTokenizer in
134115 preTokenizer ( texts: current, options: options)
135116 }
@@ -143,7 +124,7 @@ class WhitespacePreTokenizer: PreTokenizer {
143124 re = #"\S+"#
144125 }
145126
146- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
127+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
147128 text. ranges ( of: re) . map { String ( text [ $0] ) }
148129 }
149130}
@@ -183,7 +164,7 @@ class MetaspacePreTokenizer: PreTokenizer {
183164
184165 /// https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
185166 /// https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
186- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
167+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
187168 let normalized = text. replacingOccurrences ( of: " " , with: stringReplacement)
188169
189170 // We add a prefix space if:
@@ -222,7 +203,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
222203 useRegex = config. useRegex. boolean ( or: true )
223204 }
224205
225- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
206+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
226207 // Split on whitespace and punctuation
227208 let tokens = useRegex ? text. ranges ( of: RE) . map { String ( text [ $0] ) } : [ text]
228209 return tokens. map { token in
@@ -243,7 +224,7 @@ class PunctuationPreTokenizer: PreTokenizer {
243224 re = " [^ \( punctuationRegex) ]+|[ \( punctuationRegex) ]+ "
244225 }
245226
246- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
227+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
247228 // Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
248229 text. ranges ( of: re) . map { String ( text [ $0] ) }
249230 }
@@ -257,7 +238,7 @@ class DigitsPreTokenizer: PreTokenizer {
257238 re = " [^ \\ d]+| \\ d \( individualDigits ? " " : " + " ) "
258239 }
259240
260- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
241+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
261242 text. ranges ( of: re) . map { String ( text [ $0] ) }
262243 }
263244}
@@ -271,7 +252,7 @@ class SplitPreTokenizer: PreTokenizer {
271252 invert = config. invert. boolean ( or: false )
272253 }
273254
274- func preTokenize( text: String , options: PreTokenizerOptions = [ . firstSection ] ) -> [ String ] {
255+ func preTokenize( text: String , options: PreTokenizerOptions ) -> [ String ] {
275256 guard let pattern else { return [ text] }
276257 return pattern. split ( text, invert: invert)
277258 }
0 commit comments