Skip to content

Commit 4b94e69

Browse files
committed
Clean up protocol requirements: remove redundant overloads and defaults
Move convenience methods (callAsFunction, bulk wrappers, short-parameter versions) from protocol requirements to extensions across Tokenizer, TokenizingModel, Normalizer, Decoder, PostProcessor, and PreTokenizer. Derive bosTokenId/eosTokenId/unknownTokenId from token strings on the Tokenizer protocol. Remove redundant default parameter values from all concrete implementations.
1 parent dbf45b1 commit 4b94e69

File tree

5 files changed: +48 additions, −143 deletions

Sources/Tokenizers/Decoder.swift

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@ public protocol Decoder {
1414
/// - Returns: An array of decoded text components
1515
func decode(tokens: [String]) -> [String]
1616

17-
/// Function call syntax for token decoding.
18-
///
19-
/// - Parameter tokens: The token strings to decode
20-
/// - Returns: An array of decoded text components
21-
func callAsFunction(tokens: [String]) -> [String]
22-
2317
/// Initializes the decoder from configuration.
2418
///
2519
/// - Parameter config: The configuration for this decoder

Sources/Tokenizers/Normalizer.swift

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@ public protocol Normalizer {
1414
/// - Returns: The normalized text
1515
func normalize(text: String) -> String
1616

17-
/// Function call syntax for text normalization.
18-
///
19-
/// - Parameter text: The input text to normalize
20-
/// - Returns: The normalized text
21-
func callAsFunction(text: String) -> String
22-
2317
/// Initializes the normalizer from configuration.
2418
///
2519
/// - Parameter config: The configuration for this normalizer

Sources/Tokenizers/PostProcessor.swift

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,6 @@ public protocol PostProcessor {
1717
/// - Returns: The post-processed token sequence
1818
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String]
1919

20-
/// Function call syntax for post-processing.
21-
///
22-
/// - Parameters:
23-
/// - tokens: The primary sequence of tokens to process
24-
/// - tokensPair: An optional secondary sequence
25-
/// - addSpecialTokens: Whether to add special tokens
26-
/// - Returns: The post-processed token sequence
27-
func callAsFunction(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String]
28-
2920
/// Initializes the post-processor from configuration.
3021
///
3122
/// - Parameter config: The configuration for this post-processor
@@ -34,6 +25,11 @@ public protocol PostProcessor {
3425
}
3526

3627
extension PostProcessor {
28+
/// Convenience with default parameter values for the protocol requirement.
29+
func postProcess(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] {
30+
postProcess(tokens: tokens, tokensPair: tokensPair, addSpecialTokens: addSpecialTokens)
31+
}
32+
3733
func callAsFunction(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] {
3834
postProcess(tokens: tokens, tokensPair: tokensPair, addSpecialTokens: addSpecialTokens)
3935
}
@@ -79,7 +75,7 @@ class TemplateProcessing: PostProcessor {
7975
self.pair = pair
8076
}
8177

82-
func postProcess(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] {
78+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
8379
let config = tokensPair == nil ? single : pair
8480

8581
var toReturn: [String] = []
@@ -100,7 +96,7 @@ class TemplateProcessing: PostProcessor {
10096

10197
class ByteLevelPostProcessor: PostProcessor {
10298
required init(config: Config) {}
103-
func postProcess(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] { tokens }
99+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] { tokens }
104100
}
105101

106102
class RobertaProcessing: PostProcessor {
@@ -124,7 +120,7 @@ class RobertaProcessing: PostProcessor {
124120
addPrefixSpace = config.addPrefixSpace.boolean(or: true)
125121
}
126122

127-
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool = true) -> [String] {
123+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
128124
var outTokens = tokens
129125
var tokensPair = tokensPair
130126
if trimOffset {
@@ -183,7 +179,7 @@ class BertProcessing: PostProcessor {
183179
self.cls = cls
184180
}
185181

186-
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool = true) -> [String] {
182+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
187183
guard addSpecialTokens else { return tokens + (tokensPair ?? []) }
188184

189185
var outTokens = [cls.1] + tokens + [sep.1]
@@ -206,7 +202,7 @@ class SequenceProcessing: PostProcessor {
206202
processors = try processorConfigs.compactMap { try PostProcessorFactory.fromConfig(config: $0) }
207203
}
208204

209-
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool = true) -> [String] {
205+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
210206
var currentTokens = tokens
211207
var currentTokensPair = tokensPair
212208

Sources/Tokenizers/PreTokenizer.swift

Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,6 @@ public protocol PreTokenizer {
2828
/// - Returns: An array of pre-tokenized text chunks
2929
func preTokenize(text: String, options: PreTokenizerOptions) -> [String]
3030

31-
/// Pre-tokenizes multiple text strings.
32-
///
33-
/// - Parameters:
34-
/// - texts: The input texts to pre-tokenize
35-
/// - options: Options controlling the pre-tokenization behavior
36-
/// - Returns: An array of pre-tokenized text chunks from all inputs
37-
func preTokenize(texts: [String], options: PreTokenizerOptions) -> [String]
38-
39-
/// Function call syntax for pre-tokenizing multiple texts.
40-
///
41-
/// - Parameters:
42-
/// - texts: The input texts to pre-tokenize
43-
/// - options: Options controlling the pre-tokenization behavior
44-
/// - Returns: An array of pre-tokenized text chunks
45-
func callAsFunction(texts: [String], options: PreTokenizerOptions) -> [String]
46-
47-
/// Function call syntax for pre-tokenizing a single text.
48-
///
49-
/// - Parameters:
50-
/// - text: The input text to pre-tokenize
51-
/// - options: Options controlling the pre-tokenization behavior
52-
/// - Returns: An array of pre-tokenized text chunks
53-
func callAsFunction(text: String, options: PreTokenizerOptions) -> [String]
54-
5531
/// Initializes the pre-tokenizer from configuration.
5632
///
5733
/// - Parameter config: The configuration for this pre-tokenizer
@@ -60,6 +36,11 @@ public protocol PreTokenizer {
6036
}
6137

6238
extension PreTokenizer {
39+
/// Convenience with default parameter values for the protocol requirement.
40+
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
41+
preTokenize(text: text, options: options)
42+
}
43+
6344
func preTokenize(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
6445
texts.flatMap { preTokenize(text: $0, options: options) }
6546
}
@@ -114,7 +95,7 @@ class BertPreTokenizer: PreTokenizer {
11495
re = "[^\\s\(punctuationRegex)]+|[\(punctuationRegex)]"
11596
}
11697

117-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
98+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
11899
text.ranges(of: re).map { String(text[$0]) }
119100
}
120101
}
@@ -129,7 +110,7 @@ class PreTokenizerSequence: PreTokenizer {
129110
preTokenizers = try configs.compactMap { try PreTokenizerFactory.fromConfig(config: $0) }
130111
}
131112

132-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
113+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
133114
preTokenizers.reduce([text]) { current, preTokenizer in
134115
preTokenizer(texts: current, options: options)
135116
}
@@ -143,7 +124,7 @@ class WhitespacePreTokenizer: PreTokenizer {
143124
re = #"\S+"#
144125
}
145126

146-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
127+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
147128
text.ranges(of: re).map { String(text[$0]) }
148129
}
149130
}
@@ -183,7 +164,7 @@ class MetaspacePreTokenizer: PreTokenizer {
183164

184165
/// https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
185166
/// https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
186-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
167+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
187168
let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)
188169

189170
// We add a prefix space if:
@@ -222,7 +203,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
222203
useRegex = config.useRegex.boolean(or: true)
223204
}
224205

225-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
206+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
226207
// Split on whitespace and punctuation
227208
let tokens = useRegex ? text.ranges(of: RE).map { String(text[$0]) } : [text]
228209
return tokens.map { token in
@@ -243,7 +224,7 @@ class PunctuationPreTokenizer: PreTokenizer {
243224
re = "[^\(punctuationRegex)]+|[\(punctuationRegex)]+"
244225
}
245226

246-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
227+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
247228
// Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
248229
text.ranges(of: re).map { String(text[$0]) }
249230
}
@@ -257,7 +238,7 @@ class DigitsPreTokenizer: PreTokenizer {
257238
re = "[^\\d]+|\\d\(individualDigits ? "" : "+")"
258239
}
259240

260-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
241+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
261242
text.ranges(of: re).map { String(text[$0]) }
262243
}
263244
}
@@ -271,7 +252,7 @@ class SplitPreTokenizer: PreTokenizer {
271252
invert = config.invert.boolean(or: false)
272253
}
273254

274-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
255+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
275256
guard let pattern else { return [text] }
276257
return pattern.split(text, invert: invert)
277258
}

0 commit comments

Comments (0)