Skip to content

Commit 4b94e69

Browse files
committed
Clean up protocol requirements: remove redundant overloads and defaults
Move convenience methods (callAsFunction, bulk wrappers, short-parameter versions) from protocol requirements to extensions across Tokenizer, TokenizingModel, Normalizer, Decoder, PostProcessor, and PreTokenizer. Derive bosTokenId/eosTokenId/unknownTokenId from token strings on the Tokenizer protocol. Remove redundant default parameter values from all concrete implementations.
1 parent dbf45b1 commit 4b94e69

File tree

5 files changed: +48 additions, −143 deletions

Sources/Tokenizers/Decoder.swift

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@ public protocol Decoder {
1414
/// - Returns: An array of decoded text components
1515
func decode(tokens: [String]) -> [String]
1616

17-
/// Function call syntax for token decoding.
18-
///
19-
/// - Parameter tokens: The token strings to decode
20-
/// - Returns: An array of decoded text components
21-
func callAsFunction(tokens: [String]) -> [String]
22-
2317
/// Initializes the decoder from configuration.
2418
///
2519
/// - Parameter config: The configuration for this decoder

Sources/Tokenizers/Normalizer.swift

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@ public protocol Normalizer {
1414
/// - Returns: The normalized text
1515
func normalize(text: String) -> String
1616

17-
/// Function call syntax for text normalization.
18-
///
19-
/// - Parameter text: The input text to normalize
20-
/// - Returns: The normalized text
21-
func callAsFunction(text: String) -> String
22-
2317
/// Initializes the normalizer from configuration.
2418
///
2519
/// - Parameter config: The configuration for this normalizer

Sources/Tokenizers/PostProcessor.swift

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,6 @@ public protocol PostProcessor {
1717
/// - Returns: The post-processed token sequence
1818
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String]
1919

20-
/// Function call syntax for post-processing.
21-
///
22-
/// - Parameters:
23-
/// - tokens: The primary sequence of tokens to process
24-
/// - tokensPair: An optional secondary sequence
25-
/// - addSpecialTokens: Whether to add special tokens
26-
/// - Returns: The post-processed token sequence
27-
func callAsFunction(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String]
28-
2920
/// Initializes the post-processor from configuration.
3021
///
3122
/// - Parameter config: The configuration for this post-processor
@@ -34,6 +25,11 @@ public protocol PostProcessor {
3425
}
3526

3627
extension PostProcessor {
28+
/// Convenience with default parameter values for the protocol requirement.
29+
func postProcess(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] {
30+
postProcess(tokens: tokens, tokensPair: tokensPair, addSpecialTokens: addSpecialTokens)
31+
}
32+
3733
func callAsFunction(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] {
3834
postProcess(tokens: tokens, tokensPair: tokensPair, addSpecialTokens: addSpecialTokens)
3935
}
@@ -79,7 +75,7 @@ class TemplateProcessing: PostProcessor {
7975
self.pair = pair
8076
}
8177

82-
func postProcess(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] {
78+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
8379
let config = tokensPair == nil ? single : pair
8480

8581
var toReturn: [String] = []
@@ -100,7 +96,7 @@ class TemplateProcessing: PostProcessor {
10096

10197
class ByteLevelPostProcessor: PostProcessor {
10298
required init(config: Config) {}
103-
func postProcess(tokens: [String], tokensPair: [String]? = nil, addSpecialTokens: Bool = true) -> [String] { tokens }
99+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] { tokens }
104100
}
105101

106102
class RobertaProcessing: PostProcessor {
@@ -124,7 +120,7 @@ class RobertaProcessing: PostProcessor {
124120
addPrefixSpace = config.addPrefixSpace.boolean(or: true)
125121
}
126122

127-
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool = true) -> [String] {
123+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
128124
var outTokens = tokens
129125
var tokensPair = tokensPair
130126
if trimOffset {
@@ -183,7 +179,7 @@ class BertProcessing: PostProcessor {
183179
self.cls = cls
184180
}
185181

186-
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool = true) -> [String] {
182+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
187183
guard addSpecialTokens else { return tokens + (tokensPair ?? []) }
188184

189185
var outTokens = [cls.1] + tokens + [sep.1]
@@ -206,7 +202,7 @@ class SequenceProcessing: PostProcessor {
206202
processors = try processorConfigs.compactMap { try PostProcessorFactory.fromConfig(config: $0) }
207203
}
208204

209-
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool = true) -> [String] {
205+
func postProcess(tokens: [String], tokensPair: [String]?, addSpecialTokens: Bool) -> [String] {
210206
var currentTokens = tokens
211207
var currentTokensPair = tokensPair
212208

Sources/Tokenizers/PreTokenizer.swift

Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,6 @@ public protocol PreTokenizer {
2828
/// - Returns: An array of pre-tokenized text chunks
2929
func preTokenize(text: String, options: PreTokenizerOptions) -> [String]
3030

31-
/// Pre-tokenizes multiple text strings.
32-
///
33-
/// - Parameters:
34-
/// - texts: The input texts to pre-tokenize
35-
/// - options: Options controlling the pre-tokenization behavior
36-
/// - Returns: An array of pre-tokenized text chunks from all inputs
37-
func preTokenize(texts: [String], options: PreTokenizerOptions) -> [String]
38-
39-
/// Function call syntax for pre-tokenizing multiple texts.
40-
///
41-
/// - Parameters:
42-
/// - texts: The input texts to pre-tokenize
43-
/// - options: Options controlling the pre-tokenization behavior
44-
/// - Returns: An array of pre-tokenized text chunks
45-
func callAsFunction(texts: [String], options: PreTokenizerOptions) -> [String]
46-
47-
/// Function call syntax for pre-tokenizing a single text.
48-
///
49-
/// - Parameters:
50-
/// - text: The input text to pre-tokenize
51-
/// - options: Options controlling the pre-tokenization behavior
52-
/// - Returns: An array of pre-tokenized text chunks
53-
func callAsFunction(text: String, options: PreTokenizerOptions) -> [String]
54-
5531
/// Initializes the pre-tokenizer from configuration.
5632
///
5733
/// - Parameter config: The configuration for this pre-tokenizer
@@ -60,6 +36,11 @@ public protocol PreTokenizer {
6036
}
6137

6238
extension PreTokenizer {
39+
/// Convenience with default parameter values for the protocol requirement.
40+
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
41+
preTokenize(text: text, options: options)
42+
}
43+
6344
func preTokenize(texts: [String], options: PreTokenizerOptions = [.firstSection]) -> [String] {
6445
texts.flatMap { preTokenize(text: $0, options: options) }
6546
}
@@ -114,7 +95,7 @@ class BertPreTokenizer: PreTokenizer {
11495
re = "[^\\s\(punctuationRegex)]+|[\(punctuationRegex)]"
11596
}
11697

117-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
98+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
11899
text.ranges(of: re).map { String(text[$0]) }
119100
}
120101
}
@@ -129,7 +110,7 @@ class PreTokenizerSequence: PreTokenizer {
129110
preTokenizers = try configs.compactMap { try PreTokenizerFactory.fromConfig(config: $0) }
130111
}
131112

132-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
113+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
133114
preTokenizers.reduce([text]) { current, preTokenizer in
134115
preTokenizer(texts: current, options: options)
135116
}
@@ -143,7 +124,7 @@ class WhitespacePreTokenizer: PreTokenizer {
143124
re = #"\S+"#
144125
}
145126

146-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
127+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
147128
text.ranges(of: re).map { String(text[$0]) }
148129
}
149130
}
@@ -183,7 +164,7 @@ class MetaspacePreTokenizer: PreTokenizer {
183164

184165
/// https://github.com/huggingface/tokenizers/blob/accd0650b802f2180df40ef1def3bce32156688e/tokenizers/src/pre_tokenizers/metaspace.rs#L114
185166
/// https://github.com/xenova/transformers.js/blob/b07336d8f7ff57453cc164cc68aead2a79cbd57e/src/tokenizers.js#L2153
186-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
167+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
187168
let normalized = text.replacingOccurrences(of: " ", with: stringReplacement)
188169

189170
// We add a prefix space if:
@@ -222,7 +203,7 @@ class ByteLevelPreTokenizer: PreTokenizer {
222203
useRegex = config.useRegex.boolean(or: true)
223204
}
224205

225-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
206+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
226207
// Split on whitespace and punctuation
227208
let tokens = useRegex ? text.ranges(of: RE).map { String(text[$0]) } : [text]
228209
return tokens.map { token in
@@ -243,7 +224,7 @@ class PunctuationPreTokenizer: PreTokenizer {
243224
re = "[^\(punctuationRegex)]+|[\(punctuationRegex)]+"
244225
}
245226

246-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
227+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
247228
// Ref: https://github.com/xenova/transformers.js/blob/27920d84831e323275b38f0b5186644b7936e1a2/src/tokenizers.js#L1138
248229
text.ranges(of: re).map { String(text[$0]) }
249230
}
@@ -257,7 +238,7 @@ class DigitsPreTokenizer: PreTokenizer {
257238
re = "[^\\d]+|\\d\(individualDigits ? "" : "+")"
258239
}
259240

260-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
241+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
261242
text.ranges(of: re).map { String(text[$0]) }
262243
}
263244
}
@@ -271,7 +252,7 @@ class SplitPreTokenizer: PreTokenizer {
271252
invert = config.invert.boolean(or: false)
272253
}
273254

274-
func preTokenize(text: String, options: PreTokenizerOptions = [.firstSection]) -> [String] {
255+
func preTokenize(text: String, options: PreTokenizerOptions) -> [String] {
275256
guard let pattern else { return [text] }
276257
return pattern.split(text, invert: invert)
277258
}

0 commit comments

Comments (0)