
Commit aeeecef

lazily load torch in coreml parser
1 parent 4a235ff commit aeeecef

File tree

3 files changed: +115 -106 lines changed


kraken/models/_coreml.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+"""
+kraken.models._coreml
+~~~~~~~~~~~~~~~~~~~~~
+
+Deserializers for CoreML layers with weights.
+"""
+import torch
+
+
+def _coreml_lin(spec):
+    weights = {}
+    for layer in spec:
+        if layer.WhichOneof('layer') == 'innerProduct':
+            name = layer.name.removesuffix('_lin')
+            lin = layer.innerProduct
+            weights[f'nn.{name}.lin.weight'] = torch.Tensor(lin.weights.floatValue).view(lin.outputChannels, lin.inputChannels)
+            weights[f'nn.{name}.lin.bias'] = torch.Tensor(lin.bias.floatValue)
+    return weights
+
+
+def _coreml_rnn(spec):
+    weights = {}
+    for layer in spec:
+        if (arch := layer.WhichOneof('layer')) in ['uniDirectionalLSTM', 'biDirectionalLSTM']:
+            rnn = getattr(layer, arch)
+            output_size = rnn.outputVectorSize
+            input_size = rnn.inputVectorSize
+            name = layer.name.removesuffix('_transposed')
+
+            def _deserialize_weights(params, direction):
+                # ih_matrix
+                weight_ih = torch.Tensor([params.inputGateWeightMatrix.floatValue,    # wi
+                                          params.forgetGateWeightMatrix.floatValue,   # wf
+                                          params.blockInputWeightMatrix.floatValue,   # wz/wg
+                                          params.outputGateWeightMatrix.floatValue])  # wo
+                # hh_matrix
+                weight_hh = torch.Tensor([params.inputGateRecursionMatrix.floatValue,    # wi
+                                          params.forgetGateRecursionMatrix.floatValue,   # wf
+                                          params.blockInputRecursionMatrix.floatValue,   # wz/wg
+                                          params.outputGateRecursionMatrix.floatValue])  # wo
+                weights[f'nn.{name}.layer.weight_ih_l0{"_reverse" if direction == "bwd" else ""}'] = weight_ih.view(-1, input_size)
+                weights[f'nn.{name}.layer.weight_hh_l0{"_reverse" if direction == "bwd" else ""}'] = weight_hh.view(-1, output_size)
+                biases = torch.Tensor([params.inputGateBiasVector.floatValue,           # bi
+                                       params.forgetGateBiasVector.floatValue,          # bf
+                                       params.blockInputBiasVector.floatValue,          # bz/bg
+                                       params.outputGateBiasVector.floatValue]).view(-1)  # bo
+                weights[f'nn.{name}.layer.bias_hh_l0{"_reverse" if direction == "bwd" else ""}'] = biases
+                # no ih_biases
+                weights[f'nn.{name}.layer.bias_ih_l0{"_reverse" if direction == "bwd" else ""}'] = torch.zeros_like(biases)
+
+            fwd_params = rnn.weightParams if arch == 'uniDirectionalLSTM' else rnn.weightParams[0]
+            _deserialize_weights(fwd_params, 'fwd')
+
+            # get backward weights
+            if arch == 'biDirectionalLSTM':
+                _deserialize_weights(rnn.weightParams[1], 'bwd')
+    return weights
+
+
+def _coreml_conv(spec):
+    weights = {}
+    for layer in spec:
+        if layer.WhichOneof('layer') == 'convolution':
+            name = layer.name.removesuffix('_conv')
+            conv = layer.convolution
+            in_channels = conv.kernelChannels
+            out_channels = conv.outputChannels
+            kernel_size = conv.kernelSize
+            if conv.isDeconvolution:
+                weights[f'nn.{name}.co.weight'] = torch.Tensor(conv.weights.floatValue).view(in_channels, out_channels, *kernel_size)
+            else:
+                weights[f'nn.{name}.co.weight'] = torch.Tensor(conv.weights.floatValue).view(out_channels, in_channels, *kernel_size)
+            weights[f'nn.{name}.co.bias'] = torch.Tensor(conv.bias.floatValue)
+    return weights
+
+
+def _coreml_groupnorm(spec):
+    weights = {}
+    for layer in spec:
+        if layer.WhichOneof('layer') == 'custom' and layer.custom.className == 'groupnorm':
+            gn = layer.custom
+            in_channels = gn.parameters['in_channels'].intValue
+            weights[f'nn.{layer.name}.layer.weight'] = torch.Tensor(gn.weights[0].floatValue).view(in_channels)
+            weights[f'nn.{layer.name}.layer.bias'] = torch.Tensor(gn.weights[1].floatValue).view(in_channels)
+    return weights
+
+
+def _coreml_romlp(spec):
+    weights = {}
+    return weights
+
+
+def _coreml_wav2vec2mask(spec):
+    weights = {}
+    # extract embedding parameters
+    if len(emb := [x for x in spec if x.name.endswith('_wave2vec2_emb')]):
+        emb = emb[0].embedding
+        weights['nn._wave2vec2mask.mask_emb.weight'] = torch.Tensor(emb.weights.floatValue)
+    # extract linear projection parameters
+    if len(lin := [x for x in spec if x.name.endswith('_wave2vec2_lin')]):
+        lin = lin[0].innerProduct
+        weights['nn._wave2vec2mask.project_q.weight'] = torch.Tensor(lin.weights.floatValue).view(lin.outputChannels, lin.inputChannels)
+        weights['nn._wave2vec2mask.project_q.bias'] = torch.Tensor(lin.bias.floatValue)
+    return weights
+
+
+_coreml_parsers = [_coreml_conv, _coreml_rnn, _coreml_lin, _coreml_groupnorm,
+                   _coreml_wav2vec2mask, _coreml_romlp]
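
All of the parsers above share one contract: each takes the list of layer protobufs from a CoreML spec and returns a partial PyTorch state dict, and load_coreml() in loaders.py (below) merges their outputs before calling model.load_state_dict(). The gate stacking order in _coreml_rnn (input, forget, block input, output) matches the W_ii|W_if|W_ig|W_io layout PyTorch expects in weight_ih_l0. A minimal sketch of driving the parsers by hand, assuming coremltools is installed; the model path is a placeholder:

import coremltools as ct

from kraken.models._coreml import _coreml_parsers

# Load a CoreML model and pull out the raw layer protobufs, mirroring
# what load_coreml() does ('model.mlmodel' is a hypothetical path).
mlmodel = ct.models.MLModel('model.mlmodel')
spec = mlmodel.get_spec().neuralNetwork.layers

# Each parser only touches the layer types it recognizes and returns a
# partial state dict; the union covers all weight-bearing layers.
weights = {}
for parser in _coreml_parsers:
    weights.update(parser(spec))

print(sorted(weights))  # keys like 'nn.<name>.lin.weight', 'nn.<name>.layer.weight_ih_l0', ...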

kraken/models/configs.py

Lines changed: 3 additions & 0 deletions
@@ -270,6 +270,8 @@ class TrainingConfig(Config):
         Evaluation and checkpoint saving frequency
     checkpoint_path (PathLike, defaults to `model`):
         Path prefix to save checkpoints during training.
+    weights_format (Literal[safetensors, coreml], defaults to 'safetensors'):
+        Weight format to convert the checkpoint to at the end of training.
 
     > Optimizer configuration
 
@@ -318,6 +320,7 @@ def __init__(self, **kwargs):
         self.completed_epochs = kwargs.pop('completed_epochs', 0)
         self.freq = kwargs.pop('freq', 1.0)
         self.checkpoint_path = kwargs.pop('checkpoint_path', 'model')
+        self.weights_format = kwargs.pop('weights_format', 'safetensors')
         self.optimizer = kwargs.pop('optimizer', 'AdamW')
         self.lrate = kwargs.pop('lrate', 1e-5)
         self.momentum = kwargs.pop('momentum', 0.9)
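
Since TrainingConfig.__init__ just pops keyword arguments, the new field is set like any other option. A minimal sketch; the class name, field names, and import path are taken from the diff, the usage itself is assumed:

from kraken.models.configs import TrainingConfig

# 'weights_format' selects the format the final checkpoint is converted
# to: 'safetensors' (the default) or 'coreml'.
config = TrainingConfig(checkpoint_path='model', weights_format='coreml')
assert config.weights_format == 'coreml'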

kraken/models/loaders.py

Lines changed: 4 additions & 106 deletions
@@ -5,7 +5,6 @@
 Implementation for model metadata and weight loading from various formats.
 """
 import json
-import torch
 import logging
 import importlib
 
@@ -27,108 +26,6 @@
 __all__ = ['load_models', 'load_coreml', 'load_safetensors']
 
 
-# deserializers for coreml layers with weights
-def _coreml_lin(spec):
-    weights = {}
-    for layer in spec:
-        if layer.WhichOneof('layer') == 'innerProduct':
-            name = layer.name.removesuffix('_lin')
-            lin = layer.innerProduct
-            weights[f'nn.{name}.lin.weight'] = torch.Tensor(lin.weights.floatValue).view(lin.outputChannels, lin.inputChannels)
-            weights[f'nn.{name}.lin.bias'] = torch.Tensor(lin.bias.floatValue)
-    return weights
-
-
-def _coreml_rnn(spec):
-    weights = {}
-    for layer in spec:
-        if (arch := layer.WhichOneof('layer')) in ['uniDirectionalLSTM', 'biDirectionalLSTM']:
-            rnn = getattr(layer, arch)
-            output_size = rnn.outputVectorSize
-            input_size = rnn.inputVectorSize
-            name = layer.name.removesuffix('_transposed')
-
-            def _deserialize_weights(params, direction):
-                # ih_matrix
-                weight_ih = torch.Tensor([params.inputGateWeightMatrix.floatValue,    # wi
-                                          params.forgetGateWeightMatrix.floatValue,   # wf
-                                          params.blockInputWeightMatrix.floatValue,   # wz/wg
-                                          params.outputGateWeightMatrix.floatValue])  # wo
-                # hh_matrix
-                weight_hh = torch.Tensor([params.inputGateRecursionMatrix.floatValue,    # wi
-                                          params.forgetGateRecursionMatrix.floatValue,   # wf
-                                          params.blockInputRecursionMatrix.floatValue,   # wz/wg
-                                          params.outputGateRecursionMatrix.floatValue])  # wo
-                weights[f'nn.{name}.layer.weight_ih_l0{"_reverse" if direction == "bwd" else ""}'] = weight_ih.view(-1, input_size)
-                weights[f'nn.{name}.layer.weight_hh_l0{"_reverse" if direction == "bwd" else ""}'] = weight_hh.view(-1, output_size)
-                biases = torch.Tensor([params.inputGateBiasVector.floatValue,           # bi
-                                       params.forgetGateBiasVector.floatValue,          # bf
-                                       params.blockInputBiasVector.floatValue,          # bz/bg
-                                       params.outputGateBiasVector.floatValue]).view(-1)  # bo
-                weights[f'nn.{name}.layer.bias_hh_l0{"_reverse" if direction == "bwd" else ""}'] = biases
-                # no ih_biases
-                weights[f'nn.{name}.layer.bias_ih_l0{"_reverse" if direction == "bwd" else ""}'] = torch.zeros_like(biases)
-
-            fwd_params = rnn.weightParams if arch == 'uniDirectionalLSTM' else rnn.weightParams[0]
-            _deserialize_weights(fwd_params, 'fwd')
-
-            # get backward weights
-            if arch == 'biDirectionalLSTM':
-                _deserialize_weights(rnn.weightParams[1], 'bwd')
-    return weights
-
-
-def _coreml_conv(spec):
-    weights = {}
-    for layer in spec:
-        if layer.WhichOneof('layer') == 'convolution':
-            name = layer.name.removesuffix('_conv')
-            conv = layer.convolution
-            in_channels = conv.kernelChannels
-            out_channels = conv.outputChannels
-            kernel_size = conv.kernelSize
-            if conv.isDeconvolution:
-                weights[f'nn.{name}.co.weight'] = torch.Tensor(conv.weights.floatValue).view(in_channels, out_channels, *kernel_size)
-            else:
-                weights[f'nn.{name}.co.weight'] = torch.Tensor(conv.weights.floatValue).view(out_channels, in_channels, *kernel_size)
-            weights[f'nn.{name}.co.bias'] = torch.Tensor(conv.bias.floatValue)
-    return weights
-
-
-def _coreml_groupnorm(spec):
-    weights = {}
-    for layer in spec:
-        if layer.WhichOneof('layer') == 'custom' and layer.custom.className == 'groupnorm':
-            gn = layer.custom
-            in_channels = gn.parameters['in_channels'].intValue
-            weights[f'nn.{layer.name}.layer.weight'] = torch.Tensor(gn.weights[0].floatValue).view(in_channels)
-            weights[f'nn.{layer.name}.layer.bias'] = torch.Tensor(gn.weights[1].floatValue).view(in_channels)
-    return weights
-
-
-def _coreml_romlp(spec):
-    weights = {}
-    return weights
-
-
-def _coreml_wav2vec2mask(spec):
-    weights = {}
-    # extract embedding parameters
-    if len(emb := [x for x in spec if x.name.endswith('_wave2vec2_emb')]):
-        emb = emb[0].embedding
-        weights['nn._wave2vec2mask.mask_emb.weight'] = torch.Tensor(emb.weights.floatValue)
-    # extract linear projection parameters
-    if len(lin := [x for x in spec if x.name.endswith('_wave2vec2_lin')]):
-        lin = lin[0].innerProduct
-        weights['nn._wave2vec2mask.project_q.weight'] = torch.Tensor(lin.weights.floatValue).view(lin.outputChannels, lin.inputChannels)
-        weights['nn._wave2vec2mask.project_q.bias'] = torch.Tensor(lin.bias.floatValue)
-    return weights
-
-
-_coreml_parsers = [_coreml_conv, _coreml_rnn, _coreml_lin, _coreml_groupnorm,
-                   _coreml_wav2vec2mask, _coreml_romlp]
-
-
 def load_models(path: Union[str, 'PathLike'], tasks: Optional[Sequence[_T_tasks]] = None) -> list[BaseModel]:
     """
     Tries all loaders in sequence to deserialize models found in file at path.
@@ -218,16 +115,17 @@ def load_coreml(path: Union[str, PathLike], tasks: Optional[Sequence[_T_tasks]]
     # construct state dict
     weights = {}
     spec = mlmodel.get_spec().neuralNetwork.layers
+    from ._coreml import _coreml_parsers
     for cml_parser in _coreml_parsers:
         weights.update(cml_parser(spec))
 
     model.load_state_dict(weights)
 
     # construct additional models if auxiliary layers are defined.
 
-    #if 'aux_layers' in mlmodel.user_defined_metadata:
-    #    logger.info('Deserializing auxiliary layers.')
+    # if 'aux_layers' in mlmodel.user_defined_metadata:
+    #     logger.info('Deserializing auxiliary layers.')
 
-    #    nn.aux_layers = {k: cls(v).nn.get_submodule(k) for k, v in json.loads(mlmodel.user_defined_metadata['aux_layers']).items()}
+    #     nn.aux_layers = {k: cls(v).nn.get_submodule(k) for k, v in json.loads(mlmodel.user_defined_metadata['aux_layers']).items()}
 
     return [model]
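
The effect of the move is that importing the loaders module no longer pays for torch at import time; the deferred from ._coreml import _coreml_parsers inside load_coreml() only triggers the torch import when a CoreML model is actually deserialized. A rough sanity check, assuming nothing else in the module's import chain pulls in torch:

import sys

import kraken.models.loaders  # noqa: F401

# torch should only enter sys.modules once load_coreml() runs and
# executes the deferred `from ._coreml import _coreml_parsers`.
assert 'torch' not in sys.modules, 'torch was imported eagerly'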
