Fix typing with onnxruntime 1.22

istupakov · istupakov · commit 52280315b789 · 2025-05-10T04:41:47.000Z
diff --git a/src/onnx_asr/models/gigaam.py b/src/onnx_asr/models/gigaam.py
@@ -7,7 +7,7 @@
 import onnxruntime as rt
 
 from onnx_asr.asr import _AsrWithCtcDecoding, _AsrWithDecoding, _AsrWithTransducerDecoding
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array, is_int32_array
 
 
 class _GigaamV2(_AsrWithDecoding):
@@ -48,6 +48,7 @@ def _encode(
         self, features: npt.NDArray[np.float32], features_lens: npt.NDArray[np.int64]
     ) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.int64]]:
         (log_probs,) = self._model.run(["log_probs"], {"features": features, "feature_lengths": features_lens})
+        assert is_float32_array(log_probs)
         return log_probs, (features_lens - 1) // self._subsampling_factor + 1
 
 
@@ -91,6 +92,7 @@ def _encode(
         encoder_out, encoder_out_lens = self._encoder.run(
             ["encoded", "encoded_len"], {"audio_signal": features, "length": features_lens}
         )
+        assert is_float32_array(encoder_out) and is_int32_array(encoder_out_lens)
         return encoder_out, encoder_out_lens.astype(np.int64)
 
     def _create_state(self) -> _STATE_TYPE:
@@ -102,8 +104,10 @@ def _create_state(self) -> _STATE_TYPE:
     def _decode(
         self, prev_tokens: list[int], prev_state: _STATE_TYPE, encoder_out: npt.NDArray[np.float32]
     ) -> tuple[npt.NDArray[np.float32], int, _STATE_TYPE]:
-        decoder_out, *state = self._decoder.run(
+        decoder_out, state1, state2 = self._decoder.run(
             ["dec", "h", "c"], {"x": [[[self._blank_idx, *prev_tokens][-1]]], "h.1": prev_state[0], "c.1": prev_state[1]}
         )
+        assert is_float32_array(decoder_out) and is_float32_array(state1) and is_float32_array(state2)
         (joint,) = self._joiner.run(["joint"], {"enc": encoder_out[None, :, None], "dec": decoder_out.transpose(0, 2, 1)})
-        return np.squeeze(joint), -1, tuple(state)
+        assert is_float32_array(joint)
+        return np.squeeze(joint), -1, (state1, state2)
diff --git a/src/onnx_asr/models/kaldi.py b/src/onnx_asr/models/kaldi.py
@@ -7,7 +7,7 @@
 import onnxruntime as rt
 
 from onnx_asr.asr import _AsrWithTransducerDecoding
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array, is_int64_array
 
 _STATE_TYPE = dict[tuple[int, ...], npt.NDArray[np.float32]]
 
@@ -59,6 +59,7 @@ def _encode(
         encoder_out, encoder_out_lens = self._encoder.run(
             ["encoder_out", "encoder_out_lens"], {"x": features, "x_lens": features_lens}
         )
+        assert is_float32_array(encoder_out) and is_int64_array(encoder_out_lens)
         return encoder_out.transpose(0, 2, 1), encoder_out_lens
 
     def _create_state(self) -> _STATE_TYPE:
@@ -68,7 +69,9 @@ def _decode(
         self, prev_tokens: list[int], prev_state: _STATE_TYPE, encoder_out: npt.NDArray[np.float32]
     ) -> tuple[npt.NDArray[np.float32], int, _STATE_TYPE]:
         (decoder_out,) = self._decoder.run(["decoder_out"], {"y": [[-1, self._blank_idx, *prev_tokens][-self.CONTEXT_SIZE :]]})
+        assert is_float32_array(decoder_out)
         (logit,) = self._joiner.run(["logit"], {"encoder_out": encoder_out[None, :], "decoder_out": decoder_out})
+        assert is_float32_array(logit)
         return np.squeeze(logit), -1, prev_state
 
 
@@ -82,8 +85,10 @@ def _decode(
 
         decoder_out = prev_state.get(context)
         if decoder_out is None:
-            (decoder_out,) = self._decoder.run(["decoder_out"], {"y": [context]})
-            prev_state[context] = decoder_out
+            (_decoder_out,) = self._decoder.run(["decoder_out"], {"y": [context]})
+            assert is_float32_array(_decoder_out)
+            prev_state[context] = (decoder_out := _decoder_out)
 
         (logit,) = self._joiner.run(["logit"], {"encoder_out": encoder_out[None, :], "decoder_out": decoder_out})
+        assert is_float32_array(logit)
         return np.squeeze(logit), -1, prev_state
diff --git a/src/onnx_asr/models/nemo.py b/src/onnx_asr/models/nemo.py
@@ -7,7 +7,7 @@
 import onnxruntime as rt
 
 from onnx_asr.asr import _AsrWithCtcDecoding, _AsrWithDecoding, _AsrWithTransducerDecoding
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array
 
 
 class _NemoConformer(_AsrWithDecoding):
@@ -47,6 +47,7 @@ def _encode(
         self, features: npt.NDArray[np.float32], features_lens: npt.NDArray[np.int64]
     ) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.int64]]:
         (logprobs,) = self._model.run(["logprobs"], {"audio_signal": features, "length": features_lens})
+        assert is_float32_array(logprobs)
         return logprobs, (features_lens - 1) // self._subsampling_factor + 1
 
 
@@ -86,7 +87,7 @@ def _encode(
         encoder_out, encoder_out_lens = self._encoder.run(
             ["outputs", "encoded_lengths"], {"audio_signal": features, "length": features_lens}
         )
-        return encoder_out, encoder_out_lens
+        return encoder_out, encoder_out_lens  # type: ignore
 
     def _create_state(self) -> _STATE_TYPE:
         shapes = {x.name: x.shape for x in self._decoder_joint.get_inputs()}
@@ -98,7 +99,7 @@ def _create_state(self) -> _STATE_TYPE:
     def _decode(
         self, prev_tokens: list[int], prev_state: _STATE_TYPE, encoder_out: npt.NDArray[np.float32]
     ) -> tuple[npt.NDArray[np.float32], int, _STATE_TYPE]:
-        outputs, *state = self._decoder_joint.run(
+        outputs, state1, state2 = self._decoder_joint.run(
             ["outputs", "output_states_1", "output_states_2"],
             {
                 "encoder_outputs": encoder_out[None, :, None],
@@ -108,7 +109,8 @@ def _decode(
                 "input_states_2": prev_state[1],
             },
         )
-        return np.squeeze(outputs), -1, tuple(state)
+        assert is_float32_array(outputs) and is_float32_array(state1) and is_float32_array(state2)
+        return np.squeeze(outputs), -1, (state1, state2)
 
 
 class NemoConformerTdt(NemoConformerRnnt):
diff --git a/src/onnx_asr/models/pyannote.py b/src/onnx_asr/models/pyannote.py
@@ -1,13 +1,12 @@
 """PyAnnote VAD implementation."""
 
-import typing
 from pathlib import Path
 
 import numpy as np
 import numpy.typing as npt
 import onnxruntime as rt
 
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array
 from onnx_asr.vad import Vad
 
 
@@ -31,4 +30,5 @@ def _get_model_files(quantization: str | None = None) -> dict[str, str]:
 
     def _encode(self, waveforms: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
         (logits,) = self._model.run(["logits"], {"input_values": waveforms[:, None]})
-        return typing.cast(npt.NDArray[np.float32], logits)
+        assert is_float32_array(logits)
+        return logits
diff --git a/src/onnx_asr/models/silero.py b/src/onnx_asr/models/silero.py
@@ -1,6 +1,5 @@
 """Silero VAD implementation."""
 
-import typing
 from collections.abc import Iterable, Iterator
 from itertools import chain
 from pathlib import Path
@@ -9,7 +8,7 @@
 import numpy.typing as npt
 import onnxruntime as rt
 
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array
 from onnx_asr.vad import Vad
 
 
@@ -44,8 +43,10 @@ def _encode(self, waveforms: npt.NDArray[np.float32]) -> Iterator[npt.NDArray[np
 
         def process(frame: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
             nonlocal state
-            output, state = self._model.run(["output", "stateN"], {"input": frame, "state": state, "sr": [self.SAMPLE_RATE]})
-            return typing.cast(npt.NDArray[np.float32], output[:, 0])
+            output, new_state = self._model.run(["output", "stateN"], {"input": frame, "state": state, "sr": [self.SAMPLE_RATE]})
+            assert is_float32_array(output) and is_float32_array(new_state)
+            state = new_state
+            return output[:, 0]
 
         yield process(np.pad(waveforms[:, : self.HOP_SIZE], ((0, 0), (self.CONTEXT_SIZE, 0))))
 
diff --git a/src/onnx_asr/models/whisper.py b/src/onnx_asr/models/whisper.py
@@ -11,7 +11,7 @@
 import onnxruntime as rt
 
 from onnx_asr.asr import Asr, TimestampedResult
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array, is_int32_array
 
 
 @typing.no_type_check
@@ -129,7 +129,8 @@ def _decoding(
                 "decoder_input_ids": tokens.astype(np.int32),
             },
         )
-        return typing.cast(npt.NDArray[np.int32], sequences)[:, 0, :].astype(np.int64)
+        assert is_int32_array(sequences)
+        return sequences[:, 0, :].astype(np.int64)
 
 
 class WhisperHf(_Whisper):
@@ -162,11 +163,13 @@ def _preprocessor_name(self) -> str:
     def _encode(self, waveforms: npt.NDArray[np.float32], waveforms_len: npt.NDArray[np.int64]) -> npt.NDArray[np.float32]:
         input_features = super()._encode(waveforms, waveforms_len)
         (last_hidden_state,) = self._encoder.run(["last_hidden_state"], {"input_features": input_features})
-        return typing.cast(npt.NDArray[np.float32], last_hidden_state)
+        assert is_float32_array(last_hidden_state)
+        return last_hidden_state
 
     def _decode(self, tokens: npt.NDArray[np.int64], encoder_out: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
         (logits,) = self._decoder.run(["logits"], {"input_ids": tokens, "encoder_hidden_states": encoder_out})
-        return typing.cast(npt.NDArray[np.float32], logits)
+        assert is_float32_array(logits)
+        return logits
 
     def _decoding(
         self, input_features: npt.NDArray[np.float32], tokens: npt.NDArray[np.int64], max_length: int = 448
diff --git a/src/onnx_asr/preprocessors/preprocessor.py b/src/onnx_asr/preprocessors/preprocessor.py
@@ -7,7 +7,7 @@
 import numpy.typing as npt
 import onnxruntime as rt
 
-from onnx_asr.utils import OnnxSessionOptions
+from onnx_asr.utils import OnnxSessionOptions, is_float32_array, is_int64_array
 
 
 class Preprocessor:
@@ -33,4 +33,5 @@ def __call__(
         features, features_lens = self._preprocessor.run(
             ["features", "features_lens"], {"waveforms": waveforms, "waveforms_lens": waveforms_lens}
         )
+        assert is_float32_array(features) and is_int64_array(features_lens)
         return features, features_lens
diff --git a/src/onnx_asr/preprocessors/resampler.py b/src/onnx_asr/preprocessors/resampler.py
@@ -6,7 +6,7 @@
 import numpy.typing as npt
 import onnxruntime as rt
 
-from onnx_asr.utils import OnnxSessionOptions, SampleRates
+from onnx_asr.utils import OnnxSessionOptions, SampleRates, is_float32_array, is_int64_array
 
 
 class Resampler:
@@ -27,9 +27,12 @@ def __call__(
         self, waveforms: npt.NDArray[np.float32], waveforms_lens: npt.NDArray[np.int64], sample_rate: SampleRates
     ) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.int64]]:
         """Resample waveform to 16 kHz."""
-        if sample_rate != 16_000:
-            waveforms, waveforms_lens = self._preprocessor.run(
-                ["resampled", "resampled_lens"],
-                {"waveforms": waveforms, "waveforms_lens": waveforms_lens, "sample_rate": [sample_rate]},
-            )
-        return waveforms, waveforms_lens
+        if sample_rate == 16_000:
+            return waveforms, waveforms_lens
+
+        resampled, resampled_lens = self._preprocessor.run(
+            ["resampled", "resampled_lens"],
+            {"waveforms": waveforms, "waveforms_lens": waveforms_lens, "sample_rate": [sample_rate]},
+        )
+        assert is_float32_array(resampled) and is_int64_array(resampled_lens)
+        return resampled, resampled_lens
diff --git a/src/onnx_asr/utils.py b/src/onnx_asr/utils.py
@@ -16,6 +16,21 @@ def is_supported_sample_rate(sample_rate: int) -> TypeGuard[SampleRates]:
     return sample_rate in get_args(SampleRates)
 
 
+def is_float32_array(x: object) -> TypeGuard[npt.NDArray[np.float32]]:
+    """Return True when ``x`` is a ``np.ndarray`` whose dtype equals ``np.float32`` (type guard)."""
+    return isinstance(x, np.ndarray) and x.dtype == np.float32
+
+
+def is_int32_array(x: object) -> TypeGuard[npt.NDArray[np.int32]]:
+    """Return True when ``x`` is a ``np.ndarray`` whose dtype equals ``np.int32`` (type guard)."""
+    return isinstance(x, np.ndarray) and x.dtype == np.int32
+
+
+def is_int64_array(x: object) -> TypeGuard[npt.NDArray[np.int64]]:
+    """Return True when ``x`` is a ``np.ndarray`` whose dtype equals ``np.int64`` (type guard)."""
+    return isinstance(x, np.ndarray) and x.dtype == np.int64
+
+
 class SupportedOnlyMonoAudioError(ValueError):
     """Supported only mono audio error."""