add audio_metrics module and snr to metrax

jshin1394 · jshin1394 · commit d63cd215e92a · 2025-05-28T20:01:59.000Z
diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ pytest
 rouge-score
 scikit-learn
 tensorflow
+torchmetrics
diff --git a/src/metrax/__init__.py b/src/metrax/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from metrax import audio_metrics
 from metrax import base
 from metrax import classification_metrics
 from metrax import image_metrics
@@ -42,6 +43,7 @@
 RecallAtK = ranking_metrics.RecallAtK
 RougeL = nlp_metrics.RougeL
 RougeN = nlp_metrics.RougeN
+SNR = audio_metrics.SNR
 SSIM = image_metrics.SSIM
 WER = nlp_metrics.WER
 
@@ -70,6 +72,7 @@
     "RecallAtK",
     "RougeL",
     "RougeN",
+    "SNR",
     "SSIM",
     "WER",
 ]
diff --git a/src/metrax/audio_metrics.py b/src/metrax/audio_metrics.py
@@ -0,0 +1,116 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A collection of different metrics for audio models."""
+
+import flax
+import jax
+import jax.numpy as jnp
+from metrax import base
+
+
+@flax.struct.dataclass
+class SNR(base.Average):
+  r"""SNR (Signal-to-Noise Ratio) Metric for audio.
+
+  This class calculates the Signal-to-Noise Ratio (SNR) in decibels (dB)
+  between a predicted audio signal and a ground truth audio signal,
+  and averages it over a dataset.
+
+  The SNR is defined as:
+
+  .. math::
+
+      SNR_{dB} = 10 \cdot \log_{10} \left( \frac{P_{signal}}{P_{noise}}
+      \right)
+
+  Where:
+    - :math:`P_{signal}` is the power of the ground truth signal (`target`).
+      By default (`zero_mean=False`), this is the mean of the squared `target`
+      values.
+      If `zero_mean=True`, it's the variance of the `target` values.
+    - :math:`P_{noise}` is the power of the noise component, which is defined as
+      the difference between the `target` and `preds` (`target - preds`).
+      By default (`zero_mean=False`), this is the mean of the squared noise
+      values.
+      If `zero_mean=True`, it's the variance of the noise values.
+  """
+
+  @staticmethod
+  def _calculate_snr(
+      preds: jax.Array,
+      target: jax.Array,
+      zero_mean: bool = False,
+  ) -> jax.Array:
+    """Computes SNR (Signal-to-Noise Ratio) values for a batch of audio signals.
+
+    Args:
+        preds: The estimated or predicted audio signal. JAX Array.
+        target: The ground truth audio signal. JAX Array.
+        zero_mean: If True, subtracts the mean from the signal and noise before
+          calculating their respective powers. Defaults to False.
+
+    Returns:
+        A JAX array of shape `preds.shape[:-1]` holding the SNR in decibels
+        (dB) of each signal; powers are summed over the last axis.
+    """
+    if preds.shape != target.shape:
+      raise ValueError(
+          f'Input signals must have the same shape, but got {preds.shape} and'
+          f' {target.shape}'
+      )
+
+    target_processed, preds_processed = jax.lax.cond(
+        zero_mean,
+        lambda t, p: (
+            t - jnp.mean(t, axis=-1, keepdims=True),
+            p - jnp.mean(p, axis=-1, keepdims=True),
+        ),
+        lambda t, p: (t, p),
+        target,
+        preds,
+    )
+    noise = target_processed - preds_processed
+    eps = jnp.finfo(preds.dtype).eps
+    signal_power = jnp.sum(target_processed**2, axis=-1) + eps
+    noise_power = jnp.sum(noise**2, axis=-1) + eps
+
+    snr = 10 * jnp.log10(base.divide_no_nan(signal_power, noise_power))
+    return snr
+
+  @classmethod
+  def from_model_output(
+      cls,
+      predictions: jax.Array,
+      targets: jax.Array,
+      zero_mean: bool = False,
+  ) -> 'SNR':
+    """Computes SNR for a batch of audio signals and creates an SNR metric instance.
+
+    Args:
+        predictions: A JAX array of predicted audio signals.
+        targets: A JAX array of ground truth audio signals.
+        zero_mean: If True, subtracts the mean from the signal and noise before
+          calculating their respective powers.
+
+    Returns:
+        An SNR instance containing the SNR value for the current batch,
+        ready for averaging.
+    """
+    batch_snr_value = cls._calculate_snr(
+        predictions,
+        targets,
+        zero_mean=zero_mean,
+    )
+    return super().from_model_output(values=batch_snr_value)
diff --git a/src/metrax/audio_metrics_test.py b/src/metrax/audio_metrics_test.py
@@ -0,0 +1,137 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for metrax audio metrics."""
+
+import os
+
+os.environ['KERAS_BACKEND'] = 'jax'
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import jax.numpy as jnp
+import metrax
+import numpy as np
+import torch
+from torchmetrics.functional.audio import snr as tm_snr
+
+np.random.seed(42)
+
+# Simple 1D audio signal.
+AUDIO_SHAPE_1D = (1000,)
+AUDIO_TARGET_1D = np.sin(
+    np.linspace(0, 2 * np.pi * 5, AUDIO_SHAPE_1D[0])
+).astype(np.float32)
+AUDIO_PREDS_1D_NOISY = (
+    AUDIO_TARGET_1D + 0.1 * np.random.randn(*AUDIO_SHAPE_1D)
+).astype(np.float32)
+AUDIO_PREDS_1D_PERFECT = AUDIO_TARGET_1D
+# Multi-dimensional batch of signals
+AUDIO_SHAPE_2D = (4, 500)  # A batch of 4 signals with 500 samples each.
+AUDIO_TARGET_2D = (np.random.randn(*AUDIO_SHAPE_2D) * 5.0).astype(np.float32)
+AUDIO_PREDS_2D_NOISY = (
+    AUDIO_TARGET_2D + 0.5 * np.random.randn(*AUDIO_SHAPE_2D)
+).astype(np.float32)
+# Target and preds are all zeros.
+AUDIO_SHAPE_ZEROS = (100,)
+AUDIO_TARGET_ZEROS = np.zeros(AUDIO_SHAPE_ZEROS).astype(np.float32)
+AUDIO_PREDS_ZEROS = np.zeros(AUDIO_SHAPE_ZEROS).astype(np.float32)
+
+
+class AudioMetricsTest(parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      (
+          'snr_1d_noisy_false_zero_mean',
+          AUDIO_TARGET_1D,
+          AUDIO_PREDS_1D_NOISY,
+          False,
+      ),
+      (
+          'snr_1d_noisy_true_zero_mean',
+          AUDIO_TARGET_1D,
+          AUDIO_PREDS_1D_NOISY,
+          True,
+      ),
+      (
+          'snr_1d_perfect_false_zero_mean',
+          AUDIO_TARGET_1D,
+          AUDIO_PREDS_1D_PERFECT,
+          False,
+      ),
+      (
+          'snr_1d_perfect_true_zero_mean',
+          AUDIO_TARGET_1D,
+          AUDIO_PREDS_1D_PERFECT,
+          True,
+      ),
+      (
+          'snr_2d_noisy_false_zero_mean',
+          AUDIO_TARGET_2D,
+          AUDIO_PREDS_2D_NOISY,
+          False,
+      ),
+      (
+          'snr_2d_noisy_true_zero_mean',
+          AUDIO_TARGET_2D,
+          AUDIO_PREDS_2D_NOISY,
+          True,
+      ),
+      (
+          'snr_zeros_false_zero_mean',
+          AUDIO_TARGET_ZEROS,
+          AUDIO_PREDS_ZEROS,
+          False,
+      ),
+      ('snr_zeros_true_zero_mean', AUDIO_TARGET_ZEROS, AUDIO_PREDS_ZEROS, True),
+  )
+  def test_snr(
+      self,
+      target_np: np.ndarray,
+      preds_np: np.ndarray,
+      zero_mean: bool,
+  ):
+    """Tests metrax.SNR against torchmetrics.functional.audio.snr."""
+    metrax_snr_metric = metrax.SNR.from_model_output(
+        predictions=jnp.array(preds_np),
+        targets=jnp.array(target_np),
+        zero_mean=zero_mean,
+    )
+    metrax_snr_result = metrax_snr_metric.compute()
+
+    torchmetrics_snr_result = (
+        tm_snr.signal_noise_ratio(
+            preds=torch.from_numpy(preds_np),
+            target=torch.from_numpy(target_np),
+            zero_mean=zero_mean,
+        )
+        .mean()
+        .item()
+    )
+
+    np.testing.assert_allclose(
+        metrax_snr_result,
+        torchmetrics_snr_result,
+        rtol=1e-5,
+        atol=1e-5,
+        err_msg=(
+            f'SNR mismatch for zero_mean={zero_mean}.\n'
+            f'Metrax SNR: {metrax_snr_result:.8f} dB, '
+            f'Torchmetrics SNR: {torchmetrics_snr_result:.8f} dB'
+        ),
+    )
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/src/metrax/metrax_test.py b/src/metrax/metrax_test.py
@@ -55,7 +55,10 @@
     np.int32
 )
 IOU_TARGET_CLASS_IDS = np.array([0, 1])
-
+# For audio_metrics.
+AUDIO_SHAPE = (2, 16000)
+AUDIO_PREDS = np.random.randn(*AUDIO_SHAPE).astype(np.float32)
+AUDIO_TARGETS = np.random.randn(*AUDIO_SHAPE).astype(np.float32)
 
 class MetraxTest(parameterized.TestCase):
 
@@ -168,7 +171,7 @@ class MetraxTest(parameterized.TestCase):
               'targets': TARGET_IMGS,
               'max_val': MAX_IMG_VAL,
           },
-    ),
+      ),
       (
           'rmse',
           metrax.RMSE,
@@ -202,6 +205,15 @@ class MetraxTest(parameterized.TestCase):
               'max_val': MAX_IMG_VAL,
           },
       ),
+      (
+          'snr',
+          metrax.SNR,
+          {
+              'predictions': AUDIO_PREDS,
+              'targets': AUDIO_TARGETS,
+              'zero_mean': False,
+          },
+      ),
   )
   def test_metrics_jittable(self, metric, kwargs):
     """Tests that jitted metrax metric yields the same result as non-jitted metric."""
diff --git a/src/metrax/nnx/__init__.py b/src/metrax/nnx/__init__.py
@@ -37,6 +37,7 @@
 RecallAtK = nnx_metrics.RecallAtK
 RougeL = nnx_metrics.RougeL
 RougeN = nnx_metrics.RougeN
+SNR = nnx_metrics.SNR
 SSIM = nnx_metrics.SSIM
 WER = nnx_metrics.WER
 
@@ -64,6 +65,7 @@
     "RecallAtK",
     "RougeL",
     "RougeN",
+    "SNR",
     "SSIM",
     "WER",
 ]
diff --git a/src/metrax/nnx/nnx_metrics.py b/src/metrax/nnx/nnx_metrics.py
@@ -38,6 +38,7 @@ class Accuracy(NnxWrapper):
   def __init__(self):
     super().__init__(metrax.Accuracy)
 
+
 class Average(NnxWrapper):
   """An NNX class for the Metrax metric Average."""
 
@@ -178,6 +179,13 @@ def __init__(self):
     super().__init__(metrax.RSQUARED)
 
 
+class SNR(NnxWrapper):
+  """An NNX class for the Metrax metric SNR."""
+
+  def __init__(self):
+    super().__init__(metrax.SNR)
+
+
 class SSIM(NnxWrapper):
   """An NNX class for the Metrax metric SSIM."""