Merge branch 'develop' of github.com:pyannote/pyannote-audio into develop

hbredin · hbredin · commit 5a361407a834 · 2025-12-13T15:08:25.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # CHANGELOG
 
+## Version 4.0.3 (2025-12-07)
+
+- feat(cli): add `--revision` option to most CLI commands
+- feat(util): add `Calibration.safe_transform` method (supports NaNs as well as any shape)
+- fix(model): fix `Model.from_pretrained` to support `lightning` 2.6+
+- setup: update `pyannote-database` dependency to `6.1+`
+
 ## Version 4.0.2 (2025-11-19)
 
 - BREAKING(util): make `Binarize.__call__` return `string` tracks (instead of `int`) [@benniekiss](https://github.com/benniekiss/)
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,7 +16,7 @@ dependencies = [
     "opentelemetry-sdk>=1.34.0",
     "opentelemetry-exporter-otlp>=1.34.0",
     "pyannote-core>=6.0.1",
-    "pyannote-database>=6.0.0",
+    "pyannote-database>=6.1.1",
     "pyannote-metrics>=4.0.0",
     "pyannote-pipeline>=4.0.0",
     "pytorch-metric-learning>=2.8.1",
diff --git a/src/pyannote/audio/__main__.py b/src/pyannote/audio/__main__.py
@@ -40,7 +40,7 @@
 import torch
 import typer
 import yaml
-from pyannote.audio import Audio, Pipeline, Model
+from pyannote.audio import Audio, Model, Pipeline
 from pyannote.core import Annotation
 from pyannote.metrics.base import BaseMetric
 from pyannote.metrics.diarization import DiarizationErrorRate, JaccardErrorRate
@@ -285,16 +285,20 @@ def download(
             help="Pretrained pipeline (e.g. pyannote/speaker-diarization-community-1)"
         ),
     ],
-    token: Annotated[
-        str,
-        typer.Argument(
-            help="Huggingface token to be used for downloading from Huggingface hub."
+    revision: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Pretrained pipeline revision.",
         ),
-    ],
+    ] = None,
+    token: Annotated[
+        Optional[str],
+        typer.Argument(help="Huggingface token."),
+    ] = None,
     cache: Annotated[
-        Path,
+        Optional[Path],
         typer.Option(
-            help="Path to the folder where files downloaded from Huggingface hub are stored.",
+            help="Path to the folder where files downloaded from Huggingface are stored.",
             exists=True,
             dir_okay=True,
             file_okay=False,
@@ -309,7 +313,7 @@ def download(
 
     # load pretrained pipeline
     pretrained_pipeline = Pipeline.from_pretrained(
-        pipeline, token=token, cache_dir=cache
+        pipeline, revision=revision, token=token, cache_dir=cache
     )
     if pretrained_pipeline is None:
         print(f"Could not load pretrained pipeline from {pipeline}.")
@@ -335,7 +339,7 @@ def apply(
         ),
     ],
     into: Annotated[
-        Path,
+        Optional[Path],
         typer.Option(
             help="Path to file or directory where results are saved.",
             exists=False,
@@ -345,27 +349,39 @@ def apply(
             resolve_path=True,
         ),
     ] = None,
-    device: Annotated[
-        Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
-    ] = Device.AUTO,
+    revision: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Pretrained pipeline revision.",
+        ),
+    ] = None,
+    token: Annotated[
+        Optional[str],
+        typer.Argument(help="Huggingface token."),
+    ] = None,
     cache: Annotated[
-        Path,
+        Optional[Path],
         typer.Option(
-            help="Path to the folder where files downloaded from Huggingface hub are stored.",
+            help="Path to the folder where files downloaded from Huggingface are stored.",
             exists=True,
             dir_okay=True,
             file_okay=False,
             writable=True,
             resolve_path=True,
         ),
     ] = None,
+    device: Annotated[
+        Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
+    ] = Device.AUTO,
 ):
     """
     Apply a pretrained PIPELINE to an AUDIO file or directory
     """
 
     # load pretrained pipeline
-    pretrained_pipeline = Pipeline.from_pretrained(pipeline, cache_dir=cache)
+    pretrained_pipeline = Pipeline.from_pretrained(
+        pipeline, revision=revision, token=token, cache_dir=cache
+    )
     if pretrained_pipeline is None:
         print(f"Could not load pretrained pipeline from {pipeline}.")
         raise typer.exit(code=1)
@@ -375,7 +391,6 @@ def apply(
     pretrained_pipeline.to(torch_device)
 
     if audio.is_dir():
-
         if into is None or not into.is_dir():
             typer.echo("When AUDIO is a directory, INTO must also be a directory.")
             raise typer.exit(code=1)
@@ -385,7 +400,6 @@ def apply(
         jsons: list[Path | None] = [into / (path.stem + ".json") for path in inputs]
 
     else:
-        
         if not (into is None or into.is_file()):
             typer.echo("When AUDIO is a file, INTO must also be a file.")
             raise typer.exit(code=1)
@@ -395,7 +409,6 @@ def apply(
         jsons: list[Path | None] = [into.with_suffix(".json") if into else None]
 
     for current_input, current_rttm, current_json in zip(inputs, rttms, jsons):
-
         prediction = pretrained_pipeline(current_input)
 
         speaker_diarization = get_diarization(prediction)
@@ -522,6 +535,27 @@ def benchmark(
             case_sensitive=False,
         ),
     ] = Subset.test,
+    revision: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Pretrained pipeline revision.",
+        ),
+    ] = None,
+    token: Annotated[
+        Optional[str],
+        typer.Argument(help="Huggingface token."),
+    ] = None,
+    cache: Annotated[
+        Optional[Path],
+        typer.Option(
+            help="Path to the folder where files downloaded from Huggingface are stored.",
+            exists=True,
+            dir_okay=True,
+            file_okay=False,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = None,
     device: Annotated[
         Device, typer.Option(help="Accelerator to use (CPU, CUDA, MPS)")
     ] = Device.AUTO,
@@ -538,17 +572,6 @@ def benchmark(
     num_speakers: Annotated[
         NumSpeakers, typer.Option(help="Number of speakers (oracle or auto)")
     ] = NumSpeakers.AUTO,
-    cache: Annotated[
-        Path,
-        typer.Option(
-            help="Path to the folder where files downloaded from Huggingface hub are stored.",
-            exists=True,
-            dir_okay=True,
-            file_okay=False,
-            writable=True,
-            resolve_path=True,
-        ),
-    ] = None,
     optimize: Annotated[
         bool,
         typer.Option(
@@ -562,10 +585,7 @@ def benchmark(
         ),
     ] = False,
     per_file: Annotated[
-        bool,
-        typer.Option(
-            help="Save one RTTM/JSON file per processed audio file."
-        )
+        bool, typer.Option(help="Save one RTTM/JSON file per processed audio file.")
     ] = False,
 ):
     """
@@ -578,7 +598,9 @@ def benchmark(
     """
 
     # load pretrained pipeline
-    pretrained_pipeline = Pipeline.from_pretrained(pipeline, cache_dir=cache)
+    pretrained_pipeline = Pipeline.from_pretrained(
+        pipeline, revision=revision, token=token, cache_dir=cache, 
+    )
     if pretrained_pipeline is None:
         print(f"Could not load pretrained pipeline from {pipeline}.")
         raise typer.exit(code=1)
@@ -808,15 +830,19 @@ def benchmark(
             yaml.dump({"min_duration_off": best_min_duration_off}, yml)
 
         if not per_file:
-            optimized_rttm_file = into / f"{benchmark_name}.OptimizedMinDurationOff.rttm"
+            optimized_rttm_file = (
+                into / f"{benchmark_name}.OptimizedMinDurationOff.rttm"
+            )
 
             # make sure we don't overwrite previous results
             if optimized_rttm_file.exists():
                 raise FileExistsError(f"{optimized_rttm_file} already exists.")
 
         for file in files:
             if per_file:
-                optimized_rttm_file = rttm_dir / f"{file['uri']}.OptimizedMinDurationOff.rttm"
+                optimized_rttm_file = (
+                    rttm_dir / f"{file['uri']}.OptimizedMinDurationOff.rttm"
+                )
 
             with open(optimized_rttm_file, "w" if per_file else "a") as rttm:
                 file["best_speaker_diarization"].write_rttm(rttm)
@@ -851,11 +877,11 @@ def strip(
     """
 
     keys = [
-        "pytorch-lightning_version",   # * pytorch-lightning needs
-        "hparams_name",                #   those values to initialize 
-        "hyper_parameters",            #   the model architecture
-        "state_dict",                  # * actual weights
-        "pyannote.audio",              # * pyannote.audio dependencies 
+        "pytorch-lightning_version",  # * pytorch-lightning needs
+        "hparams_name",  #   those values to initialize
+        "hyper_parameters",  #   the model architecture
+        "state_dict",  # * actual weights
+        "pyannote.audio",  # * pyannote.audio dependencies
     ]
 
     old_checkpoint = torch.load(
diff --git a/src/pyannote/audio/core/calibration.py b/src/pyannote/audio/core/calibration.py
@@ -40,6 +40,34 @@ class Calibration(IsotonicRegression):
     def __init__(self):
         super().__init__(y_min=0.0, y_max=1.0, increasing="auto", out_of_bounds="clip")
 
+    def safe_transform(
+        self,
+        values: np.ndarray,
+        nan_value: float = 2.0,
+    ) -> np.ndarray:
+        """Apply calibration to an array of any shape that may contain NaN values
+
+        Parameters
+        ----------
+        values : np.ndarray
+            Values to calibrate (flattened internally, so any shape is accepted)
+        nan_value : float, optional
+            Value substituted for NaN entries before calibration. Default is 2.0.
+
+        Returns
+        -------
+        calibrated_values : np.ndarray
+            Calibrated values, with the same shape as `values`
+        """
+        # substitute `nan_value` for NaNs (not restored afterwards) so `transform()` does not fail
+        transformed = np.nan_to_num(values.reshape(-1), nan=nan_value)
+
+        # apply calibration to the flattened (1-D) values
+        transformed: np.ndarray = self.transform(transformed)
+
+        # reshape the calibrated values back to the shape of the input
+        return transformed.reshape(values.shape)
+
     def save(self, path: str):
         """Save fitted calibration to disk
 
diff --git a/src/pyannote/audio/core/model.py b/src/pyannote/audio/core/model.py
@@ -599,7 +599,9 @@ def default_map_location(storage, loc):
             map_location = default_map_location
 
         # load checkpoint using lightning
-        loaded_checkpoint = pl_load(path_to_model_checkpoint, map_location=map_location)
+        loaded_checkpoint = pl_load(
+            path_to_model_checkpoint, map_location=map_location, weights_only=False
+        )
 
         # check that the checkpoint is compatible with the current version
         versions = loaded_checkpoint["pyannote.audio"]["versions"]
@@ -620,6 +622,7 @@ def default_map_location(storage, loc):
                 path_to_model_checkpoint,
                 map_location=map_location,
                 strict=strict,
+                weights_only=False,
                 **kwargs,
             )
         except RuntimeError as e:
@@ -638,6 +641,7 @@ def default_map_location(storage, loc):
                     path_to_model_checkpoint,
                     map_location=map_location,
                     strict=False,
+                    weights_only=False,
                     **kwargs,
                 )
                 return model