Brings back original VAD parameters naming (#1181)

Purfview · web-flow · commit 8327d8cc6472 · 2024-12-01T20:41:53.000+03:00
diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py
@@ -16,14 +16,14 @@ class VadOptions:
     """VAD options.
 
     Attributes:
-      onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
+      threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
         probabilities ABOVE this value are considered as SPEECH. It is better to tune this
         parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
-      offset: Silence threshold for determining the end of speech. If a probability is lower than
-        the offset, it is always considered silence. Values higher than offset are only considered
-        speech if the previous sample was classified as speech; otherwise, they are treated as
-        silence. This parameter helps refine the detection of speech transitions, ensuring smoother
-        segment boundaries.
+      neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
+        than neg_threshold, it is always considered silence. Values higher than neg_threshold
+        are only considered speech if the previous sample was classified as speech; otherwise,
+        they are treated as silence. This parameter helps refine the detection of speech
+        transitions, ensuring smoother segment boundaries.
       min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
       max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
         than max_speech_duration_s will be split at the timestamp of the last silence that
@@ -34,8 +34,8 @@ class VadOptions:
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
     """
 
-    onset: float = 0.5
-    offset: float = onset - 0.15
+    threshold: float = 0.5
+    neg_threshold: float = threshold - 0.15
     min_speech_duration_ms: int = 0
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
@@ -62,7 +62,7 @@ def get_speech_timestamps(
     if vad_options is None:
         vad_options = VadOptions(**kwargs)
 
-    onset = vad_options.onset
+    threshold = vad_options.threshold
     min_speech_duration_ms = vad_options.min_speech_duration_ms
     max_speech_duration_s = vad_options.max_speech_duration_s
     min_silence_duration_ms = vad_options.min_silence_duration_ms
@@ -90,20 +90,20 @@ def get_speech_timestamps(
     triggered = False
     speeches = []
     current_speech = {}
-    offset = vad_options.offset
+    neg_threshold = vad_options.neg_threshold
 
     # to save potential segment end (and tolerate some silence)
     temp_end = 0
     # to save potential segment limits in case of maximum segment size reached
     prev_end = next_start = 0
 
     for i, speech_prob in enumerate(speech_probs):
-        if (speech_prob >= onset) and temp_end:
+        if (speech_prob >= threshold) and temp_end:
             temp_end = 0
             if next_start < prev_end:
                 next_start = window_size_samples * i
 
-        if (speech_prob >= onset) and not triggered:
+        if (speech_prob >= threshold) and not triggered:
             triggered = True
             current_speech["start"] = window_size_samples * i
             continue
@@ -130,7 +130,7 @@ def get_speech_timestamps(
                 triggered = False
                 continue
 
-        if (speech_prob < offset) and triggered:
+        if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
                 temp_end = window_size_samples * i
             # condition to avoid cutting in very short silence