Skip to content

Commit 8327d8c

Browse files
authored
Brings back original VAD parameters naming (#1181)
1 parent 22a5238 commit 8327d8c

File tree

1 file changed

+13
-13
lines changed

1 file changed

+13
-13
lines changed

faster_whisper/vad.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@ class VadOptions:
1616
"""VAD options.
1717
1818
Attributes:
19-
onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
19+
threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
2020
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
2121
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
22-
offset: Silence threshold for determining the end of speech. If a probability is lower than
23-
the offset, it is always considered silence. Values higher than offset are only considered
24-
speech if the previous sample was classified as speech; otherwise, they are treated as
25-
silence. This parameter helps refine the detection of speech transitions, ensuring smoother
26-
segment boundaries.
22+
neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
23+
than neg_threshold, it is always considered silence. Values higher than neg_threshold
24+
are only considered speech if the previous sample was classified as speech; otherwise,
25+
they are treated as silence. This parameter helps refine the detection of speech
26+
transitions, ensuring smoother segment boundaries.
2727
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
2828
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
2929
than max_speech_duration_s will be split at the timestamp of the last silence that
@@ -34,8 +34,8 @@ class VadOptions:
3434
speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
3535
"""
3636

37-
onset: float = 0.5
38-
offset: float = onset - 0.15
37+
threshold: float = 0.5
38+
neg_threshold: float = threshold - 0.15
3939
min_speech_duration_ms: int = 0
4040
max_speech_duration_s: float = float("inf")
4141
min_silence_duration_ms: int = 2000
@@ -62,7 +62,7 @@ def get_speech_timestamps(
6262
if vad_options is None:
6363
vad_options = VadOptions(**kwargs)
6464

65-
onset = vad_options.onset
65+
threshold = vad_options.threshold
6666
min_speech_duration_ms = vad_options.min_speech_duration_ms
6767
max_speech_duration_s = vad_options.max_speech_duration_s
6868
min_silence_duration_ms = vad_options.min_silence_duration_ms
@@ -90,20 +90,20 @@ def get_speech_timestamps(
9090
triggered = False
9191
speeches = []
9292
current_speech = {}
93-
offset = vad_options.offset
93+
neg_threshold = vad_options.neg_threshold
9494

9595
# to save potential segment end (and tolerate some silence)
9696
temp_end = 0
9797
# to save potential segment limits in case of maximum segment size reached
9898
prev_end = next_start = 0
9999

100100
for i, speech_prob in enumerate(speech_probs):
101-
if (speech_prob >= onset) and temp_end:
101+
if (speech_prob >= threshold) and temp_end:
102102
temp_end = 0
103103
if next_start < prev_end:
104104
next_start = window_size_samples * i
105105

106-
if (speech_prob >= onset) and not triggered:
106+
if (speech_prob >= threshold) and not triggered:
107107
triggered = True
108108
current_speech["start"] = window_size_samples * i
109109
continue
@@ -130,7 +130,7 @@ def get_speech_timestamps(
130130
triggered = False
131131
continue
132132

133-
if (speech_prob < offset) and triggered:
133+
if (speech_prob < neg_threshold) and triggered:
134134
if not temp_end:
135135
temp_end = window_size_samples * i
136136
# condition to avoid cutting in very short silence

0 commit comments

Comments
 (0)