@@ -16,14 +16,14 @@ class VadOptions:
16
16
"""VAD options.
17
17
18
18
Attributes:
19
- onset : Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
19
+ threshold : Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
20
20
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
21
21
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
22
- offset : Silence threshold for determining the end of speech. If a probability is lower than
23
- the offset , it is always considered silence. Values higher than offset are only considered
24
- speech if the previous sample was classified as speech; otherwise, they are treated as
25
- silence. This parameter helps refine the detection of speech transitions, ensuring smoother
26
- segment boundaries.
22
+ neg_threshold : Silence threshold for determining the end of speech. If a probability is lower
23
+ than neg_threshold, it is always considered silence. Values higher than neg_threshold
24
+ are only considered speech if the previous sample was classified as speech; otherwise,
25
+ they are treated as silence. This parameter helps refine the detection of speech
26
+ transitions, ensuring smoother segment boundaries.
27
27
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
28
28
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
29
29
than max_speech_duration_s will be split at the timestamp of the last silence that
@@ -34,8 +34,8 @@ class VadOptions:
34
34
speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
35
35
"""
36
36
37
- onset : float = 0.5
38
- offset : float = onset - 0.15
37
+ threshold : float = 0.5
38
+ neg_threshold : float = threshold - 0.15
39
39
min_speech_duration_ms : int = 0
40
40
max_speech_duration_s : float = float ("inf" )
41
41
min_silence_duration_ms : int = 2000
@@ -62,7 +62,7 @@ def get_speech_timestamps(
62
62
if vad_options is None :
63
63
vad_options = VadOptions (** kwargs )
64
64
65
- onset = vad_options .onset
65
+ threshold = vad_options .threshold
66
66
min_speech_duration_ms = vad_options .min_speech_duration_ms
67
67
max_speech_duration_s = vad_options .max_speech_duration_s
68
68
min_silence_duration_ms = vad_options .min_silence_duration_ms
@@ -90,20 +90,20 @@ def get_speech_timestamps(
90
90
triggered = False
91
91
speeches = []
92
92
current_speech = {}
93
- offset = vad_options .offset
93
+ neg_threshold = vad_options .neg_threshold
94
94
95
95
# to save potential segment end (and tolerate some silence)
96
96
temp_end = 0
97
97
# to save potential segment limits in case of maximum segment size reached
98
98
prev_end = next_start = 0
99
99
100
100
for i , speech_prob in enumerate (speech_probs ):
101
- if (speech_prob >= onset ) and temp_end :
101
+ if (speech_prob >= threshold ) and temp_end :
102
102
temp_end = 0
103
103
if next_start < prev_end :
104
104
next_start = window_size_samples * i
105
105
106
- if (speech_prob >= onset ) and not triggered :
106
+ if (speech_prob >= threshold ) and not triggered :
107
107
triggered = True
108
108
current_speech ["start" ] = window_size_samples * i
109
109
continue
@@ -130,7 +130,7 @@ def get_speech_timestamps(
130
130
triggered = False
131
131
continue
132
132
133
- if (speech_prob < offset ) and triggered :
133
+ if (speech_prob < neg_threshold ) and triggered :
134
134
if not temp_end :
135
135
temp_end = window_size_samples * i
136
136
# condition to avoid cutting in very short silence
0 commit comments