1
1
defmodule Membrane.RTP.VAD do
2
2
@ moduledoc """
3
- Simple vad based on audio level sent in RTP header.
3
+ VAD based on the audio level sent in the RTP header.
4
4
5
- To make this module work appropriate RTP header extension has to be set in SDP offer/answer.
5
+ To make this module work appropriate RTP header extension has to be set in the SDP offer/answer.
6
6
7
- If avg of audio level in packets in `time_window` exceeds `vad_threshold` it emits `Membrane.RTP.VadEvent`
8
- on its output pad.
7
+ Sends `Membrane.RTP.VadEvent` when a score from `Membrane.RTP.Vad.IsSpeakingEstimator` changes.
9
8
10
- When avg falls below `vad_threshold` and doesn't exceed it in the next `vad_silence_timer`
11
- it also emits the event.
9
+ A more detailed explanation of how the VAD algorithm works can be found in the `Membrane.RTP.Vad.IsSpeakingEstimator` module.
12
10
13
11
Buffers that are processed by this element may or may not have been processed by
14
12
a depayloader and passed through a jitter buffer. If they have not, then the only timestamp
15
- available for time comparison is the RTP timestamp. The delta between RTP timestamps is
16
- dependent on the clock rate used by the encoding. For `OPUS` the clock rate is `48kHz` and
17
- packets are sent every `20ms`, so the RTP timestamp delta between sequential packets should
18
- be `48000 / 1000 * 20`, or `960`.
13
+ available for time comparison is the RTP timestamp.
19
14
20
15
When calculating the epoch of the timestamp, we need to account for 32bit integer wrapping.
21
16
* `:current` - the difference between timestamps is low: the timestamp has not wrapped around.
@@ -26,6 +21,7 @@ defmodule Membrane.RTP.VAD do
26
21
use Membrane.Filter
27
22
28
23
alias Membrane.RTP . { Header , Utils , VadEvent }
24
+ alias Membrane.RTP.Vad . { AudioLevelQueue , IsSpeakingEstimator }
29
25
30
26
def_input_pad :input , availability: :always , accepted_format: _any , demand_mode: :auto
31
27
@@ -35,59 +31,24 @@ defmodule Membrane.RTP.VAD do
35
31
spec: 1 .. 14 ,
36
32
description: "ID of VAD header extension."
37
33
] ,
38
- clock_rate: [
39
- spec: Membrane.RTP . clock_rate_t ( ) ,
40
- default: 48_000 ,
41
- description: "Clock rate (in `Hz`) for the encoding."
42
- ] ,
43
- time_window: [
44
- spec: pos_integer ( ) ,
45
- default: 2_000 ,
46
- description: "Time window (in `ms`) in which avg audio level is measured."
47
- ] ,
48
- min_packet_num: [
49
- spec: pos_integer ( ) ,
50
- default: 50 ,
51
- description: """
52
- Minimal number of packets to count avg audio level from.
53
- Speech won't be detected until there are enough packets.
54
- """
55
- ] ,
56
34
vad_threshold: [
57
35
spec: - 127 .. 0 ,
58
- default: - 50 ,
36
+ default: - 32 ,
59
37
description: """
60
38
Audio level in dBov representing vad threshold.
61
39
Values above are considered to represent voice activity.
62
40
Value -127 represents digital silence.
63
41
"""
64
- ] ,
65
- vad_silence_time: [
66
- spec: pos_integer ( ) ,
67
- default: 300 ,
68
- description: """
69
- Time to wait before emitting `Membrane.RTP.VadEvent` after audio track is
70
- no longer considered to represent speech.
71
- If at this time audio track is considered to represent speech again the event will not be sent.
72
- """
73
42
]
74
43
75
44
@ impl true
76
45
def handle_init ( _ctx , opts ) do
77
46
state = % {
78
47
vad_id: opts . vad_id ,
79
- audio_levels: Qex . new ( ) ,
80
- clock_rate: opts . clock_rate ,
48
+ audio_levels: AudioLevelQueue . new ( ) ,
81
49
vad: :silence ,
82
- vad_silence_timestamp: 0 ,
83
50
current_timestamp: nil ,
84
- rtp_timestamp_increment: opts . time_window * opts . clock_rate / 1000 ,
85
- min_packet_num: opts . min_packet_num ,
86
- time_window: opts . time_window ,
87
- vad_threshold: opts . vad_threshold ,
88
- vad_silence_time: opts . vad_silence_time ,
89
- audio_levels_sum: 0 ,
90
- audio_levels_count: 0
51
+ vad_threshold: opts . vad_threshold + 127
91
52
}
92
53
93
54
{ [ ] , state }
@@ -130,87 +91,34 @@ defmodule Membrane.RTP.VAD do
130
91
end
131
92
end
132
93
133
- defp handle_vad ( buffer , rtp_timestamp , level , state ) do
134
- state = % { state | current_timestamp: rtp_timestamp }
135
- state = filter_old_audio_levels ( state )
136
- state = add_new_audio_level ( state , level )
137
- audio_levels_vad = get_audio_levels_vad ( state )
138
- actions = [ buffer: { :output , buffer } ] ++ maybe_send_event ( audio_levels_vad , state )
139
- state = update_vad_state ( audio_levels_vad , state )
140
- { actions , state }
141
- end
94
+ defp handle_vad ( buffer , rtp_timestamp , level_in_dbov , state ) do
95
+ level_in_db = 127 - level_in_dbov
96
+ updated_audio_levels = AudioLevelQueue . add ( state . audio_levels , level_in_db )
142
97
143
- defp filter_old_audio_levels ( state ) do
144
- Enum . reduce_while ( state . audio_levels , state , fn { level , timestamp } , state ->
145
- if Ratio . sub ( state . current_timestamp , timestamp )
146
- |> Ratio . gt? ( state . rtp_timestamp_increment ) do
147
- { _level , audio_levels } = Qex . pop ( state . audio_levels )
148
-
149
- state = % {
150
- state
151
- | audio_levels_sum: state . audio_levels_sum - level ,
152
- audio_levels_count: state . audio_levels_count - 1 ,
153
- audio_levels: audio_levels
154
- }
155
-
156
- { :cont , state }
157
- else
158
- { :halt , state }
159
- end
160
- end )
161
- end
98
+ vad_estimation =
99
+ updated_audio_levels
100
+ |> AudioLevelQueue . to_list ( )
101
+ |> IsSpeakingEstimator . estimate_is_speaking ( state . vad_threshold )
162
102
163
- defp add_new_audio_level ( state , level ) do
164
- audio_levels = Qex . push ( state . audio_levels , { - level , state . current_timestamp } )
103
+ actions = [ buffer: { :output , buffer } ] ++ maybe_send_event ( vad_estimation , state )
165
104
166
- % {
105
+ state = % {
167
106
state
168
- | audio_levels: audio_levels ,
169
- audio_levels_sum: state . audio_levels_sum + - level ,
170
- audio_levels_count: state . audio_levels_count + 1
107
+ | current_timestamp: rtp_timestamp ,
108
+ audio_levels: updated_audio_levels ,
109
+ vad: vad_estimation
171
110
}
172
- end
173
111
174
- defp get_audio_levels_vad ( state ) do
175
- if state . audio_levels_count >= state . min_packet_num and avg ( state ) >= state . vad_threshold ,
176
- do: :speech ,
177
- else: :silence
112
+ { actions , state }
178
113
end
179
114
180
- defp avg ( state ) , do: state . audio_levels_sum / state . audio_levels_count
181
-
182
115
defp maybe_send_event ( audio_levels_vad , state ) do
183
- if vad_silence? ( audio_levels_vad , state ) or vad_speech? ( audio_levels_vad , state ) do
116
+ if vad_state_has_changed ( state . vad , audio_levels_vad ) do
184
117
[ event: { :output , % VadEvent { vad: audio_levels_vad } } ]
185
118
else
186
119
[ ]
187
120
end
188
121
end
189
122
190
- defp update_vad_state ( audio_levels_vad , state ) do
191
- cond do
192
- vad_maybe_silence? ( audio_levels_vad , state ) ->
193
- Map . merge ( state , % { vad: :maybe_silence , vad_silence_timestamp: state . current_timestamp } )
194
-
195
- vad_silence? ( audio_levels_vad , state ) or vad_speech? ( audio_levels_vad , state ) ->
196
- Map . merge ( state , % { vad: audio_levels_vad } )
197
-
198
- true ->
199
- state
200
- end
201
- end
202
-
203
- defp vad_silence? ( audio_levels_vad , state ) ,
204
- do: state . vad == :maybe_silence and audio_levels_vad == :silence and timer_expired? ( state )
205
-
206
- defp vad_speech? ( audio_levels_vad , state ) do
207
- ( state . vad == :maybe_silence and audio_levels_vad == :speech ) or
208
- ( state . vad == :silence and audio_levels_vad == :speech )
209
- end
210
-
211
- defp vad_maybe_silence? ( audio_levels_vad , state ) ,
212
- do: state . vad == :speech and audio_levels_vad == :silence
213
-
214
- defp timer_expired? ( state ) ,
215
- do: state . current_timestamp - state . vad_silence_timestamp > state . vad_silence_time
123
+ defp vad_state_has_changed ( old_value , new_value ) , do: old_value != new_value
216
124
end
0 commit comments