Skip to content

Commit 2b07ed2

Browse files
Mv 309 implement vad (from master branch) (#148)
* VAD implementation based on a paper * Cleaning old vad algorithm code * `is_speaking_esimator` and its tests * Writing VAD tests for new implementation * Module docs * Adding AudioLevelsQueue and its tests --------- Co-authored-by: Bartosz Błaszków <[email protected]>
1 parent db9ffbf commit 2b07ed2

12 files changed

+617
-225
lines changed

.vscode/settings.json

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cSpell.words": [
3+
"dbov",
4+
"immediates"
5+
]
6+
}

config/config.exs

+23-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
11
import Config
22

3-
config :membrane_rtp_plugin, :fir_throttle_duration_ms, 500
3+
config :membrane_rtp_plugin,
4+
fir_throttle_duration_ms: 500,
5+
vad_estimation_parameters: %{
6+
immediate: %{
7+
subunits: 1,
8+
score_threshold: 0,
9+
lambda: 1
10+
},
11+
medium: %{
12+
subunits: 10,
13+
score_threshold: 20,
14+
subunit_threshold: 1,
15+
lambda: 24
16+
},
17+
long: %{
18+
subunits: 7,
19+
score_threshold: 20,
20+
subunit_threshold: 3,
21+
lambda: 47
22+
}
23+
}
24+
25+
import_config "#{config_env()}.exs"

config/dev.exs

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import Config
2+
3+
# place for dev compile time env variables

config/prod.exs

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import Config
2+
3+
# place for production compile time env variables

config/test.exs

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import Config
2+
3+
config :membrane_rtp_plugin,
4+
vad_estimation_parameters: %{
5+
immediate: %{
6+
subunits: 2,
7+
score_threshold: 0.1,
8+
lambda: 1
9+
},
10+
medium: %{
11+
subunits: 2,
12+
score_threshold: 0.1,
13+
subunit_threshold: 2,
14+
lambda: 1
15+
},
16+
long: %{
17+
subunits: 2,
18+
score_threshold: 0.1,
19+
subunit_threshold: 1,
20+
lambda: 1
21+
}
22+
}

lib/membrane/rtp/vad.ex

+24-116
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,16 @@
11
defmodule Membrane.RTP.VAD do
22
@moduledoc """
3-
Simple vad based on audio level sent in RTP header.
3+
Vad based on audio level sent in RTP header.
44
5-
To make this module work appropriate RTP header extension has to be set in SDP offer/answer.
5+
To make this module work appropriate RTP header extension has to be set in the SDP offer/answer.
66
7-
If avg of audio level in packets in `time_window` exceeds `vad_threshold` it emits `Membrane.RTP.VadEvent`
8-
on its output pad.
7+
Sends `Membrane.RTP.VadEvent` when a score from `Membrane.RTP.Vad.IsSpeakingEstimator` changes.
98
10-
When avg falls below `vad_threshold` and doesn't exceed it in the next `vad_silence_timer`
11-
it also emits the event.
9+
A more detailed explanation of how the VAD algorithm can be found in the `Membrane.RTP.Vad.IsSpeakingEstimator` module.
1210
1311
Buffers that are processed by this element may or may not have been processed by
1412
a depayloader and passed through a jitter buffer. If they have not, then the only timestamp
15-
available for time comparison is the RTP timestamp. The delta between RTP timestamps is
16-
dependent on the clock rate used by the encoding. For `OPUS` the clock rate is `48kHz` and
17-
packets are sent every `20ms`, so the RTP timestamp delta between sequential packets should
18-
be `48000 / 1000 * 20`, or `960`.
13+
available for time comparison is the RTP timestamp.
1914
2015
When calculating the epoch of the timestamp, we need to account for 32bit integer wrapping.
2116
* `:current` - the difference between timestamps is low: the timestamp has not wrapped around.
@@ -26,6 +21,7 @@ defmodule Membrane.RTP.VAD do
2621
use Membrane.Filter
2722

2823
alias Membrane.RTP.{Header, Utils, VadEvent}
24+
alias Membrane.RTP.Vad.{AudioLevelQueue, IsSpeakingEstimator}
2925

3026
def_input_pad :input, availability: :always, accepted_format: _any, demand_mode: :auto
3127

@@ -35,59 +31,24 @@ defmodule Membrane.RTP.VAD do
3531
spec: 1..14,
3632
description: "ID of VAD header extension."
3733
],
38-
clock_rate: [
39-
spec: Membrane.RTP.clock_rate_t(),
40-
default: 48_000,
41-
description: "Clock rate (in `Hz`) for the encoding."
42-
],
43-
time_window: [
44-
spec: pos_integer(),
45-
default: 2_000,
46-
description: "Time window (in `ms`) in which avg audio level is measured."
47-
],
48-
min_packet_num: [
49-
spec: pos_integer(),
50-
default: 50,
51-
description: """
52-
Minimal number of packets to count avg audio level from.
53-
Speech won't be detected until there are enough packets.
54-
"""
55-
],
5634
vad_threshold: [
5735
spec: -127..0,
58-
default: -50,
36+
default: -32,
5937
description: """
6038
Audio level in dBov representing vad threshold.
6139
Values above are considered to represent voice activity.
6240
Value -127 represents digital silence.
6341
"""
64-
],
65-
vad_silence_time: [
66-
spec: pos_integer(),
67-
default: 300,
68-
description: """
69-
Time to wait before emitting `Membrane.RTP.VadEvent` after audio track is
70-
no longer considered to represent speech.
71-
If at this time audio track is considered to represent speech again the event will not be sent.
72-
"""
7342
]
7443

7544
@impl true
7645
def handle_init(_ctx, opts) do
7746
state = %{
7847
vad_id: opts.vad_id,
79-
audio_levels: Qex.new(),
80-
clock_rate: opts.clock_rate,
48+
audio_levels: AudioLevelQueue.new(),
8149
vad: :silence,
82-
vad_silence_timestamp: 0,
8350
current_timestamp: nil,
84-
rtp_timestamp_increment: opts.time_window * opts.clock_rate / 1000,
85-
min_packet_num: opts.min_packet_num,
86-
time_window: opts.time_window,
87-
vad_threshold: opts.vad_threshold,
88-
vad_silence_time: opts.vad_silence_time,
89-
audio_levels_sum: 0,
90-
audio_levels_count: 0
51+
vad_threshold: opts.vad_threshold + 127
9152
}
9253

9354
{[], state}
@@ -130,87 +91,34 @@ defmodule Membrane.RTP.VAD do
13091
end
13192
end
13293

133-
defp handle_vad(buffer, rtp_timestamp, level, state) do
134-
state = %{state | current_timestamp: rtp_timestamp}
135-
state = filter_old_audio_levels(state)
136-
state = add_new_audio_level(state, level)
137-
audio_levels_vad = get_audio_levels_vad(state)
138-
actions = [buffer: {:output, buffer}] ++ maybe_send_event(audio_levels_vad, state)
139-
state = update_vad_state(audio_levels_vad, state)
140-
{actions, state}
141-
end
94+
defp handle_vad(buffer, rtp_timestamp, level_in_dbov, state) do
95+
level_in_db = 127 - level_in_dbov
96+
updated_audio_levels = AudioLevelQueue.add(state.audio_levels, level_in_db)
14297

143-
defp filter_old_audio_levels(state) do
144-
Enum.reduce_while(state.audio_levels, state, fn {level, timestamp}, state ->
145-
if Ratio.sub(state.current_timestamp, timestamp)
146-
|> Ratio.gt?(state.rtp_timestamp_increment) do
147-
{_level, audio_levels} = Qex.pop(state.audio_levels)
148-
149-
state = %{
150-
state
151-
| audio_levels_sum: state.audio_levels_sum - level,
152-
audio_levels_count: state.audio_levels_count - 1,
153-
audio_levels: audio_levels
154-
}
155-
156-
{:cont, state}
157-
else
158-
{:halt, state}
159-
end
160-
end)
161-
end
98+
vad_estimation =
99+
updated_audio_levels
100+
|> AudioLevelQueue.to_list()
101+
|> IsSpeakingEstimator.estimate_is_speaking(state.vad_threshold)
162102

163-
defp add_new_audio_level(state, level) do
164-
audio_levels = Qex.push(state.audio_levels, {-level, state.current_timestamp})
103+
actions = [buffer: {:output, buffer}] ++ maybe_send_event(vad_estimation, state)
165104

166-
%{
105+
state = %{
167106
state
168-
| audio_levels: audio_levels,
169-
audio_levels_sum: state.audio_levels_sum + -level,
170-
audio_levels_count: state.audio_levels_count + 1
107+
| current_timestamp: rtp_timestamp,
108+
audio_levels: updated_audio_levels,
109+
vad: vad_estimation
171110
}
172-
end
173111

174-
defp get_audio_levels_vad(state) do
175-
if state.audio_levels_count >= state.min_packet_num and avg(state) >= state.vad_threshold,
176-
do: :speech,
177-
else: :silence
112+
{actions, state}
178113
end
179114

180-
defp avg(state), do: state.audio_levels_sum / state.audio_levels_count
181-
182115
defp maybe_send_event(audio_levels_vad, state) do
183-
if vad_silence?(audio_levels_vad, state) or vad_speech?(audio_levels_vad, state) do
116+
if vad_state_has_changed(state.vad, audio_levels_vad) do
184117
[event: {:output, %VadEvent{vad: audio_levels_vad}}]
185118
else
186119
[]
187120
end
188121
end
189122

190-
defp update_vad_state(audio_levels_vad, state) do
191-
cond do
192-
vad_maybe_silence?(audio_levels_vad, state) ->
193-
Map.merge(state, %{vad: :maybe_silence, vad_silence_timestamp: state.current_timestamp})
194-
195-
vad_silence?(audio_levels_vad, state) or vad_speech?(audio_levels_vad, state) ->
196-
Map.merge(state, %{vad: audio_levels_vad})
197-
198-
true ->
199-
state
200-
end
201-
end
202-
203-
defp vad_silence?(audio_levels_vad, state),
204-
do: state.vad == :maybe_silence and audio_levels_vad == :silence and timer_expired?(state)
205-
206-
defp vad_speech?(audio_levels_vad, state) do
207-
(state.vad == :maybe_silence and audio_levels_vad == :speech) or
208-
(state.vad == :silence and audio_levels_vad == :speech)
209-
end
210-
211-
defp vad_maybe_silence?(audio_levels_vad, state),
212-
do: state.vad == :speech and audio_levels_vad == :silence
213-
214-
defp timer_expired?(state),
215-
do: state.current_timestamp - state.vad_silence_timestamp > state.vad_silence_time
123+
defp vad_state_has_changed(old_value, new_value), do: old_value != new_value
216124
end
+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
defmodule Membrane.RTP.Vad.AudioLevelQueue do
2+
@moduledoc false
3+
4+
# The queue contains audio levels for VAD implementation. It is used as an input of IsSpeakingEstimator.estimate_is_speaking.
5+
# This structure builds on top of a simple FIFO Erlang queue by having a fixed max number of elements.
6+
7+
# The newest element in always appended to the front and popped out from its rear, so `to_list/1` returns the most recent element as the head of a list.
8+
# The length of a list can be obtained in O(1) time.
9+
10+
alias Membrane.RTP.Vad.VadParams
11+
12+
@target_audio_level_length VadParams.target_levels_length()
13+
14+
@enforce_keys [:levels, :length]
15+
defstruct [:levels, :length]
16+
17+
@typedoc """
18+
A type for storing information about a fixed number of recent audio levels.
19+
20+
`:levels` - erlang queue which stores at most @target_audio_level_length elements
21+
`:length` - number of elements
22+
"""
23+
24+
@type t() :: %__MODULE__{
25+
levels: :queue.queue(non_neg_integer()),
26+
length: non_neg_integer()
27+
}
28+
29+
@doc """
30+
Creates new AudioLevelQueue.
31+
"""
32+
@spec new(Enum.t()) :: t()
33+
def new(init_data \\ []) do
34+
levels =
35+
init_data
36+
|> Enum.take(@target_audio_level_length)
37+
|> Enum.to_list()
38+
|> :queue.from_list()
39+
40+
%__MODULE__{levels: levels, length: :queue.len(levels)}
41+
end
42+
43+
@doc """
44+
Given a AudioLevelQueue and level value it returns a queue with the level value on front
45+
46+
The function also reduces the size of the queue if the maximum size has been reached.
47+
It does so by dropping the oldest level.
48+
"""
49+
@spec add(t(), non_neg_integer) :: t()
50+
def add(%__MODULE__{length: @target_audio_level_length} = old_queue, level) do
51+
levels = :queue.in_r(level, :queue.drop_r(old_queue.levels))
52+
%__MODULE__{old_queue | levels: levels}
53+
end
54+
55+
def add(%__MODULE__{levels: old_levels, length: length}, level) do
56+
%__MODULE__{levels: :queue.in_r(level, old_levels), length: length + 1}
57+
end
58+
59+
@doc """
60+
Given an AudioLevelQueue it returns a list.
61+
"""
62+
@spec to_list(t()) :: [non_neg_integer()]
63+
def to_list(%__MODULE__{levels: levels}), do: :queue.to_list(levels)
64+
end

0 commit comments

Comments
 (0)