Commit 13da032

Authored by santhoshvai, hiroshihorie and claude
feat: send native speech activity events (#29)
## Summary by CodeRabbit

## Release Notes

* **New Features**
  * Added AudioDeviceModule API for controlling audio playback, recording, voice processing, microphone muting, and ducking settings.
  * Introduced speech activity detection with event notifications when speech starts or ends.
  * Added audio device event listeners for monitoring audio activity and processing states.
* **Chores**
  * Updated version to 137.1.4-alpha.5.

---------

Co-authored-by: Hiroshi Horie <548776+hiroshihorie@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c40fee0 commit 13da032

13 files changed: 611 additions & 10 deletions

.gitignore

Lines changed: 2 additions & 0 deletions

```diff
@@ -12,6 +12,8 @@ WebRTC.xcframework
 WebRTC.dSYMs
 examples/GumTestApp/package-lock.json
 examples/GumTestApp_macOS/package-lock.json
+**/.xcode.env.local
+**/PLAN.md
 *.jar
 *.tgz
 *.zip
```
android/src/main/java/com/oney/WebRTCModule/SpeechActivityDetector.java (new file)

Lines changed: 153 additions & 0 deletions

```java
package com.oney.WebRTCModule;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.ShortBuffer;

/**
 * Tells you when the user is talking, by watching how loud the mic is over time.
 *
 * <p>How it works:
 * <ol>
 *   <li>Every ~10 ms the mic gives us a chunk of samples.</li>
 *   <li>Convert each chunk to one "loudness" number in dBFS (decibels relative
 *       to full scale): quiet room ≈ -60 dB, normal speech ≈ -30 to -20 dB,
 *       speaking close to the mic ≈ -15 to -10 dB.</li>
 *   <li>Track two things only: <b>when we last saw a loud chunk</b> and
 *       <b>when the current run of loud chunks started</b>.</li>
 *   <li>Fire {@code onSpeechStarted} once we've had loud chunks for
 *       {@link #START_CONFIRM_MS} in a row. Fire {@code onSpeechEnded} once
 *       {@link #SILENCE_TIMEOUT_MS} has passed with no loud chunks. The
 *       timeout is long enough to span natural between-word pauses.</li>
 * </ol>
 *
 * <p><b>Why this, not a rolling dB average?</b> Android's AGC (automatic gain
 * control) ramps the mic gain back up the instant speech stops, amplifying
 * room noise to -35 or -40 dB. A rolling average over that noise never drops
 * below the threshold, so {@code onSpeechEnded} would never fire. Looking at
 * "time since last loud peak" is immune to that — pauses between words are
 * short, but a real stop is sustained.
 *
 * <p><b>Alignment with stream-video-android.</b> stream-video-android's
 * {@code SoundInputProcessor} fires only an "edge-up" callback and relies on
 * the app layer to infer "stopped". We need the {@code ended} edge to match
 * the iOS contract, so we add the silence-timeout inference here using the
 * same {@code -45 dBFS} threshold they use.
 *
 * <p><b>Not "real" voice recognition.</b> This only looks at energy/loudness,
 * not voice features. Loud non-voice sounds (typing, door slams, music) will
 * trigger {@code onSpeechStarted}. iOS uses Apple's hardware VAD which is
 * smarter, but Android has no equivalent — same tradeoff stream-video-android
 * lives with.
 *
 * <p>Thread-safety: single-threaded — only the WebRTC audio thread should call
 * {@link #processBuffer}. Listener callbacks fire synchronously on that thread;
 * the listener is responsible for dispatching to the JS thread.
 */
class SpeechActivityDetector {

    interface Listener {
        void onSpeechStarted();
        void onSpeechEnded();
    }

    /** Above this dBFS level a chunk counts as "loud". Matches stream-video-android. */
    private static final double THRESHOLD_DB = -45.0;
    /** Require loud chunks for this long before firing started (rejects door slams). */
    private static final long START_CONFIRM_MS = 150;
    /** Fire ended after this long with no loud chunk (spans natural between-word pauses). */
    private static final long SILENCE_TIMEOUT_MS = 900;

    private final Listener listener;

    private boolean isSpeaking = false;
    /** Start of the current run of above-threshold chunks, or -1 if last chunk was quiet. */
    private long firstLoudMs = -1;
    /** Last time any chunk was above threshold, or -1 if never (or cleared on ended). */
    private long lastLoudMs = -1;

    SpeechActivityDetector(Listener listener) {
        this.listener = listener;
    }

    /**
     * Feed one mic chunk through the detector. Reads PCM16 LE samples from
     * {@code audioBuffer} without mutating its position/limit. May fire a
     * listener callback synchronously if state flips.
     *
     * <p>Must be called on the WebRTC audio thread, BEFORE any code that mutates
     * {@code audioBuffer} (e.g. screen-audio mixing) — otherwise the detector
     * sees post-mix audio and triggers on system sounds.
     */
    void processBuffer(ByteBuffer audioBuffer, int bytesRead) {
        if (bytesRead <= 0) {
            return;
        }

        // Work on a duplicate so we never mutate the caller's position/limit.
        ByteBuffer buf = audioBuffer.duplicate();
        buf.position(0);
        buf.limit(bytesRead);
        buf.order(ByteOrder.LITTLE_ENDIAN);
        ShortBuffer shorts = buf.asShortBuffer();

        int numSamples = shorts.remaining();
        if (numSamples == 0) {
            return;
        }

        // Normalize int16 samples to [-1.0, 1.0] BEFORE squaring so the resulting
        // dB value is dBFS (decibels relative to full scale). Without this, dB is
        // computed against a 1-sample-unit reference and silence reads as ~+40.
        double sumSquares = 0;
        for (int i = 0; i < numSamples; i++) {
            double sample = shorts.get(i) / (double) Short.MAX_VALUE;
            sumSquares += sample * sample;
        }

        double rms = Math.sqrt(sumSquares / numSamples);
        double db = (rms > 0) ? 20.0 * Math.log10(rms) : -100.0;

        long now = System.currentTimeMillis();

        if (db > THRESHOLD_DB) {
            // Loud chunk. Open a start window if one isn't already open, and
            // remember this as the most recent loud chunk for ended timing.
            lastLoudMs = now;
            if (firstLoudMs < 0) {
                firstLoudMs = now;
            }
            if (!isSpeaking && now - firstLoudMs >= START_CONFIRM_MS) {
                isSpeaking = true;
                listener.onSpeechStarted();
            }
        } else {
            // Quiet chunk. Cancel any in-progress start confirmation. If we're
            // already speaking, fire ended once the silence is long enough.
            firstLoudMs = -1;
            if (isSpeaking && lastLoudMs > 0 && now - lastLoudMs >= SILENCE_TIMEOUT_MS) {
                isSpeaking = false;
                lastLoudMs = -1;
                listener.onSpeechEnded();
            }
        }
    }

    /** Wipes state. Call on recorder start. No event fires. */
    void reset() {
        isSpeaking = false;
        firstLoudMs = -1;
        lastLoudMs = -1;
    }

    /**
     * Call on recorder stop. If we were in {@code started}, force-fires
     * {@code onSpeechEnded} so JS doesn't get latched, then resets.
     */
    void onRecordStop() {
        if (isSpeaking) {
            listener.onSpeechEnded();
        }
        reset();
    }
}
```
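To make the start/end timing concrete, here is a minimal driver sketch, not part of the commit: it feeds the detector synthetic 10 ms PCM16 chunks built by a hypothetical `constantPcm16` helper. The amplitudes illustrate the dBFS math in `processBuffer`: a constant int16 value of 8000 normalizes to 8000/32767 ≈ 0.244, so 20·log10(0.244) ≈ -12.2 dBFS, well above `THRESHOLD_DB`; a value of 10 gives ≈ -70 dBFS, well below it. The class name, `main`, and `Thread.sleep` pacing are illustration-only assumptions; the real caller is WebRTC's audio thread.

```java
package com.oney.WebRTCModule; // same package: SpeechActivityDetector is package-private

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

// Hypothetical driver, not part of the commit.
public class SpeechActivityDetectorSketch {
    public static void main(String[] args) throws InterruptedException {
        SpeechActivityDetector detector = new SpeechActivityDetector(new SpeechActivityDetector.Listener() {
            @Override public void onSpeechStarted() { System.out.println("started"); }
            @Override public void onSpeechEnded() { System.out.println("ended"); }
        });

        // 480 samples = 10 ms at 48 kHz mono; 2 bytes per PCM16 sample.
        ByteBuffer loud = constantPcm16(480, (short) 8000); // ≈ -12 dBFS, above THRESHOLD_DB
        ByteBuffer quiet = constantPcm16(480, (short) 10);  // ≈ -70 dBFS, below THRESHOLD_DB

        // ~200 ms of loud chunks: crosses START_CONFIRM_MS (150 ms), prints "started".
        for (int i = 0; i < 20; i++) {
            detector.processBuffer(loud, loud.capacity());
            Thread.sleep(10);
        }
        // ~1 s of quiet chunks: crosses SILENCE_TIMEOUT_MS (900 ms), prints "ended".
        for (int i = 0; i < 100; i++) {
            detector.processBuffer(quiet, quiet.capacity());
            Thread.sleep(10);
        }
    }

    /** Builds a little-endian PCM16 buffer holding a constant amplitude. */
    private static ByteBuffer constantPcm16(int samples, short amplitude) {
        ByteBuffer buf = ByteBuffer.allocate(samples * 2).order(ByteOrder.LITTLE_ENDIAN);
        for (int i = 0; i < samples; i++) {
            buf.putShort(amplitude);
        }
        return buf;
    }
}
```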

android/src/main/java/com/oney/WebRTCModule/WebRTCModule.java

Lines changed: 32 additions & 0 deletions

```diff
@@ -51,6 +51,7 @@ public class WebRTCModule extends ReactContextBaseJavaModule {
     final Map<String, MediaStream> localStreams;

     private final GetUserMediaImpl getUserMediaImpl;
+    private SpeechActivityDetector speechActivityDetector;

     public WebRTCModule(ReactApplicationContext reactContext) {
         super(reactContext);
@@ -124,12 +125,32 @@ public WebRTCModule(ReactApplicationContext reactContext) {
     }

     private JavaAudioDeviceModule createAudioDeviceModule(ReactApplicationContext reactContext) {
+        speechActivityDetector = new SpeechActivityDetector(new SpeechActivityDetector.Listener() {
+            @Override
+            public void onSpeechStarted() {
+                WritableMap params = Arguments.createMap();
+                params.putString("event", "started");
+                sendEvent("audioDeviceModuleSpeechActivity", params);
+            }
+
+            @Override
+            public void onSpeechEnded() {
+                WritableMap params = Arguments.createMap();
+                params.putString("event", "ended");
+                sendEvent("audioDeviceModuleSpeechActivity", params);
+            }
+        });
+
         return JavaAudioDeviceModule
                 .builder(reactContext)
                 .setUseHardwareAcousticEchoCanceler(Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q)
                 .setUseHardwareNoiseSuppressor(Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q)
                 .setUseStereoOutput(true)
                 .setAudioBufferCallback((audioBuffer, audioFormat, channelCount, sampleRate, bytesRead, captureTimeNs) -> {
+                    // 1. Speech activity detection on raw mic data, BEFORE any mutation.
+                    speechActivityDetector.processBuffer(audioBuffer, bytesRead);
+
+                    // 2. Existing screen-audio mixing — mutates audioBuffer in place.
                     if (bytesRead > 0) {
                         WebRTCModuleOptions.ScreenAudioBytesProvider provider =
                                 WebRTCModuleOptions.getInstance().screenAudioBytesProvider;
@@ -142,6 +163,17 @@ private JavaAudioDeviceModule createAudioDeviceModule(ReactApplicationContext re
                     }
                     return captureTimeNs;
                 })
+                .setAudioRecordStateCallback(new JavaAudioDeviceModule.AudioRecordStateCallback() {
+                    @Override
+                    public void onWebRtcAudioRecordStart() {
+                        speechActivityDetector.reset();
+                    }
+
+                    @Override
+                    public void onWebRtcAudioRecordStop() {
+                        speechActivityDetector.onRecordStop();
+                    }
+                })
                 .createAudioDeviceModule();
     }
```
examples/GumTestApp/ios/GumTestApp.xcodeproj/project.pbxproj

Lines changed: 6 additions & 2 deletions

```diff
@@ -759,7 +759,9 @@
 			);
 			MTL_ENABLE_DEBUG_INFO = YES;
 			ONLY_ACTIVE_ARCH = YES;
-			OTHER_LDFLAGS = "$(inherited) ";
+			OTHER_LDFLAGS = (
+				"$(inherited)",
+			);
 			REACT_NATIVE_PATH = "${PODS_ROOT}/../../node_modules/react-native";
 			SDKROOT = iphoneos;
 			SWIFT_ACTIVE_COMPILATION_CONDITIONS = "$(inherited) DEBUG";
@@ -816,7 +818,9 @@
 				"\"$(inherited)\"",
 			);
 			MTL_ENABLE_DEBUG_INFO = NO;
-			OTHER_LDFLAGS = "$(inherited) ";
+			OTHER_LDFLAGS = (
+				"$(inherited)",
+			);
 			REACT_NATIVE_PATH = "${PODS_ROOT}/../../node_modules/react-native";
 			SDKROOT = iphoneos;
 			USE_HERMES = true;
```
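For context on the pbxproj change: `OTHER_LDFLAGS` as a quoted string and as a parenthesized list are equivalent to Xcode. The list form drops the stray trailing space inside the old string literal and matches the shape Xcode writes itself, so later tool-driven edits tend to diff more cleanly. That rationale is inferred from the diff rather than stated in the commit.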
Lines changed: 5 additions & 0 deletions (new Objective-C header declaring the RTCAudioDeviceModule category on WebRTCModule)

```objc
#import "WebRTCModule.h"

@interface WebRTCModule (RTCAudioDeviceModule)

@end
```
