@@ -3,6 +3,67 @@ import { TinyEmitter } from "@project_neko/common";
33import { SpeechInterruptController } from "../protocol" ;
44import type { AudioService , AudioServiceEvents , AudioServiceState , NekoWsIncomingJson , RealtimeClientLike } from "../types" ;
55
/**
 * Root-mean-square amplitude of one frame of signed 16-bit PCM samples.
 *
 * Every sample is divided by 32768 before squaring, so the result lies in
 * the range [0, 1]. This is the energy measure used by the simple
 * RMS + debounce voice activity detector in this file (intended for
 * React Native).
 *
 * @param int16 - PCM samples of a single frame.
 * @returns RMS of the normalized samples, or 0 when the frame is empty.
 */
function calcRMS(int16: Int16Array): number {
  // Without this guard an empty frame computes 0 / 0 and returns NaN, which
  // compares false against every threshold downstream.
  if (int16.length === 0) return 0;

  let sumOfSquares = 0;
  for (const sample of int16) {
    sumOfSquares += (sample / 32768) ** 2;
  }
  return Math.sqrt(sumOfSquares / int16.length);
}
12+
/** Mutable bookkeeping kept by the simple RMS-based voice activity detector. */
interface SimpleVADState {
  /** Speech frames counted so far toward confirming a speech start. */
  consecutiveSpeechFrames: number;
  /** Silent frames counted so far toward confirming a speech end. */
  consecutiveSilenceFrames: number;
  /** Whether the detector currently considers the user to be speaking. */
  isSpeaking: boolean;
}
18+
/**
 * Creates a minimal energy-based voice activity detector (VAD).
 *
 * Each frame is classified by its RMS amplitude (see `calcRMS`):
 * - RMS >= `speechThreshold`  -> speech frame
 * - RMS <  `silenceThreshold` -> silent frame
 * - anything in between       -> neither (hysteresis band)
 *
 * Speech start is confirmed after `minSpeechFrames` consecutive speech
 * frames; speech end is confirmed after `silenceFrames` consecutive silent
 * frames. The callbacks are invoked synchronously from `processFrame` at the
 * moment of the transition.
 *
 * @param opts - Thresholds, debounce lengths and optional transition callbacks.
 * @returns An object with:
 *   - `processFrame(int16)`: feeds one PCM frame and returns whether the
 *     detector is in the speaking state afterwards;
 *   - `reset()`: clears the speaking flag and both counters without firing
 *     any callback;
 *   - `getState()`: returns a shallow copy of the internal state.
 */
function createSimpleVAD(opts: {
  /** RMS at or above this value counts as speech. Default 0.02. */
  speechThreshold?: number;
  /** RMS below this value counts as silence. Default 0.01. */
  silenceThreshold?: number;
  /** Consecutive speech frames needed to confirm speech start. Default 2. */
  minSpeechFrames?: number;
  /** Consecutive silent frames needed to confirm speech end. Default 8. */
  silenceFrames?: number;
  /** Called once when the detector switches to the speaking state. */
  onSpeechStart?: () => void;
  /** Called once when the detector leaves the speaking state. */
  onSpeechEnd?: () => void;
}) {
  const speechThreshold = opts.speechThreshold ?? 0.02;
  const silenceThreshold = opts.silenceThreshold ?? 0.01;
  const minSpeechFrames = opts.minSpeechFrames ?? 2;
  const silenceFrames = opts.silenceFrames ?? 8;

  const state: SimpleVADState = {
    isSpeaking: false,
    consecutiveSpeechFrames: 0,
    consecutiveSilenceFrames: 0,
  };

  function processFrame(int16: Int16Array): boolean {
    const rms = calcRMS(int16);

    if (rms >= speechThreshold) {
      state.consecutiveSpeechFrames++;
      state.consecutiveSilenceFrames = 0;
      if (!state.isSpeaking && state.consecutiveSpeechFrames >= minSpeechFrames) {
        state.isSpeaking = true;
        opts.onSpeechStart?.();
      }
    } else if (rms < silenceThreshold) {
      state.consecutiveSilenceFrames++;
      state.consecutiveSpeechFrames = 0;
      if (state.isSpeaking && state.consecutiveSilenceFrames >= silenceFrames) {
        state.isSpeaking = false;
        opts.onSpeechEnd?.();
      }
    } else {
      // Hysteresis band: the frame is neither speech nor silence, so it must
      // not flip the speaking flag. It does interrupt any run in progress;
      // otherwise frames separated by in-band frames would still be counted
      // as "consecutive" and could confirm a start or an end.
      state.consecutiveSpeechFrames = 0;
      state.consecutiveSilenceFrames = 0;
    }

    return state.isSpeaking;
  }

  function reset() {
    state.isSpeaking = false;
    state.consecutiveSpeechFrames = 0;
    state.consecutiveSilenceFrames = 0;
  }

  return { processFrame, reset, getState: () => ({ ...state }) };
}
66+
667function withTimeout < T > ( p : Promise < T > , ms : number , message : string ) : Promise < T > {
768 if ( ! ms || ms <= 0 ) return p ;
869 let timer : any = null ;
@@ -30,6 +91,17 @@ export function createNativeAudioService(args: {
3091 const emitter = new TinyEmitter < AudioServiceEvents > ( ) ;
3192 const interrupt = new SpeechInterruptController ( ) ;
3293
94+ // 当前是否正在播放(用于 VAD 门控)
95+ let isPlaying = false ;
96+
97+ // 客户端 VAD:过滤回声,只有真正的人声才发给服务器
98+ const vad = createSimpleVAD ( {
99+ speechThreshold : 0.08 ,
100+ silenceThreshold : 0.06 ,
101+ minSpeechFrames : 2 ,
102+ silenceFrames : 8 ,
103+ } ) ;
104+
33105 let state : AudioServiceState = "idle" ;
34106 const setState = ( next : AudioServiceState ) => {
35107 if ( state === next ) return ;
@@ -59,15 +131,15 @@ export function createNativeAudioService(args: {
59131 if ( ampSub ) return ;
60132
61133 ampSub = PCMStream . addListener ( "onAmplitudeUpdate" , ( event : any ) => {
62- // 打断后屏蔽一段时间,防止缓冲区余音让 isPlaying 保持 true
63134 if ( Date . now ( ) < outputAmpMutedUntil ) return ;
64135 const amp = typeof event ?. amplitude === "number" ? event . amplitude : 0 ;
65- // onAmplitudeUpdate 来自 PCMStreamPlayer(播放振幅),应映射为 outputAmplitude
136+ isPlaying = amp > 0.01 ;
66137 emitter . emit ( "outputAmplitude" , { amplitude : Math . max ( 0 , Math . min ( 1 , amp ) ) } ) ;
67138 } ) ;
68139
69140 playbackStopSub = PCMStream . addListener ( "onPlaybackStop" , ( ) => {
70- // 播放完成:输出 0,方便口型收嘴
141+ isPlaying = false ;
142+ vad . reset ( ) ;
71143 emitter . emit ( "outputAmplitude" , { amplitude : 0 } ) ;
72144 } ) ;
73145 } ;
@@ -91,12 +163,16 @@ export function createNativeAudioService(args: {
91163 const pcm : Uint8Array | undefined = event ?. pcm ;
92164 if ( ! pcm ) return ;
93165
94- // 打断后静音期内丢弃麦克风数据,防止扬声器尾音被当作用户输入
166+ // 打断后静音期内丢弃麦克风数据
95167 if ( Date . now ( ) < micMutedUntil ) return ;
96168
97- // 与旧版协议一致:stream_data + input_type=audio + data 为 number[]
169+ const int16 = new Int16Array ( pcm . buffer . slice ( pcm . byteOffset , pcm . byteOffset + pcm . byteLength ) ) ;
170+
171+ // 客户端 VAD 门控:播放时只有检测到真实人声才发送,过滤回声
172+ const isSpeaking = vad . processFrame ( int16 ) ;
173+ if ( isPlaying && ! isSpeaking ) return ;
174+
98175 try {
99- const int16 = new Int16Array ( pcm . buffer . slice ( pcm . byteOffset , pcm . byteOffset + pcm . byteLength ) ) ;
100176 args . client . sendJson ( {
101177 action : "stream_data" ,
102178 data : Array . from ( int16 as any ) ,
@@ -302,11 +378,10 @@ export function createNativeAudioService(args: {
302378 try {
303379 PCMStream . stopPlayback ( ) ;
304380 } catch ( _e ) { }
305- // 手动打断:丢弃后续飞来的 binary 音频帧,直到新一轮 audio_chunk 到来
381+ isPlaying = false ;
382+ vad . reset ( ) ;
306383 manualInterruptActive = true ;
307- // 打断后静音麦克风一段时间,避免扬声器尾音被录入
308384 micMutedUntil = Date . now ( ) + MIC_MUTE_AFTER_INTERRUPT_MS ;
309- // 打断后屏蔽播放振幅事件,避免缓冲区余音让按钮消不掉
310385 outputAmpMutedUntil = Date . now ( ) + OUTPUT_AMP_MUTE_AFTER_INTERRUPT_MS ;
311386 emitter . emit ( "outputAmplitude" , { amplitude : 0 } ) ;
312387 } ;
0 commit comments