Skip to content

Commit 0da39da

Browse files
aelissee and copybara-github
authored and committed
Introduces a VAD processor to detect start and end of speech.
This is handy when using LLMs that accept audio parts. VAD processors can be used as pre-processors before the model is called to collect the audio spoken by the user. See vad_cli.py for an example of usage. PiperOrigin-RevId: 890866721
1 parent a212ec0 commit 0da39da

9 files changed

Lines changed: 1037 additions & 36 deletions

File tree

examples/README.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,18 @@ We recommend checking the following CLI examples first:
3434

3535
Sub-directories include more complex agents like
3636
[Research](https://github.com/google-gemini/genai-processors/blob/main/examples/research/README.md)
37-
(deep research agent) or
37+
(deep research agent),
3838
[Commentator](https://github.com/google-gemini/genai-processors/blob/main/examples/live_commentator/README.md)
39-
(live commentator on a video feed including an interruption mechanism). Check
40-
the README files in these subdirectories to get an in-depth description of how
41-
they work and how they were built.
39+
(live commentator on a video feed including an interruption mechanism), or
40+
[Live Illustrator](https://github.com/google-gemini/genai-processors/blob/main/examples/live_illustrator/README.md)
41+
(continuously listens to audio and generates accompanying images triggered by speech).
42+
Check the README files in these subdirectories to get an in-depth
43+
description of how they work and how they were built.
4244

4345
Other CLIs like
44-
[speech_to_text_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/speech_to_text_cli.py)
46+
[speech_to_text_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/speech_to_text_cli.py),
47+
[text_to_speech_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/text_to_speech_cli.py),
4548
or
46-
[text_to_speech_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/text_to_speech_cli.py)
47-
are simple wrappers around existing processor and can be used to check that your
48-
environment is set up correctly, e.g. to use the Google Speech API.
49+
[vad_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/vad_cli.py)
50+
are simple wrappers around an existing processor and can be used to check that your
51+
environment is set up correctly, e.g. to use the Google Speech API or local VAD.

examples/vad_cli.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Copyright 2026 DeepMind Technologies Limited. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ==============================================================================
15+
r"""Command Line Interface to test Voice Activity Detection + GenAI Model.
16+
17+
## Setup
18+
19+
To install the dependencies for this script, run:
20+
21+
```
22+
pip install genai-processors pyaudio
23+
```
24+
25+
Before running this script, ensure the `GOOGLE_API_KEY` environment
26+
variable is set to your Gemini API key.
27+
28+
## Run
29+
30+
To run the script:
31+
32+
```shell
33+
python3 ./vad_cli.py
34+
```
35+
"""
36+
37+
import asyncio
38+
import os
39+
import time
40+
41+
from genai_processors import content_api
42+
from genai_processors import mime_types
43+
from genai_processors.core import audio
44+
from genai_processors.core import audio_io
45+
from genai_processors.core import genai_model
46+
from genai_processors.core import realtime
47+
from genai_processors.core import speech_events
48+
from genai_processors.core import text
49+
from genai_processors.core import vad
50+
import pyaudio
51+
52+
53+
# You need to define the api key in the environment variables.
54+
# export GOOGLE_API_KEY=...
55+
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
56+
57+
58+
async def run_vad_cli() -> None:
  """Runs the VAD + Gemini model in a CLI environment.

  Microphone audio is streamed through `vad.Vad()`. The `LiveProcessor`
  accumulates content until an end-of-speech event, converts the accumulated
  audio to a single WAV part and sends it to the Gemini model. The pipeline
  output is printed with `text.terminal_output`.

  The PyAudio instance is always terminated on exit, including when building
  the pipeline fails.
  """
  # `processor` is not part of the file-level imports; it is only needed to
  # declare the status helper below, so it is imported locally.
  from genai_processors import processor

  @processor.part_processor_function
  async def add_speech_event_status(part: content_api.ProcessorPart):
    """Forwards `part` unchanged, preceded by a status part on speech events.

    This helper was referenced by the pipeline but never defined, which made
    the script fail with a NameError as soon as the pipeline was built.
    """
    if speech_events.is_start_of_speech(part):
      yield processor.status(f'{time.perf_counter():.2f} - Speech started\n')
    elif speech_events.is_end_of_speech(part):
      yield processor.status(f'{time.perf_counter():.2f} - Speech ended\n')
    # Always forward the original part so that downstream processors
    # (LiveProcessor) still receive the audio and the speech events.
    yield part

  pya = pyaudio.PyAudio()
  try:
    # The base model that we want to query.
    base_model = genai_model.GenaiModel(
        api_key=GOOGLE_API_KEY,
        model_name='gemini-2.5-flash',
    )

    # The LiveProcessor manages conversation turns. It accumulates all the
    # content until EndOfSpeech (emitted by vad.Vad()) and then sends the
    # accumulated audio (which AudioToWav converts into a single .wav part)
    # to the GenAI model.
    live_processor = realtime.LiveProcessor(
        turn_processor=(audio.AudioToWav() + base_model),
        trigger_model_mode=realtime.AudioTriggerMode.END_OF_SPEECH,
    )

    pipeline = (
        audio_io.PyAudioIn(pya)
        # Aggressive VAD to detect speech activity (fewer false positives).
        # The user needs to speak loudly and clearly.
        + vad.Vad()
        # Adds status parts to provide feedback to the user.
        + add_speech_event_status
        + live_processor
    )

    print(
        f'{time.perf_counter():.2f} - VAD + Model pipeline ready: start'
        ' talking!'
    )
    await text.terminal_output(pipeline(text.terminal_input()))
  finally:
    # Release the audio devices even if pipeline setup or execution failed.
    pya.terminate()
95+
96+
97+
if __name__ == '__main__':
98+
asyncio.run(run_vad_cli())

genai_processors/core/realtime.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from genai_processors import mime_types
3838
from genai_processors import processor
3939
from genai_processors import streams
40-
from genai_processors.core import speech_to_text
40+
from genai_processors.core import speech_events
4141
from genai_processors.core import window
4242

4343

@@ -49,7 +49,7 @@
4949
CONVERSATION_START = '\nThe following is your conversation so far:\n'
5050

5151
# Substream name to output part directly as is without going through the model.
52-
DIRECT_OUTPUT_SUBSTREAM = speech_to_text.TRANSCRIPTION_SUBSTREAM_NAME
52+
DIRECT_OUTPUT_SUBSTREAM = speech_events.TRANSCRIPTION_SUBSTREAM_NAME
5353
# Metadata key that should be set to True if the part that is output directly
5454
# as text (see DIRECT_OUTPUT_TEXT) should also be in the prompt.
5555
DIRECT_OUTPUT_IN_PROMPT = 'is_final'
@@ -164,11 +164,11 @@ async def _conversation_loop(
164164
if self._trigger_model_mode == AudioTriggerMode.FINAL_TRANSCRIPTION:
165165
await conversation_model.cancel()
166166
await conversation_model.turn()
167-
elif mime_types.is_dataclass(part.mimetype, speech_to_text.StartOfSpeech):
167+
elif speech_events.is_start_of_speech(part):
168168
# User starts talking.
169169
user_not_talking.clear()
170170
await conversation_model.cancel()
171-
elif mime_types.is_dataclass(part.mimetype, speech_to_text.EndOfSpeech):
171+
elif speech_events.is_end_of_speech(part):
172172
# User is done talking, a conversation turn is requested.
173173
user_not_talking.set()
174174
if self._trigger_model_mode == AudioTriggerMode.END_OF_SPEECH:
genai_processors/core/speech_events.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Copyright 2026 DeepMind Technologies Limited. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ==============================================================================
15+
"""Shared speech event dataclasses.
16+
17+
These dataclasses represent speech activity transitions and are used by
18+
multiple processors (e.g. ``Vad``, ``SpeechToText``) to emit structured
19+
events via ``ProcessorPart.from_dataclass()``.
20+
21+
Downstream processors like ``LiveProcessor`` detect these events using
22+
``is_start_of_speech(part)`` and ``is_end_of_speech(part)``, which wrap ``mime_types.is_dataclass(part.mimetype, ...)``.
23+
"""
24+
25+
import dataclasses
26+
27+
import dataclasses_json
28+
from genai_processors import content_api
29+
30+
TRANSCRIPTION_SUBSTREAM_NAME = 'input_transcription'
31+
ENDPOINTING_SUBSTREAM_NAME = 'input_endpointing'
32+
33+
@dataclasses_json.dataclass_json
@dataclasses.dataclass(frozen=True)
class StartOfSpeech:
  """Field-less marker event for the start of speech.

  Emitted when the processor detects that the user has started speaking.
  """
42+
43+
44+
@dataclasses_json.dataclass_json
@dataclasses.dataclass(frozen=True)
class EndOfSpeech:
  """Field-less marker event for the end of speech.

  Emitted when the processor detects that the user has stopped speaking.
  """
53+
54+
55+
def is_start_of_speech(part: content_api.ProcessorPart) -> bool:
  """Tells whether `part` carries a `StartOfSpeech` event.

  Args:
    part: the part whose mimetype is inspected.

  Returns:
    The result of `mime_types.is_dataclass` for the part's mimetype and the
    `StartOfSpeech` dataclass.
  """
  from genai_processors import mime_types

  part_mimetype = part.mimetype
  return mime_types.is_dataclass(part_mimetype, StartOfSpeech)
60+
61+
62+
def is_end_of_speech(part: content_api.ProcessorPart) -> bool:
  """Tells whether `part` carries an `EndOfSpeech` event.

  Args:
    part: the part whose mimetype is inspected.

  Returns:
    The result of `mime_types.is_dataclass` for the part's mimetype and the
    `EndOfSpeech` dataclass.
  """
  from genai_processors import mime_types

  part_mimetype = part.mimetype
  return mime_types.is_dataclass(part_mimetype, EndOfSpeech)

genai_processors/core/speech_to_text.py

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,13 @@
2929

3030
import asyncio
3131
from collections.abc import AsyncIterable
32-
import dataclasses
3332
import time
3433

3534
from absl import logging
36-
import dataclasses_json
3735
from genai_processors import content_api
3836
from genai_processors import processor
3937
from genai_processors import streams
38+
from genai_processors.core import speech_events
4039
from google.cloud import speech_v2
4140
import grpc
4241

@@ -63,24 +62,12 @@
6362
ProcessorPart = content_api.ProcessorPart
6463
ProcessorPartTypes = content_api.ProcessorPartTypes
6564

66-
TRANSCRIPTION_SUBSTREAM_NAME = 'input_transcription'
67-
ENDPOINTING_SUBSTREAM_NAME = 'input_endpointing'
68-
69-
70-
@dataclasses_json.dataclass_json
71-
@dataclasses.dataclass(frozen=True)
72-
class StartOfSpeech:
73-
"""Start of speech event."""
74-
75-
pass
76-
77-
78-
@dataclasses_json.dataclass_json
79-
@dataclasses.dataclass(frozen=True)
80-
class EndOfSpeech:
81-
"""End of speech event."""
82-
83-
pass
65+
# Re-export from speech_events for backward compatibility.
66+
# New code should import these names from speech_events directly.
67+
StartOfSpeech = speech_events.StartOfSpeech
68+
EndOfSpeech = speech_events.EndOfSpeech
69+
TRANSCRIPTION_SUBSTREAM_NAME = speech_events.TRANSCRIPTION_SUBSTREAM_NAME
70+
ENDPOINTING_SUBSTREAM_NAME = speech_events.ENDPOINTING_SUBSTREAM_NAME
8471

8572

8673
class AddSilentPartMaybe(processor.Processor):
@@ -139,10 +126,10 @@ def __init__(
139126
project_id: str,
140127
recognition_config: speech_v2.types.RecognitionConfig,
141128
with_endpointing: bool = True,
142-
substream_endpointing: str = ENDPOINTING_SUBSTREAM_NAME,
129+
substream_endpointing: str = speech_events.ENDPOINTING_SUBSTREAM_NAME,
143130
strict_endpointing: bool = True,
144131
with_interim_results: bool = True,
145-
substream_transcription: str = TRANSCRIPTION_SUBSTREAM_NAME,
132+
substream_transcription: str = speech_events.TRANSCRIPTION_SUBSTREAM_NAME,
146133
passthrough_audio: bool = False,
147134
):
148135
"""Transcribes audio parts using the Cloud Speech API.
@@ -350,10 +337,10 @@ def __init__(
350337
recognition_config: speech_v2.types.RecognitionConfig | None = None,
351338
audio_passthrough: bool = False,
352339
with_endpointing: bool = True,
353-
substream_endpointing: str = ENDPOINTING_SUBSTREAM_NAME,
340+
substream_endpointing: str = speech_events.ENDPOINTING_SUBSTREAM_NAME,
354341
strict_endpointing: bool = True,
355342
with_interim_results: bool = True,
356-
substream_transcription: str = TRANSCRIPTION_SUBSTREAM_NAME,
343+
substream_transcription: str = speech_events.TRANSCRIPTION_SUBSTREAM_NAME,
357344
maintain_connection_active_with_silent_audio: bool = False,
358345
):
359346
"""Initializes the SpeechToText processor.

0 commit comments

Comments
 (0)