google-gemini
diff --git a/‎examples/README.md‎
Lines changed: 11 additions & 8 deletions b/‎examples/README.md‎
Lines changed: 11 additions & 8 deletions
diff --git a/‎examples/vad_cli.py‎
Lines changed: 98 additions & 0 deletions b/‎examples/vad_cli.py‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎genai_processors/core/realtime.py‎
Lines changed: 4 additions & 4 deletions b/‎genai_processors/core/realtime.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎genai_processors/core/speech_events.py‎
Lines changed: 66 additions & 0 deletions b/‎genai_processors/core/speech_events.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎genai_processors/core/speech_to_text.py‎
Lines changed: 11 additions & 24 deletions b/‎genai_processors/core/speech_to_text.py‎
Lines changed: 11 additions & 24 deletions
@@ -34,15 +34,18 @@ We recommend checking the following CLI examples first:
 
 Sub-directories include more complex agents like
 [Research](https://github.com/google-gemini/genai-processors/blob/main/examples/research/README.md)
-(deep research agent) or
+(deep research agent),
 [Commentator](https://github.com/google-gemini/genai-processors/blob/main/examples/live_commentator/README.md)
-(live commentator on a video feed including an interruption mechanism). Check
-the README files in these subdirectories to get an in-depth description of how
-they work and how they were built.
+(live commentator on a video feed including an interruption mechanism), or
+[Live Illustrator](https://github.com/google-gemini/genai-processors/blob/main/examples/live_illustrator/README.md)
+(continuously listens to audio and generates accompanying images triggered by speech).
+Check the README files in these subdirectories to get an in-depth
+description of how they work and how they were built.
 
 Other CLIs like
-[speech_to_text_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/speech_to_text_cli.py)
+[speech_to_text_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/speech_to_text_cli.py),
+[text_to_speech_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/text_to_speech_cli.py),
 or
-[text_to_speech_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/text_to_speech_cli.py)
-are simple wrappers around existing processor and can be used to check that your
-environment is set up correctly, e.g. to use the Google Speech API.
+[vad_cli](https://github.com/google-gemini/genai-processors/blob/main/examples/vad_cli.py)
+are simple wrappers around an existing processor and can be used to check that your
+environment is set up correctly, e.g. to use the Google Speech API or local VAD.
@@ -0,0 +1,98 @@
+# Copyright 2026 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Command Line Interface to test Voice Activity Detection + GenAI Model.
+
+## Setup
+
+To install the dependencies for this script, run:
+
+```
+pip install genai-processors pyaudio
+```
+
+Before running this script, ensure the `GOOGLE_API_KEY` environment
+variable is set to your Gemini API key.
+
+## Run
+
+To run the script:
+
+```shell
+python3 ./vad_cli.py
+```
+"""
+
+import asyncio
+from collections.abc import AsyncIterable
+import os
+import time
+
+from genai_processors import content_api
+from genai_processors import mime_types
+from genai_processors import processor
+from genai_processors.core import audio
+from genai_processors.core import audio_io
+from genai_processors.core import genai_model
+from genai_processors.core import realtime
+from genai_processors.core import speech_events
+from genai_processors.core import text
+from genai_processors.core import vad
+import pyaudio
+
+
+# You need to define the api key in the environment variables.
+# export GOOGLE_API_KEY=...
+GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
+
+
+async def run_vad_cli() -> None:
+  """Runs the VAD + Gemini model in a CLI environment."""
+
+  pya = pyaudio.PyAudio()
+
+  # The base model that we want to query.
+  base_model = genai_model.GenaiModel(
+      api_key=GOOGLE_API_KEY,
+      model_name='gemini-2.5-flash',
+  )
+
+  # The LiveProcessor manages conversation turns. It accumulates all the
+  # content until EndOfSpeech (emitted by vad.Vad()) and then sends the
+  # accumulated audio (which AudioToWav converts into a single .wav part)
+  # to the GenAI model.
+  live_processor = realtime.LiveProcessor(
+      turn_processor=(audio.AudioToWav() + base_model),
+      trigger_model_mode=realtime.AudioTriggerMode.END_OF_SPEECH,
+  )
+
+  pipeline = (
+      audio_io.PyAudioIn(pya)
+      # Aggressive VAD to detect speech activity (less false positives). User
+      # needs to speak loud and clear.
+      + vad.Vad()
+      # Adds status parts to provide feedback to the user.
+      + add_speech_event_status
+      + live_processor
+  )
+
+  print(
+      f'{time.perf_counter():.2f} - VAD + Model pipeline ready: start talking!'
+  )
+  try:
+    await text.terminal_output(pipeline(text.terminal_input()))
+  finally:
+    pya.terminate()
+
+
+if __name__ == '__main__':
+  asyncio.run(run_vad_cli())
@@ -37,7 +37,7 @@
 from genai_processors import mime_types
 from genai_processors import processor
 from genai_processors import streams
-from genai_processors.core import speech_to_text
+from genai_processors.core import speech_events
 from genai_processors.core import window
 
 
@@ -49,7 +49,7 @@
 CONVERSATION_START = '\nThe following is your conversation so far:\n'
 
 # Substream name to output part directly as is without going through the model.
-DIRECT_OUTPUT_SUBSTREAM = speech_to_text.TRANSCRIPTION_SUBSTREAM_NAME
+DIRECT_OUTPUT_SUBSTREAM = speech_events.TRANSCRIPTION_SUBSTREAM_NAME
 # Metadata key that should be set to True if the part that is output directly
 # as text (see DIRECT_OUTPUT_TEXT) should also be in the prompt.
 DIRECT_OUTPUT_IN_PROMPT = 'is_final'
@@ -164,11 +164,11 @@ async def _conversation_loop(
           if self._trigger_model_mode == AudioTriggerMode.FINAL_TRANSCRIPTION:
             await conversation_model.cancel()
             await conversation_model.turn()
-      elif mime_types.is_dataclass(part.mimetype, speech_to_text.StartOfSpeech):
+      elif speech_events.is_start_of_speech(part):
         # User starts talking.
         user_not_talking.clear()
         await conversation_model.cancel()
-      elif mime_types.is_dataclass(part.mimetype, speech_to_text.EndOfSpeech):
+      elif speech_events.is_end_of_speech(part):
         # User is done talking, a conversation turn is requested.
         user_not_talking.set()
         if self._trigger_model_mode == AudioTriggerMode.END_OF_SPEECH:
 
@@ -0,0 +1,66 @@
+# Copyright 2026 DeepMind Technologies Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Shared speech event dataclasses.
+
+These dataclasses represent speech activity transitions and are used by
+multiple processors (e.g. ``Vad``, ``SpeechToText``) to emit structured
+events via ``ProcessorPart.from_dataclass()``.
+
+Downstream processors like ``LiveProcessor`` detect these events using the
+``is_start_of_speech()`` and ``is_end_of_speech()`` helpers defined below.
+"""
+
+import dataclasses
+
+import dataclasses_json
+from genai_processors import content_api
+
+TRANSCRIPTION_SUBSTREAM_NAME = 'input_transcription'
+ENDPOINTING_SUBSTREAM_NAME = 'input_endpointing'
+
+@dataclasses_json.dataclass_json
+@dataclasses.dataclass(frozen=True)
+class StartOfSpeech:
+  """Start of speech event.
+
+  Emitted when the processor detects that the user has started speaking.
+  """
+
+  pass
+
+
+@dataclasses_json.dataclass_json
+@dataclasses.dataclass(frozen=True)
+class EndOfSpeech:
+  """End of speech event.
+
+  Emitted when the processor detects that the user has stopped speaking.
+  """
+
+  pass
+
+
+def is_start_of_speech(part: content_api.ProcessorPart) -> bool:
+  """Returns True if the part is a StartOfSpeech event."""
+  from genai_processors import mime_types
+
+  return mime_types.is_dataclass(part.mimetype, StartOfSpeech)
+
+
+def is_end_of_speech(part: content_api.ProcessorPart) -> bool:
+  """Returns True if the part is an EndOfSpeech event."""
+  from genai_processors import mime_types
+
+  return mime_types.is_dataclass(part.mimetype, EndOfSpeech)
@@ -29,14 +29,13 @@
 
 import asyncio
 from collections.abc import AsyncIterable
-import dataclasses
 import time
 
 from absl import logging
-import dataclasses_json
 from genai_processors import content_api
 from genai_processors import processor
 from genai_processors import streams
+from genai_processors.core import speech_events
 from google.cloud import speech_v2
 import grpc
 
@@ -63,24 +62,12 @@
 ProcessorPart = content_api.ProcessorPart
 ProcessorPartTypes = content_api.ProcessorPartTypes
 
-TRANSCRIPTION_SUBSTREAM_NAME = 'input_transcription'
-ENDPOINTING_SUBSTREAM_NAME = 'input_endpointing'
-
-
-@dataclasses_json.dataclass_json
-@dataclasses.dataclass(frozen=True)
-class StartOfSpeech:
-  """Start of speech event."""
-
-  pass
-
-
-@dataclasses_json.dataclass_json
-@dataclasses.dataclass(frozen=True)
-class EndOfSpeech:
-  """End of speech event."""
-
-  pass
+# Re-exported from speech_events for backward compatibility only.
+# New code should reference these names through speech_events directly.
+StartOfSpeech = speech_events.StartOfSpeech
+EndOfSpeech = speech_events.EndOfSpeech
+TRANSCRIPTION_SUBSTREAM_NAME = speech_events.TRANSCRIPTION_SUBSTREAM_NAME
+ENDPOINTING_SUBSTREAM_NAME = speech_events.ENDPOINTING_SUBSTREAM_NAME
 
 
 class AddSilentPartMaybe(processor.Processor):
@@ -139,10 +126,10 @@ def __init__(
       project_id: str,
       recognition_config: speech_v2.types.RecognitionConfig,
       with_endpointing: bool = True,
-      substream_endpointing: str = ENDPOINTING_SUBSTREAM_NAME,
+      substream_endpointing: str = speech_events.ENDPOINTING_SUBSTREAM_NAME,
       strict_endpointing: bool = True,
       with_interim_results: bool = True,
-      substream_transcription: str = TRANSCRIPTION_SUBSTREAM_NAME,
+      substream_transcription: str = speech_events.TRANSCRIPTION_SUBSTREAM_NAME,
       passthrough_audio: bool = False,
   ):
     """Transcribes audio parts using the Cloud Speech API.
@@ -350,10 +337,10 @@ def __init__(
       recognition_config: speech_v2.types.RecognitionConfig | None = None,
       audio_passthrough: bool = False,
       with_endpointing: bool = True,
-      substream_endpointing: str = ENDPOINTING_SUBSTREAM_NAME,
+      substream_endpointing: str = speech_events.ENDPOINTING_SUBSTREAM_NAME,
       strict_endpointing: bool = True,
       with_interim_results: bool = True,
-      substream_transcription: str = TRANSCRIPTION_SUBSTREAM_NAME,
+      substream_transcription: str = speech_events.TRANSCRIPTION_SUBSTREAM_NAME,
       maintain_connection_active_with_silent_audio: bool = False,
   ):
     """Initializes the SpeechToText processor.