
Commit 51af345

update google tts options (#104)
* update google tts options
* wip
1 parent 9377471 commit 51af345

5 files changed: +48 -4 lines changed

fern/docs/pages/features/tts-streaming.mdx

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ Finally, as of release 0.9.3 we support the following TTS vendors for streaming:
 - Elevenlabs
 - Cartesia
 - Rimelabs
+- Google

 We are adding additional vendors all the time, so check back with us if you are looking for support from a different vendor.
fern/docs/pages/features/using-openai-stt.mdx

Lines changed: 16 additions & 2 deletions

@@ -28,8 +28,9 @@ To begin with, here are the possible options that you use with OpenAI STT:
   prompt: 'string',
   turn_detection: {
     type: 'server_vad', // or 'semantic_vad' or 'none'
-    prefix_padding_ms: 300,
-    silence_duration_ms: 800
+    eagerness: 'medium', // only for semantic_vad: 'low', 'medium', 'high', or 'auto'
+    prefix_padding_ms: 300, // only for server_vad
+    silence_duration_ms: 800 // only for server_vad
   },
   promptTemplates: {
     hintsTemplate: 'string',
@@ -39,6 +40,19 @@ To begin with, here are the possible options that you use with OpenAI STT:
   }
 ```
 
+### Turn detection options
+
+The `turn_detection` object controls how OpenAI detects when a speaker has finished talking.
+
+**For `semantic_vad` type:**
+- `eagerness`: Controls how eager the model is to determine the end of an utterance. Possible values:
+  - `auto` (default): Equivalent to `medium`
+  - `low`: Allows the user more time to speak, resulting in larger transcript chunks
+  - `medium`: Balanced approach
+  - `high`: Returns transcription events faster with smaller chunks
+
+The `eagerness` setting affects how audio is chunked even in transcription mode. Use `high` if you want faster transcription events, or `low` if you prefer larger, more complete transcript chunks.
+
 In this article we want to explore the various ways to construct a prompt for OpenAI STT.
 
 ## Providing hints
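Put together, the new `turn_detection` options added above might be used as follows. This is a minimal sketch: the enclosing `gather` verb and `recognizer` wrapper are illustrative assumptions, while the `turn_detection` fields themselves come from the snippet in the diff.

```javascript
// Sketch only: the gather/recognizer wrapper is an assumed shape;
// the turn_detection fields are those documented in the diff above.
const gather = {
  verb: 'gather',
  input: ['speech'],
  recognizer: {
    vendor: 'openai',
    turn_detection: {
      type: 'semantic_vad',
      eagerness: 'high'        // faster transcription events, smaller chunks
    }
  }
};

// With server_vad, eagerness does not apply; use the timing knobs instead.
const serverVadAlternative = {
  type: 'server_vad',
  prefix_padding_ms: 300,      // padding before detected speech, in ms
  silence_duration_ms: 800     // silence that ends the utterance, in ms
};
```

Per the notes above, `eagerness` only applies to `semantic_vad`, and the two `_ms` timing options only apply to `server_vad`.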

fern/docs/pages/verbs/recognizer.mdx

Lines changed: 8 additions & 0 deletions

@@ -1108,6 +1108,14 @@ subtitle: A **property** that can be used in verbs like [`gather`](./gather) and
 <ParamField path="transcription_config.punctuation_overrides.sensitivity" type="number" required={false}>
 </ParamField>
 
+<ParamField path="transcription_config.conversation_config" type="object" required={false}>
+Configuration for conversation-based transcription features.
+</ParamField>
+
+<ParamField path="transcription_config.conversation_config.end_of_utterance_silence_trigger" type="number" required={false}>
+Duration of silence (in seconds) that triggers an end-of-utterance event. This controls how long the system waits after the speaker stops talking before determining that the utterance is complete. See [Speechmatics turn detection docs](https://docs.speechmatics.com/speech-to-text/realtime/turn-detection#end-of-utterance) for details.
+</ParamField>
+
 <ParamField path="sm_audioFilteringConfig" type="object" required={false}>
 Audio filtering configuration.
 </ParamField>
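The new Speechmatics property might be used like this. A minimal sketch: the surrounding verb shape is an assumption for illustration, while the property path follows the ParamFields added above.

```javascript
// Illustrative sketch: the gather/recognizer wrapper is assumed;
// the transcription_config path matches the ParamFields above.
const gather = {
  verb: 'gather',
  input: ['speech'],
  recognizer: {
    vendor: 'speechmatics',
    transcription_config: {
      conversation_config: {
        // seconds of silence before an end-of-utterance event fires
        end_of_utterance_silence_trigger: 0.8
      }
    }
  }
};
```

A lower value ends utterances sooner (snappier turns); a higher value tolerates longer pauses mid-sentence.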

fern/docs/pages/verbs/say.mdx

Lines changed: 6 additions & 2 deletions

@@ -45,11 +45,15 @@ subtitle: Generate text-to-speech audio.
 </ParamField>
 
 <ParamField path="synthesizer.voice" type="string" required={false}>
-Voice to use.
-Note that the voice list differs depending on whether you are using AWS or Google.
+Voice to use.
+Note that the voice list differs depending on whether you are using AWS or Google.
 Defaults to application setting, if provided.
 </ParamField>
 
+<ParamField path="instructions" type="string" required={false}>
+A prompt sent to the TTS vendor to guide how the audio should be generated. Use this to specify the desired tone, emotion, speaking style, or context for the synthesized speech. This parameter is only supported by vendors that offer prompt-based TTS generation (e.g., Google Gemini TTS).
+</ParamField>
+
 <ParamField path="text" type="string" required={false}>
 Text to speak; may contain SSML tags.
 </ParamField>
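The new `instructions` property added above might appear in a `say` verb like this. A sketch only: the text, instructions wording, and voice value are illustrative placeholders, not from this diff.

```javascript
// Illustrative sketch of the instructions property documented above.
// The voice value is an assumed placeholder; voice lists differ by vendor.
const say = {
  verb: 'say',
  text: 'Your appointment is confirmed for tomorrow at 3pm.',
  instructions: 'Speak warmly and reassuringly, like a friendly receptionist.',
  synthesizer: {
    vendor: 'google',
    voice: 'some-voice-name' // placeholder
  }
};
```

As noted above, vendors that do not support prompt-based TTS generation will ignore `instructions`.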

fern/docs/pages/verbs/synthesizer.mdx

Lines changed: 17 additions & 0 deletions

@@ -37,6 +37,23 @@ subtitle: A **property** that can be used in a `say` verb to override the applic
 
 <AccordionGroup>
 
+<Accordion title="google">
+<ParamField path="model" type="string" required={false}>
+The model to use for text-to-speech synthesis. When specified, this enables Gemini TTS. Example: `gemini-2.5-flash-preview-tts`.
+</ParamField>
+<ParamField path="apiMode" type="string" required={false}>
+Controls which Google TTS API mode to use. Possible values:
+- `tts`: Standard Google Cloud TTS voices (default).
+- `live`: HD voices using streaming mode for higher-quality output.
+- `gemini`: Gemini TTS for AI-powered speech synthesis.
+
+The mode is selected automatically based on configuration: if `options.model` is specified or a `model_id` is configured in the speech credentials, Gemini TTS is used; if an HD voice is selected, `live` mode is used; otherwise, standard `tts` mode is used.
+</ParamField>
+<ParamField path="prompt" type="string" required={false}>
+A prompt sent to the TTS model to guide how the audio should be generated. Use this to specify the desired tone, emotion, speaking style, or context for the synthesized speech. This parameter is only applicable when using Gemini TTS (`apiMode: "gemini"` or when `model` is specified).
+</ParamField>
+</Accordion>
+
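The google options above might be passed through a `say` verb's synthesizer like this. A sketch under stated assumptions: the surrounding verb shape and the `options` nesting are illustrative; the `model` and `prompt` names come from the ParamFields above.

```javascript
// Illustrative sketch of the google synthesizer options documented above;
// the enclosing say verb and options nesting are assumed shapes.
const say = {
  verb: 'say',
  text: 'Thanks for calling! How can I help you today?',
  synthesizer: {
    vendor: 'google',
    options: {
      // per the docs above, specifying a model enables Gemini TTS,
      // so apiMode need not be set explicitly here
      model: 'gemini-2.5-flash-preview-tts',
      prompt: 'Read this in an upbeat, welcoming tone.'
    }
  }
};
```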
 <Accordion title="cartesia">
 <ParamField path="voice_mode" type="string" required={false}>
 `embedding` or `id` (see [Cartesia docs](https://docs.cartesia.ai/api-reference/tts/bytes#request.body.voice))
