Skip to content

Commit 5681734

Browse files
milaGGL, ryanwilson, daymxn, ncooke3
authored
[AI] Support SpeechConfig in Text-To-Speech (#16226)
Co-authored-by: Ryan Wilson <wilsonryan@google.com>
Co-authored-by: Daymon <daymxn@google.com>
Co-authored-by: Nick Cooke <36927374+ncooke3@users.noreply.github.com>
Co-authored-by: Daymon <17409137+daymxn@users.noreply.github.com>
Co-authored-by: Nick Cooke <nickcooke@google.com>
1 parent e998845 commit 5681734

19 files changed

Lines changed: 606 additions & 37 deletions

.github/workflows/sdk.ai.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ jobs:
3434
with:
3535
target: ${{ matrix.target }}
3636
setup_command: scripts/update_vertexai_responses.sh
37+
env_vars: '{"FIREBASE_APP_CHECK_BRANCH": "main"}'
3738

3839
testapp-integration:
3940
if: (github.repository == 'Firebase/firebase-ios-sdk' && github.event_name == 'schedule') || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false)
@@ -69,6 +70,7 @@ jobs:
6970
TEST_RUNNER_VTXIntegrationImagen: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
7071
FIREBASECI_USE_LATEST_GOOGLEAPPMEASUREMENT: 1
7172
secrets_passphrase: ${{ secrets.GHASecretsGPGPassphrase1 }}
73+
FIREBASE_APP_CHECK_BRANCH: main
7274
steps:
7375
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
7476
- uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
@@ -108,6 +110,7 @@ jobs:
108110
runs-on: ${{ matrix.os }}
109111
env:
110112
BRANCH_NAME: ${{ github.head_ref || github.ref_name || 'main' }}
113+
FIREBASE_APP_CHECK_BRANCH: main
111114
steps:
112115
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
113116
- name: Xcode

FirebaseAI/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
simplify App Check setup. (#16185)
44
- [fixed] Fixed a namespace collision with the new
55
`FoundationModels.LanguageModelSession.Error` type introduced in Xcode 27 Beta. (#16252)
6+
- [feature] Added support for `SpeechConfig` in `GenerationConfig`, and `MultiSpeakerVoiceConfig`
7+
in `SpeechConfig`. (#16226)
68

79
# 12.14.0
810
- [fixed] Fixed an issue in `GenerativeModelSession` where `String` generation

FirebaseAI/Sources/GenerationConfig.swift

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ public struct GenerationConfig: Sendable, Equatable {
6161
/// Configuration options for generating images.
6262
var imageConfig: ImageConfig?
6363

64+
/// Configuration for controlling the voice of the model during conversation.
65+
var speechConfig: ProtoSpeechConfig?
66+
6467
/// Creates a new `GenerationConfig` value.
6568
///
6669
/// See the
@@ -164,13 +167,16 @@ public struct GenerationConfig: Sendable, Equatable {
164167
/// > backwards-incompatible ways.
165168
/// - thinkingConfig: Configuration for controlling the "thinking" behavior of compatible Gemini
166169
/// models; see ``ThinkingConfig`` for more details.
170+
/// - speechConfig: Configuration for controlling the voice of the model during conversation;
171+
/// see ``SpeechConfig`` for more details.
167172
/// - imageConfig: Configuration options for generating images.
168173
public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
169174
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
170175
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
171176
stopSequences: [String]? = nil, responseMIMEType: String? = nil,
172177
responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil,
173-
thinkingConfig: ThinkingConfig? = nil, imageConfig: ImageConfig? = nil) {
178+
thinkingConfig: ThinkingConfig? = nil, imageConfig: ImageConfig? = nil,
179+
speechConfig: SpeechConfig? = nil) {
174180
// Explicit init because otherwise if we re-arrange the above variables it changes the API
175181
// surface.
176182
self.temperature = temperature
@@ -187,13 +193,14 @@ public struct GenerationConfig: Sendable, Equatable {
187193
self.responseModalities = responseModalities
188194
self.thinkingConfig = thinkingConfig
189195
self.imageConfig = imageConfig
196+
self.speechConfig = speechConfig?.speechConfig
190197
}
191198

192199
init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil, candidateCount: Int? = nil,
193200
maxOutputTokens: Int? = nil, presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
194201
stopSequences: [String]? = nil, responseMIMEType: String, responseJSONSchema: JSONObject,
195202
responseModalities: [ResponseModality]? = nil, thinkingConfig: ThinkingConfig? = nil,
196-
imageConfig: ImageConfig? = nil) {
203+
imageConfig: ImageConfig? = nil, speechConfig: SpeechConfig? = nil) {
197204
self.temperature = temperature
198205
self.topP = topP
199206
self.topK = topK
@@ -207,6 +214,7 @@ public struct GenerationConfig: Sendable, Equatable {
207214
self.responseJSONSchema = responseJSONSchema
208215
self.responseModalities = responseModalities
209216
self.thinkingConfig = thinkingConfig
217+
self.speechConfig = speechConfig?.speechConfig
210218
self.imageConfig = imageConfig
211219
}
212220

@@ -246,6 +254,7 @@ public struct GenerationConfig: Sendable, Equatable {
246254
config.responseModalities = overrideConfig.responseModalities ?? config.responseModalities
247255
config.thinkingConfig = overrideConfig.thinkingConfig ?? config.thinkingConfig
248256
config.imageConfig = overrideConfig.imageConfig ?? config.imageConfig
257+
config.speechConfig = overrideConfig.speechConfig ?? config.speechConfig
249258

250259
// 5. Handle Schema mutual exclusivity with precedence for `responseJSONSchema`.
251260
if let responseJSONSchema = overrideConfig.responseJSONSchema {
@@ -278,5 +287,6 @@ extension GenerationConfig: Encodable {
278287
case responseModalities
279288
case thinkingConfig
280289
case imageConfig
290+
case speechConfig
281291
}
282292
}

FirebaseAI/Sources/Types/Internal/Live/BidiGenerationConfig.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@ struct BidiGenerationConfig: Encodable, Sendable {
2525
let presencePenalty: Float?
2626
let frequencyPenalty: Float?
2727
let responseModalities: [ResponseModality]?
28-
let speechConfig: BidiSpeechConfig?
28+
let speechConfig: ProtoSpeechConfig?
2929

3030
init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
3131
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
3232
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
3333
responseModalities: [ResponseModality]? = nil,
34-
speechConfig: BidiSpeechConfig? = nil) {
34+
speechConfig: ProtoSpeechConfig? = nil) {
3535
self.temperature = temperature
3636
self.topP = topP
3737
self.topK = topK
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Copyright 2026 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
struct ProtoMultiSpeakerVoiceConfig: Encodable, Sendable, Equatable {
18+
let speakerVoiceConfigs: [ProtoSpeakerVoiceConfig]
19+
20+
init(speakerVoiceConfigs: [ProtoSpeakerVoiceConfig]) {
21+
self.speakerVoiceConfigs = speakerVoiceConfigs
22+
}
23+
}

FirebaseAI/Sources/Types/Internal/Live/BidiSpeechConfig.swift renamed to FirebaseAI/Sources/Types/Internal/Shared/ProtoSpeakerVoiceConfig.swift

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2025 Google LLC
1+
// Copyright 2026 Google LLC
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -14,17 +14,12 @@
1414

1515
import Foundation
1616

17-
/// Speech generation config.
18-
@available(watchOS, unavailable)
19-
struct BidiSpeechConfig: Encodable, Sendable {
20-
/// The configuration for the speaker to use.
21-
let voiceConfig: VoiceConfig
17+
struct ProtoSpeakerVoiceConfig: Encodable, Sendable, Equatable {
18+
let speaker: String
19+
let voiceConfig: ProtoVoiceConfig
2220

23-
/// Language code (ISO 639. e.g. en-US) for the speech synthesization.
24-
let languageCode: String?
25-
26-
init(voiceConfig: VoiceConfig, languageCode: String?) {
21+
init(speaker: String, voiceConfig: ProtoVoiceConfig) {
22+
self.speaker = speaker
2723
self.voiceConfig = voiceConfig
28-
self.languageCode = languageCode
2924
}
3025
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright 2026 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Speech generation config.
18+
struct ProtoSpeechConfig: Encodable, Sendable, Equatable {
19+
/// The configuration for the speaker to use.
20+
let voiceConfig: ProtoVoiceConfig?
21+
22+
/// The configuration for the multi-speaker setup.
23+
let multiSpeakerVoiceConfig: ProtoMultiSpeakerVoiceConfig?
24+
25+
/// Language code (BCP-47, e.g., en-US) for speech synthesis.
26+
let languageCode: String?
27+
28+
init(voiceConfig: ProtoVoiceConfig, languageCode: String?) {
29+
self.voiceConfig = voiceConfig
30+
multiSpeakerVoiceConfig = nil
31+
self.languageCode = languageCode
32+
}
33+
34+
init(multiSpeakerVoiceConfig: ProtoMultiSpeakerVoiceConfig, languageCode: String?) {
35+
voiceConfig = nil
36+
self.multiSpeakerVoiceConfig = multiSpeakerVoiceConfig
37+
self.languageCode = languageCode
38+
}
39+
}

FirebaseAI/Sources/Types/Internal/Live/VoiceConfig.swift renamed to FirebaseAI/Sources/Types/Internal/Shared/ProtoVoiceConfig.swift

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,19 @@
1515
import Foundation
1616

1717
/// Configuration for the speaker to use.
18-
@available(watchOS, unavailable)
19-
enum VoiceConfig {
18+
enum ProtoVoiceConfig: Sendable, Equatable {
2019
/// Configuration for the prebuilt voice to use.
21-
case prebuiltVoiceConfig(PrebuiltVoiceConfig)
20+
case prebuiltVoiceConfig(ProtoPrebuiltVoiceConfig)
2221

2322
/// Configuration for the custom voice to use.
24-
case customVoiceConfig(CustomVoiceConfig)
23+
case customVoiceConfig(ProtoCustomVoiceConfig)
2524
}
2625

2726
/// The configuration for the prebuilt speaker to use.
2827
///
2928
/// Not just a string on the parent proto, because there'll likely be a lot
3029
/// more options here.
31-
@available(watchOS, unavailable)
32-
struct PrebuiltVoiceConfig: Encodable, Sendable {
30+
struct ProtoPrebuiltVoiceConfig: Encodable, Sendable, Equatable {
3331
/// The name of the preset voice to use.
3432
let voiceName: String
3533

@@ -39,8 +37,7 @@ struct PrebuiltVoiceConfig: Encodable, Sendable {
3937
}
4038

4139
/// The configuration for the custom voice to use.
42-
@available(watchOS, unavailable)
43-
struct CustomVoiceConfig: Encodable, Sendable {
40+
struct ProtoCustomVoiceConfig: Encodable, Sendable, Equatable {
4441
/// The sample of the custom voice, in pcm16 s16e format.
4542
let customVoiceSample: Data
4643

@@ -51,8 +48,7 @@ struct CustomVoiceConfig: Encodable, Sendable {
5148

5249
// MARK: - Encodable conformance
5350

54-
@available(watchOS, unavailable)
55-
extension VoiceConfig: Encodable {
51+
extension ProtoVoiceConfig: Encodable {
5652
enum CodingKeys: CodingKey {
5753
case prebuiltVoiceConfig
5854
case customVoiceConfig
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Copyright 2026 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Configuration for a multi-speaker audio generation setup.
18+
///
19+
/// Enables the model to generate audio containing multiple distinct speakers, alternating voices
20+
/// dynamically based on speaker labels in the prompt.
21+
///
22+
/// > Warning: Multi-speaker configurations are not currently supported by the Live API (e.g.,
23+
/// > `LiveGenerationConfig`).
24+
public struct MultiSpeakerVoiceConfig: Sendable {
25+
let multiSpeakerVoiceConfig: ProtoMultiSpeakerVoiceConfig
26+
27+
init(_ multiSpeakerVoiceConfig: ProtoMultiSpeakerVoiceConfig) {
28+
self.multiSpeakerVoiceConfig = multiSpeakerVoiceConfig
29+
}
30+
31+
/// Creates a configuration for the multi-speaker setup.
32+
///
33+
/// - Parameters:
34+
/// - speakerVoiceConfigs: A list of voice configurations for the participating speakers.
35+
/// Currently, the backend requires exactly **two** speaker voice configurations.
36+
public init(speakerVoiceConfigs: [SpeakerVoiceConfig]) {
37+
self.init(
38+
ProtoMultiSpeakerVoiceConfig(
39+
speakerVoiceConfigs: speakerVoiceConfigs.map(\.speakerVoiceConfig)
40+
)
41+
)
42+
}
43+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Copyright 2026 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// Configures a speaker with a unique name/identifier and a specific voice.
18+
public struct SpeakerVoiceConfig: Sendable {
19+
let speakerVoiceConfig: ProtoSpeakerVoiceConfig
20+
21+
init(_ speakerVoiceConfig: ProtoSpeakerVoiceConfig) {
22+
self.speakerVoiceConfig = speakerVoiceConfig
23+
}
24+
25+
/// Creates a configuration for a speaker using a voice name.
26+
///
27+
/// - Parameters:
28+
/// - speaker: The unique name/identifier of the speaker (e.g., `"Alice"`).
29+
/// - voiceName: The name of the preset voice to assign to this speaker.
30+
///
31+
/// Find the list of supported voices for:
32+
/// - [Gemini Developer API](https://ai.google.dev/gemini-api/docs/speech-generation)
33+
/// - [Vertex AI Gemini API](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts)
34+
// TODO(b/522397979): Update links to point to Firebase when they're live
35+
public init(speaker: String, voiceName: String) {
36+
self.init(
37+
ProtoSpeakerVoiceConfig(
38+
speaker: speaker,
39+
voiceConfig: .prebuiltVoiceConfig(ProtoPrebuiltVoiceConfig(voiceName: voiceName))
40+
)
41+
)
42+
}
43+
}

0 commit comments

Comments
 (0)