Skip to content

Commit f6fe14f

Browse files
authored
Merge pull request #340 from mucsi96/claude/loving-mayer-r6nz60
Add Google Gemini TTS as switchable audio generation model
2 parents f7e049d + 69c35dd commit f6fe14f

17 files changed

Lines changed: 401 additions & 13 deletions

File tree

AGENTS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ This is a language learning application that uses spaced repetition to help user
5959

6060
### Key Technologies
6161
- **Spaced Repetition**: Uses FSRS (Free Spaced Repetition Scheduler) algorithm via ts-fsrs library
62-
- **AI Integration**: OpenAI GPT-4.1 for translations, ElevenLabs Voices for audio, Google Gemini API for example images
62+
- **AI Integration**: OpenAI GPT-4.1 for translations, ElevenLabs Voices and Google Gemini TTS for audio, Google Gemini API for example images
6363
- **Cloud Services**: Local file system for PDFs/assets storage, Azure AD for authentication
6464
- **PDF Processing**: Apache PDFBox for text extraction and document processing
6565

client/src/app/environment/environment.config.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { InjectionToken } from '@angular/core';
22

3-
export type ModelProvider = 'openai' | 'anthropic' | 'google';
3+
export type ModelProvider = 'openai' | 'anthropic' | 'google' | 'elevenlabs';
44

55
export interface ChatModelInfo {
66
modelName: string;
@@ -17,13 +17,15 @@ export interface AudioModel {
1717
id: string;
1818
displayName: string;
1919
isDefault: boolean;
20+
provider: ModelProvider;
2021
}
2122

2223
export interface Voice {
2324
id: string;
2425
displayName: string;
2526
languages: { name: string }[];
2627
category: 'premade' | 'cloned' | 'generated' | 'professional' | null;
28+
provider: ModelProvider;
2729
}
2830

2931
export interface SupportedLanguage {

client/src/app/voice-config/add-voice-dialog/add-voice-dialog.component.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ <h2 mat-dialog-title>Add Voice Configuration</h2>
4343
<mat-select
4444
[formField]="voiceForm.model"
4545
>
46-
@for (model of models; track model.id) {
46+
@for (model of availableModels(); track model.id) {
4747
<mat-option [value]="model.id">
4848
{{ model.displayName }}
4949
@if (model.isDefault) {

client/src/app/voice-config/add-voice-dialog/add-voice-dialog.component.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,18 @@ export class AddVoiceDialogComponent {
5353
});
5454
readonly voiceForm = form(this.formModel);
5555

56-
readonly models = this.data.audioModels;
56+
/**
 * Audio models offered in the model dropdown.
 *
 * While no voice is selected in the form, every configured audio model is
 * returned; once a voice is chosen, only the models whose `provider` equals
 * the selected voice's `provider` remain.
 */
readonly availableModels = computed(() => {
57+
const voice = this.formModel().voice;
58+
if (!voice) return this.data.audioModels;
59+
60+
return this.data.audioModels.filter(
61+
(model) => model.provider === voice.provider
62+
);
63+
});
5764

58-
private getDefaultModelId(): string {
59-
const defaultModel = this.data.audioModels.find((m) => m.isDefault);
60-
return defaultModel?.id ?? this.data.audioModels[0]?.id ?? '';
65+
private getDefaultModelId(models = this.data.audioModels): string {
66+
const defaultModel = models.find((m) => m.isDefault);
67+
return defaultModel?.id ?? models[0]?.id ?? '';
6168
}
6269

6370
readonly filteredVoices = computed(() => this.data.availableVoices);
@@ -81,6 +88,7 @@ export class AddVoiceDialogComponent {
8188
this.formModel.update((m) => ({
8289
...m,
8390
language: langs.length === 1 ? langs[0] : '',
91+
model: this.getDefaultModelId(this.availableModels()),
8492
displayName: m.voice?.displayName ?? '',
8593
}));
8694
}

mock_google_ai_server/src/audioGeneration.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
/** Samples per second of the generated PCM stream. */
const SAMPLE_RATE = 24000;
/** Length of each beep in seconds. */
const BEEP_DURATION_SECONDS = 0.3;
/** Peak sample value; comfortably inside the signed 16-bit range. */
const BEEP_AMPLITUDE = 12000;
/** Size in bytes of one signed 16-bit sample. */
const BYTES_PER_SAMPLE = 2;
/** Prompts mentioning "Hungarian" (any case) get the Hungarian sample. */
const HUNGARIAN_PROMPT_PATTERN = /Hungarian/i;

/**
 * Builds a short mono sine beep as raw 16-bit PCM, matching the format
 * Gemini TTS returns.
 *
 * Samples are written explicitly as little-endian. Copying the backing buffer
 * of an `Int16Array` would emit host byte order instead, which is only
 * little-endian on little-endian machines.
 *
 * @param frequency Tone frequency in Hz.
 * @returns The PCM bytes encoded as base64.
 */
const generatePcmBeep = (frequency: number): string => {
  const sampleCount = Math.floor(SAMPLE_RATE * BEEP_DURATION_SECONDS);
  const pcm = Buffer.alloc(sampleCount * BYTES_PER_SAMPLE);

  for (let i = 0; i < sampleCount; i++) {
    const sample = Math.round(
      Math.sin((2 * Math.PI * frequency * i) / SAMPLE_RATE) * BEEP_AMPLITUDE
    );
    pcm.writeInt16LE(sample, i * BYTES_PER_SAMPLE);
  }

  return pcm.toString('base64');
};

/** Canned base64 PCM beeps, one distinguishable tone per language. */
export const AUDIO_SAMPLES = {
  german: generatePcmBeep(440),
  hungarian: generatePcmBeep(660),
};

/**
 * Mock text-to-speech backend: returns a canned beep per request and counts
 * how many requests it has served since the last reset.
 */
export class AudioGenerationHandler {
  private audioCallCounter = 0;

  /** Sets the call counter back to zero. */
  reset(): void {
    this.audioCallCounter = 0;
  }

  /**
   * Returns the canned sample for the language detected in the prompt and
   * increments the call counter.
   *
   * @param prompt Text sent for synthesis; the Hungarian sample is chosen when
   *   it contains "Hungarian" (case-insensitive), the German one otherwise.
   * @param voiceName Requested voice; only used in the log line.
   * @returns Base64-encoded PCM audio.
   */
  generateAudio(prompt: string, voiceName?: string): string {
    this.audioCallCounter++;

    const isHungarian = HUNGARIAN_PROMPT_PATTERN.test(prompt);
    const audioBase64 = isHungarian ? AUDIO_SAMPLES.hungarian : AUDIO_SAMPLES.german;

    console.log(
      `Generated TTS audio for prompt: "${prompt}" (language: ${isHungarian ? 'hu' : 'de'}, voice: ${voiceName ?? 'unknown'})`
    );

    return audioBase64;
  }

  /** Number of `generateAudio` calls since construction or the last `reset`. */
  getCallCount(): number {
    return this.audioCallCounter;
  }
}

mock_google_ai_server/src/index.ts

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import express from 'express';
22
import { ImageGenerationHandler } from './imageGeneration';
33
import { ChatHandler } from './chatHandler';
4+
import { AudioGenerationHandler } from './audioGeneration';
45

56
const app = express();
67
const imageHandler = new ImageGenerationHandler();
78
const chatHandler = new ChatHandler();
9+
const audioHandler = new AudioGenerationHandler();
810

911
app.use(express.json({ limit: '25mb' }));
1012

@@ -20,7 +22,12 @@ app.use((req, res, next) => {
2022
app.post('/reset', (req, res) => {
2123
imageHandler.reset();
2224
chatHandler.reset();
23-
res.status(200).json({ status: 'ok', message: 'Image counter reset to 0' });
25+
audioHandler.reset();
26+
res.status(200).json({ status: 'ok', message: 'Mock state reset' });
27+
});
28+
29+
app.get('/stats', (req, res) => {
30+
res.status(200).json({ audioCallCount: audioHandler.getCallCount() });
2431
});
2532

2633
app.post('/configure', (req, res) => {
@@ -60,6 +67,38 @@ app.post(
6067
}
6168
);
6269

// Mock of the Gemini TTS `generateContent` endpoint: replies with a canned PCM
// beep as base64 inline data, shaped like a single-candidate response.
// NOTE(review): in an Express string path a `:name` segment is normally parsed
// as a route parameter, so this path may match more than the literal
// `:generateContent` suffix — confirm, or use a RegExp like the generic route.
app.post(
  '/v1beta/models/gemini-3.1-flash-tts-preview:generateContent',
  (req, res) => {
    // Read the prompt defensively: a malformed body is a client error and
    // should not surface as a 500 through a thrown TypeError.
    const prompt: unknown = req.body?.contents?.[0]?.parts?.[0]?.text;
    if (typeof prompt !== 'string') {
      res.status(400).json({
        error: { message: 'Request must include contents[0].parts[0].text' },
      });
      return;
    }

    try {
      const requestedVoice: unknown =
        req.body.generationConfig?.speechConfig?.voiceConfig
          ?.prebuiltVoiceConfig?.voiceName;
      // The voice is optional; anything that is not a string is treated as absent.
      const voiceName =
        typeof requestedVoice === 'string' ? requestedVoice : undefined;
      const audio = audioHandler.generateAudio(prompt, voiceName);

      res.status(200).json({
        candidates: [
          {
            content: {
              parts: [
                {
                  inlineData: {
                    mimeType: 'audio/L16;codec=pcm;rate=24000',
                    data: audio,
                  },
                },
              ],
            },
          },
        ],
      });
    } catch (error) {
      console.error('Audio generation error:', error);
      res.status(500).json({ error: { message: 'Audio generation failed' } });
    }
  }
);
101+
63102
app.post(/\/v1beta\/models\/([^/]+):generateContent/, async (req, res) => {
64103
try {
65104
const model = req.params[0];

server/src/main/java/io/github/mucsi96/learnlanguage/config/ModelPricingConfig.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ public record AudioModelPricing(BigDecimal perThousandCharacters) {}
4848
private static final Map<String, AudioModelPricing> AUDIO_MODEL_PRICING = Map.of(
4949
// ElevenLabs (approximately $0.20 per 1000 characters)
5050
"eleven_turbo_v2_5", new AudioModelPricing(new BigDecimal("0.20")),
51-
"eleven_v3", new AudioModelPricing(new BigDecimal("0.20"))
51+
"eleven_v3", new AudioModelPricing(new BigDecimal("0.20")),
52+
// Gemini TTS is token-priced; approximated per 1000 characters
53+
"gemini-3.1-flash-tts-preview", new AudioModelPricing(new BigDecimal("0.02"))
5254
);
5355

5456
public ChatModelPricing getChatModelPricing(String modelName) {

server/src/main/java/io/github/mucsi96/learnlanguage/controller/EnvironmentController.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.util.Arrays;
44
import java.util.List;
55
import java.util.Map;
6+
import java.util.stream.Stream;
67

78
import org.springframework.beans.factory.annotation.Value;
89
import org.springframework.web.bind.annotation.GetMapping;
@@ -20,6 +21,7 @@
2021
import io.github.mucsi96.learnlanguage.service.AudioSettingService;
2122
import io.github.mucsi96.learnlanguage.service.ChatModelSettingService;
2223
import io.github.mucsi96.learnlanguage.service.ElevenLabsAudioService;
24+
import io.github.mucsi96.learnlanguage.service.GeminiAudioService;
2325
import io.github.mucsi96.learnlanguage.service.ImageModelSettingService;
2426
import io.github.mucsi96.learnlanguage.service.ImageSettingService;
2527
import io.github.mucsi96.learnlanguage.service.RateLimitSettingService;
@@ -30,6 +32,7 @@
3032
public class EnvironmentController {
3133
private final AudioService audioService;
3234
private final ElevenLabsAudioService elevenLabsAudioService;
35+
private final GeminiAudioService geminiAudioService;
3336
private final ChatModelSettingService chatModelSettingService;
3437
private final ImageModelSettingService imageModelSettingService;
3538
private final ImageSettingService imageSettingService;
@@ -101,7 +104,9 @@ public ConfigResponse getConfig() {
101104
.toList(),
102105
imageModelSettingService.getImageModelsWithSettings(),
103106
audioService.getAvailableModels(),
104-
elevenLabsAudioService.getVoices(),
107+
Stream.concat(
108+
elevenLabsAudioService.getVoices().stream(),
109+
geminiAudioService.getVoices().stream()).toList(),
105110
SUPPORTED_LANGUAGES,
106111
enabledModelsByOperation,
107112
primaryModelByOperation,

server/src/main/java/io/github/mucsi96/learnlanguage/model/AudioModelResponse.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ public class AudioModelResponse {
1313
private String id;
1414
private String displayName;
1515
private boolean isDefault;
16+
private ModelProvider provider;
1617
}

server/src/main/java/io/github/mucsi96/learnlanguage/model/ModelProvider.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
public enum ModelProvider {
1111
OPENAI("openai"),
1212
ANTHROPIC("anthropic"),
13-
GOOGLE("google");
13+
GOOGLE("google"),
14+
ELEVENLABS("elevenlabs");
1415

1516
private final String code;
1617

0 commit comments

Comments
 (0)