Add ElevenLabs TTS provider integration (#2004)

jelveh · web-flow · commit bb752a5bb5dd · 2025-11-21T17:31:21.000-08:00
* Add ElevenLabs TTS provider integration

Adds ElevenLabs text-to-speech support in the backend and frontend: adds `ElevenLabsTTSService`, updates `PuterAIModule` to register the service, documents its configuration, and integrates cost tracking in `MeteringService` (initially with zero cost as a TODO; see the cost map update below). Also updates `AI.js` to support 11labs as a provider and adds related tests for the `txt2speech` functionality.

* Update 11labs cost map values
diff --git a/src/backend/src/filesystem/definitions/ts/fsentry.js b/src/backend/src/filesystem/definitions/ts/fsentry.js
diff --git a/src/backend/src/modules/puterai/ElevenLabsTTSService.js b/src/backend/src/modules/puterai/ElevenLabsTTSService.js
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2024-present Puter Technologies Inc.
+ *
+ * This file is part of Puter.
+ *
+ * Puter is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+const { Readable } = require('stream');
+const APIError = require('../../api/APIError');
+const BaseService = require('../../services/BaseService');
+const { TypedValue } = require('../../services/drivers/meta/Runtime');
+const { Context } = require('../../util/context');
+
+const DEFAULT_MODEL = 'eleven_multilingual_v2'; // Used by synthesize() when the caller names no model
+const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Common public "Rachel" sample voice
+const DEFAULT_OUTPUT_FORMAT = 'mp3_44100_128'; // Used when neither output_format nor response_format is given
+const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3'; // Returned by synthesize() in test mode
+
+const ELEVENLABS_TTS_MODELS = [ // Static catalogue served by listEngines() and attached to every listed voice
+    { id: DEFAULT_MODEL, name: 'Eleven Multilingual v2' },
+    { id: 'eleven_flash_v2_5', name: 'Eleven Flash v2.5' },
+    { id: 'eleven_turbo_v2_5', name: 'Eleven Turbo v2.5' },
+    { id: 'eleven_v3', name: 'Eleven v3 Alpha' },
+];
+
+/**
+ * ElevenLabs text-to-speech provider.
+ * Implements the `puter-tts` interface so the AI module can synthesize speech
+ * using ElevenLabs voices.
+ */
+class ElevenLabsTTSService extends BaseService {
+    /** @type {import('../../services/MeteringService/MeteringService').MeteringService} */
+    get meteringService () {
+        return this.services.get('meteringService').meteringService;
+    }
+
+    static IMPLEMENTS = {
+        ['driver-capabilities']: {
+            supports_test_mode (iface, method_name) {
+                return iface === 'puter-tts' && method_name === 'synthesize';
+            },
+        },
+        ['puter-tts']: {
+            async list_voices () {
+                return this.listVoices();
+            },
+            async list_engines () {
+                return this.listEngines();
+            },
+            async synthesize (params) {
+                return this.synthesize(params);
+            },
+        },
+    };
+
+    async _init () {
+        const svcThere = this.global_config?.services?.elevenlabs ?? this.config?.services?.elevenlabs ?? this.config?.elevenlabs;
+
+        this.apiKey = svcThere?.apiKey ?? svcThere?.api_key ?? svcThere?.key;
+        this.baseUrl = svcThere?.baseUrl ?? 'https://api.elevenlabs.io';
+        this.defaultVoiceId = svcThere?.defaultVoiceId ?? svcThere?.voiceId ?? DEFAULT_VOICE_ID;
+
+        if ( !this.apiKey ) {
+            throw new Error('ElevenLabs API key not configured');
+        }
+    }
+
+    async request (path, { method = 'GET', body, headers = {} } = {}) {
+        const response = await fetch(`${this.baseUrl}${path}`, {
+            method,
+            headers: {
+                'xi-api-key': this.apiKey,
+                ...(body ? { 'Content-Type': 'application/json' } : {}),
+                ...headers,
+            },
+            body: body ? JSON.stringify(body) : undefined,
+        });
+
+        if ( response.ok ) {
+            return response;
+        }
+
+        let detail = null;
+        try {
+            detail = await response.json();
+        } catch ( e ) {
+            // ignore
+        }
+        this.log.error('ElevenLabs request failed', { path, status: response.status, detail });
+        throw APIError.create('internal_server_error', null, { provider: 'elevenlabs', status: response.status });
+    }
+
+    async listVoices () {
+        const res = await this.request('/v1/voices');
+        const data = await res.json();
+        const voices = Array.isArray(data?.voices) ? data.voices : Array.isArray(data) ? data : [];
+
+        return voices
+            .map(voice => ({
+                id: voice.voice_id || voice.voiceId || voice.id,
+                name: voice.name,
+                description: voice.description,
+                category: voice.category,
+                provider: 'elevenlabs',
+                labels: voice.labels,
+                supported_models: ELEVENLABS_TTS_MODELS.map(model => model.id),
+            }))
+            .filter(v => v.id && v.name);
+    }
+
+    async listEngines () {
+        return ELEVENLABS_TTS_MODELS.map(model => ({
+            id: model.id,
+            name: model.name,
+            provider: 'elevenlabs',
+            pricing_per_million_chars: 0,
+        }));
+    }
+
+    async synthesize (params) {
+        const {
+            text,
+            voice,
+            model,
+            response_format,
+            output_format,
+            voice_settings,
+            voiceSettings,
+            test_mode,
+        } = params;
+        if ( test_mode ) {
+            return new TypedValue({
+                $: 'string:url:web',
+                content_type: 'audio',
+            }, SAMPLE_AUDIO_URL);
+        }
+
+        if ( typeof text !== 'string' || !text.trim() ) {
+            throw APIError.create('field_required', null, { key: 'text' });
+        }
+
+        const voiceId = voice || this.defaultVoiceId;
+        const modelId = model || DEFAULT_MODEL;
+        const desiredFormat = output_format || response_format || DEFAULT_OUTPUT_FORMAT;
+
+        const actor = Context.get('actor');
+        const usageKey = `elevenlabs:${modelId}:character`;
+        const usageAllowed = await this.meteringService.hasEnoughCreditsFor(actor, usageKey, text.length);
+        if ( !usageAllowed ) {
+            throw APIError.create('insufficient_funds');
+        }
+
+        const payload = {
+            text,
+            model_id: modelId,
+            output_format: desiredFormat,
+        };
+
+        const finalVoiceSettings = voice_settings ?? voiceSettings;
+        if ( finalVoiceSettings ) {
+            payload.voice_settings = finalVoiceSettings;
+        }
+
+        const response = await this.request(`/v1/text-to-speech/${voiceId}`, {
+            method: 'POST',
+            body: payload,
+        });
+
+        const arrayBuffer = await response.arrayBuffer();
+        const buffer = Buffer.from(arrayBuffer);
+        const stream = Readable.from(buffer);
+
+        this.meteringService.incrementUsage(actor, usageKey, text.length);
+
+        return new TypedValue({
+            $: 'stream',
+            content_type: response.headers.get('content-type') || 'audio/mpeg',
+        }, stream);
+    }
+}
+
+module.exports = { // Required lazily by PuterAIModule when an `elevenlabs` config entry exists
+    ElevenLabsTTSService,
+};
diff --git a/src/backend/src/modules/puterai/PuterAIModule.js b/src/backend/src/modules/puterai/PuterAIModule.js
@@ -55,6 +55,11 @@ class PuterAIModule extends AdvancedBase {
             services.registerService('aws-polly', AWSPollyService);
         }
 
+        if ( config?.services?.['elevenlabs'] || config?.elevenlabs ) {
+            const { ElevenLabsTTSService } = require('./ElevenLabsTTSService');
+            services.registerService('elevenlabs-tts', ElevenLabsTTSService);
+        }
+
         if ( config?.services?.openai || config?.openai ) {
             const { OpenAICompletionServiceWrapper } = require('./OpenAiCompletionService/index.mjs');
             services.registerService('openai-completion', OpenAICompletionServiceWrapper);
diff --git a/src/backend/src/modules/puterai/doc/ai-services-config.md b/src/backend/src/modules/puterai/doc/ai-services-config.md
@@ -9,6 +9,10 @@ AI services are configured under the `services` block in the configuration file.
     "openai": {
       "apiKey": "sk-abcdefg..."
     },
+    "elevenlabs": {
+      "apiKey": "eleven-api-key",
+      "defaultVoiceId": "optional-voice-id"
+    },
     "deepseek": {
       "apiKey": "sk-xyz123..."
     },
diff --git a/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts b/src/backend/src/services/MeteringService/costMaps/elevenlabsCostMap.ts
@@ -0,0 +1,13 @@
+// ElevenLabs Text-to-Speech Cost Map
+//
+// Pricing for ElevenLabs voices varies by model and plan tier. Keys follow the
+// `elevenlabs:<model_id>:character` usage key that ElevenLabsTTSService meters
+// against, with one unit per character of input text. NOTE(review): values are
+// presumably in the same cost unit as the other cost maps — confirm.
+
+export const ELEVENLABS_COST_MAP = {
+    'elevenlabs:eleven_multilingual_v2:character': 11,
+    'elevenlabs:eleven_turbo_v2_5:character': 11,
+    'elevenlabs:eleven_flash_v2_5:character': 5.5,
+    'elevenlabs:eleven_v3:character': 11,
+};
diff --git a/src/backend/src/services/MeteringService/costMaps/index.ts b/src/backend/src/services/MeteringService/costMaps/index.ts
@@ -13,12 +13,14 @@ import { OPENROUTER_COST_MAP } from './openrouterCostMap';
 import { OPENAI_VIDEO_COST_MAP } from './openaiVideoCostMap';
 import { TOGETHER_COST_MAP } from './togetherCostMap';
 import { XAI_COST_MAP } from './xaiCostMap';
+import { ELEVENLABS_COST_MAP } from './elevenlabsCostMap';
 
 export const COST_MAPS = {
     ...AWS_POLLY_COST_MAP,
     ...AWS_TEXTRACT_COST_MAP,
     ...CLAUDE_COST_MAP,
     ...DEEPSEEK_COST_MAP,
+    ...ELEVENLABS_COST_MAP,
     ...GEMINI_COST_MAP,
     ...GROQ_COST_MAP,
     ...KV_COST_MAP,
diff --git a/src/puter-js/src/modules/AI.js b/src/puter-js/src/modules/AI.js
@@ -6,6 +6,7 @@ const normalizeTTSProvider = (value) => {
     }
     const lower = value.toLowerCase();
     if ( lower === 'openai' ) return 'openai';
+    if ( ['elevenlabs', 'eleven', '11labs', '11-labs', 'eleven-labs', 'elevenlabs-tts'].includes(lower) ) return 'elevenlabs';
     if ( lower === 'aws' || lower === 'polly' || lower === 'aws-polly' ) return 'aws-polly';
     return value;
 };
@@ -281,6 +282,10 @@ class AI {
             provider = 'openai';
         }
 
+        if ( options.engine && normalizeTTSProvider(options.engine) === 'elevenlabs' && !options.provider ) {
+            provider = 'elevenlabs';
+        }
+
         if ( provider === 'openai' ) {
             if ( !options.model && typeof options.engine === 'string' ) {
                 options.model = options.engine;
@@ -295,6 +300,23 @@ class AI {
                 options.response_format = 'mp3';
             }
             delete options.engine;
+        } else if ( provider === 'elevenlabs' ) {
+            if ( ! options.voice ) {
+                options.voice = '21m00Tcm4TlvDq8ikWAM';
+            }
+            if ( ! options.model && typeof options.engine === 'string' ) {
+                options.model = options.engine;
+            }
+            if ( ! options.model ) {
+                options.model = 'eleven_multilingual_v2';
+            }
+            if ( ! options.output_format && !options.response_format ) {
+                options.output_format = 'mp3_44100_128';
+            }
+            if ( options.response_format && !options.output_format ) {
+                options.output_format = options.response_format;
+            }
+            delete options.engine;
         } else {
             provider = 'aws-polly';
 
@@ -326,7 +348,9 @@ class AI {
             }
         }
 
-        const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
+        const driverName = provider === 'openai'
+            ? 'openai-tts'
+            : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
 
         return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'synthesize', {
             responseType: 'blob',
@@ -449,7 +473,13 @@ class AI {
                 params.provider = 'openai';
             }
 
-            const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
+            if ( provider === 'elevenlabs' ) {
+                params.provider = 'elevenlabs';
+            }
+
+            const driverName = provider === 'openai'
+                ? 'openai-tts'
+                : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
 
             return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_engines', {
                 responseType: 'text',
@@ -478,7 +508,13 @@ class AI {
                 delete params.engine;
             }
 
-            const driverName = provider === 'openai' ? 'openai-tts' : 'aws-polly';
+            if ( provider === 'elevenlabs' ) {
+                params.provider = 'elevenlabs';
+            }
+
+            const driverName = provider === 'openai'
+                ? 'openai-tts'
+                : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
 
             return utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_voices', {
                 responseType: 'text',
diff --git a/src/puter-js/test/txt2speech.test.js b/src/puter-js/test/txt2speech.test.js
@@ -157,6 +157,32 @@ const testTxt2SpeechWithOpenAIProviderCore = async function() {
     assert(valueOfValue === srcValue, "valueOf() should match src for OpenAI provider");
 };
 
+const testTxt2SpeechWithElevenLabsProviderCore = async function() {
+    // Test ElevenLabs provider in test mode to avoid external calls
+    const result = await puter.ai.txt2speech(
+        "Hello, this is an ElevenLabs provider test.",
+        { provider: "elevenlabs", voice: "21m00Tcm4TlvDq8ikWAM" },
+        true,
+    );
+
+    assert(result instanceof Audio, "txt2speech should return an Audio object for ElevenLabs provider");
+    assert(result !== null, "txt2speech should not return null for ElevenLabs provider");
+
+    const toStringValue = result.toString();
+    const valueOfValue = result.valueOf();
+    const srcValue = result.src;
+
+    assert(typeof toStringValue === 'string', "toString() should return a string for ElevenLabs provider");
+    assert(typeof valueOfValue === 'string', "valueOf() should return a string for ElevenLabs provider");
+    assert(typeof srcValue === 'string', "src should be a string for ElevenLabs provider");
+    assert(toStringValue.length > 0, "toString() should not be empty for ElevenLabs provider");
+    assert(valueOfValue.length > 0, "valueOf() should not be empty for ElevenLabs provider");
+    assert(srcValue.length > 0, "src should not be empty for ElevenLabs provider");
+
+    assert(toStringValue === srcValue, "toString() should match src for ElevenLabs provider");
+    assert(valueOfValue === srcValue, "valueOf() should match src for ElevenLabs provider");
+};
+
 // Export test functions
 window.txt2speechTests = [
     {
@@ -209,5 +235,18 @@ window.txt2speechTests = [
                 fail("testTxt2SpeechWithOpenAIProvider failed:", error);
             }
         }
+    },
+
+    {
+        name: "testTxt2SpeechWithElevenLabsProvider",
+        description: "Test text-to-speech using the ElevenLabs provider in test mode",
+        test: async function() {
+            try {
+                await testTxt2SpeechWithElevenLabsProviderCore();
+                pass("testTxt2SpeechWithElevenLabsProvider passed");
+            } catch (error) {
+                fail("testTxt2SpeechWithElevenLabsProvider failed:", error);
+            }
+        }
     }
 ];