87 changes: 49 additions & 38 deletions demos/audio/README.md
@@ -16,6 +16,17 @@ Check supported [Speech Recognition Models](https://openvinotoolkit.github.io/op
**Client**: curl or Python with the OpenAI client package

## Speech generation
### Prepare speaker embeddings
When generating speech you can use the default speaker voice or prepare your own speaker embedding file. The steps below show how to do it with a file downloaded from an online repository, but you can also try it with your own recorded speech:
```bash
pip install -r requirements.txt
mkdir -p audio_samples
curl --output audio_samples/audio.wav "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0032_8k.wav"
mkdir -p models
mkdir -p models/speakers
python create_speaker_embedding.py audio_samples/audio.wav models/speakers/voice1.bin
```

> **Collaborator:** Suggested change: drop `mkdir -p models`; `mkdir -p models/speakers` already creates the parent directory.
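The embedding is written as raw float32 values, so you can sanity-check the result by loading it back (a quick sketch; the 512-element shape assumes the x-vector encoder used by the script):

```python
import numpy as np

# the script writes the normalized speaker embedding as raw float32 values
embedding = np.fromfile("models/speakers/voice1.bin", dtype=np.float32)
print(embedding.shape)  # expected: (512,) for speechbrain/spkrec-xvect-voxceleb
```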

### Model preparation
Supported models should use the topology of [microsoft/speecht5_tts](https://huggingface.co/microsoft/speecht5_tts), which needs to be converted to IR format before use in OVMS.

@@ -40,48 +51,14 @@ Run the `export_model.py` script to download and quantize the model:

**CPU**
```console
python export_model.py text2speech --source_model microsoft/speecht5_tts --weight-format fp16 --model_name microsoft/speecht5_tts --config_file_path models/config.json --model_repository_path models --overwrite_models --vocoder microsoft/speecht5_hifigan --speaker_name voice1 --speaker_path /models/speakers/voice1.bin
```

> **Note:** Change the `--weight-format` to quantize the model to `int8` precision to reduce memory consumption and improve performance.
> **Note:** `speaker_name` and `speaker_path` may be omitted if the default model voice is sufficient.

The default configuration should work in most cases, but the parameters can be tuned via `export_model.py` script arguments. Run the script with the `--help` argument to check available parameters, and see the [T2s calculator documentation](../../docs/speech_generation/reference.md) to learn more about configuration options and limitations.

### Speaker embeddings

Instead of generating speech with the default model voice, you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py):
```bash
curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py"
python create_speaker_embedding.py
mv speaker_embedding.bin models/
```
The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the speechbrain/spkrec-xvect-voxceleb model, creates a `speaker_embedding.bin` file that contains your speaker embedding.
Now add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph:
```
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "T2sExecutor"
  input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
  calculator: "T2sCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
      models_path: "./",
      plugin_config: '{ "NUM_STREAMS": "1" }',
      target_device: "CPU",
      voices: [
        {
          name: "voice",
          path: "/models/speaker_embedding.bin",
        }
      ]
    }
  }
}
```

### Deployment

**CPU**
@@ -101,14 +78 @@ ovms --rest_port 8000 --source_model microsoft/speecht5_tts --model_repository_path models
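Before sending requests, you can confirm the server is up and the model is loaded (a quick check; it assumes the `requests` package is installed and uses the standard OVMS `/v1/config` REST endpoint on the port from the command above):

```python
import requests

# lists the served models and their version states
resp = requests.get("http://localhost:8000/v1/config")
print(resp.json())
```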

### Request Generation

:::{dropdown} **Unary call with curl with default voice**


```bash
curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"microsoft/speecht5_tts\", \"input\": \"The quick brown fox jumped over the lazy dog\"}" -o speech.wav
```
:::
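To confirm the call above produced playable audio, here is a minimal standard-library check (assuming the server returned a standard WAV container named as in the curl command):

```python
import wave

# print basic properties of the generated audio file
with wave.open("speech.wav", "rb") as w:
    print(f"channels={w.getnchannels()}, rate={w.getframerate()} Hz, frames={w.getnframes()}")
```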

:::{dropdown} **Unary call with OpenAI Python library with default voice**

```python
from pathlib import Path
from openai import OpenAI

prompt = "The quick brown fox jumped over the lazy dog"
filename = "speech.wav"
url = "http://localhost:8000/v3"

speech_file_path = Path(__file__).parent / filename
client = OpenAI(base_url=url, api_key="not_used")

# voice=None requests the default model voice
with client.audio.speech.with_streaming_response.create(
    model="microsoft/speecht5_tts",
    voice=None,
    input=prompt
) as response:
    response.stream_to_file(speech_file_path)

print("Generation finished")
```
:::

:::{dropdown} **Unary call with curl with custom voice**


```bash
curl http://localhost:8000/v3/audio/speech -H "Content-Type: application/json" -d "{\"model\": \"microsoft/speecht5_tts\", \"voice\":\"voice1\", \"input\": \"The quick brown fox jumped over the lazy dog\"}" -o speech.wav
```
:::

:::{dropdown} **Unary call with OpenAI Python library with custom voice**

```python
from pathlib import Path
from openai import OpenAI

prompt = "The quick brown fox jumped over the lazy dog"
filename = "speech.wav"
url = "http://localhost:8000/v3"

speech_file_path = Path(__file__).parent / filename
client = OpenAI(base_url=url, api_key="not_used")

with client.audio.speech.with_streaming_response.create(
    model="microsoft/speecht5_tts",
    voice="voice1",
    input=prompt
) as response:
    response.stream_to_file(speech_file_path)

print("Generation finished")
```
:::
25 changes: 25 additions & 0 deletions demos/audio/create_speaker_embedding.py
@@ -0,0 +1,25 @@
#!/usr/bin/env python3
# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import torch
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
import sys

file = sys.argv[1]
signal, fs = torchaudio.load(file)
if signal.shape[0] > 1:
    signal = torch.mean(signal, dim=0, keepdim=True)
expected_sample_rate = 16000
if fs != expected_sample_rate:
    resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=expected_sample_rate)
    signal = resampler(signal)

classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")
embedding = classifier.encode_batch(signal)
embedding = torch.nn.functional.normalize(embedding, dim=2)
embedding = embedding.squeeze().cpu().numpy().astype("float32")

output_file = sys.argv[2]
embedding.tofile(output_file)

Review comments on this file:

> **Collaborator** on `file = sys.argv[1]`: check arg size

> **Copilot AI** (Feb 19, 2026) on lines +14 to +15: `file` shadows Python's built-in `file` name (from legacy Python 2) and is less descriptive. Rename to something like `input_path` or `audio_path` to avoid confusion. Suggested change: `input_audio_file = sys.argv[1]` and `signal, fs = torchaudio.load(input_audio_file)`.

> **Collaborator** on `if signal.shape[0] > 1:`: check shape size

> **Copilot AI** (Feb 19, 2026) on lines +31 to +32: Speaker embedding extraction runs with autograd enabled. Wrapping the encode/normalize steps in `torch.no_grad()` (and optionally setting the model to eval mode) will reduce memory usage and speed up the script for large inputs. Suggested change: call `classifier.eval()` and run `encode_batch` and `normalize` under `with torch.no_grad():`.

> **Collaborator** on `output_file = sys.argv[2]`: check arg size
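A minimal guard addressing the repeated "check arg size" comments might look like this (a sketch, not part of the PR):

```python
import sys

# hypothetical guard to run before reading sys.argv[1] and sys.argv[2]
if len(sys.argv) != 3:
    sys.exit(f"usage: {sys.argv[0]} <input_audio.wav> <output_embedding.bin>")
```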
4 changes: 4 additions & 0 deletions demos/audio/requirements.txt
@@ -0,0 +1,4 @@
torch
torchaudio
speechbrain
openai

> **Collaborator** on `torch`: Use CPU-only versions so it doesn't download CUDA; check out our other demos, the same issue appears in the OpenWebUI demo.

> **Collaborator** (@rasapala, Feb 19, 2026) on `speechbrain`: Add the specific versions used in the demo to make sure we do not break when dependencies update components: `speechbrain==`, `openai==`.
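One way to satisfy both comments (a sketch; the version placeholders must be filled with whatever the demo was actually validated against):

```
# fetch CPU-only PyTorch wheels so pip does not pull CUDA packages
--extra-index-url https://download.pytorch.org/whl/cpu
torch
torchaudio
speechbrain  # pin the validated version, e.g. speechbrain==<x.y.z>
openai       # pin the validated version, e.g. openai==<x.y.z>
```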
12 changes: 11 additions & 1 deletion demos/common/export_models/export_model.py
@@ -91,6 +91,9 @@ def add_common_arguments(parser):
add_common_arguments(parser_text2speech)
parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example microsoft/speecht5_hifigan', dest='vocoder')
parser_text2speech.add_argument('--speaker_name', type=str, help='Name of the speaker', dest='speaker_name')
parser_text2speech.add_argument('--speaker_path', type=str, help='Path to the speaker.bin file.', dest='speaker_path')

> **Collaborator:** When making changes to export_model we should also incorporate them into `ovms --pull` and ovms parameters. If this applies here, please file a Jira to plan changes in ovms cpp.

> **Collaborator (Author):** CVS-181526
> **Copilot AI** (Feb 19, 2026) on lines +94 to +95: `--speaker_name` and `--speaker_path` are intended to be used together, but the CLI currently allows providing only one and silently ignores it (because the template checks both `speaker_name` and `speaker_path`). Consider adding argument validation (e.g., raise an error if exactly one of the two is provided) so misconfiguration is caught early.
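A sketch of that validation (hypothetical; the `parser` and `args` names follow the usual argparse pattern and are not taken from this diff):

```python
# after args = parser.parse_args(): reject a lone --speaker_name or --speaker_path
if bool(args.speaker_name) != bool(args.speaker_path):
    parser.error("--speaker_name and --speaker_path must be provided together")
```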


parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint')
add_common_arguments(parser_speech2text)
@@ -110,7 +113,14 @@ def add_common_arguments(parser):
[type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
    models_path: "{{model_path}}",
    plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
    target_device: "{{target_device|default("CPU", true)}}",
    {%- if speaker_name and speaker_path %}
    voices: [
        {
            name: "{{speaker_name}}",
            path: "{{speaker_path}}"
        }
    ]{% endif %}
}
}
}
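To see what the added conditional emits, here is a minimal sketch rendering a paraphrased fragment with Jinja2, the template engine this syntax belongs to (variable names are taken from the diff; the surrounding text is simplified):

```python
from jinja2 import Template

# paraphrased fragment; only the if-condition mirrors the diff
fragment = Template(
    'target_device: "CPU",'
    '{%- if speaker_name and speaker_path %}\n'
    'voices: [ { name: "{{speaker_name}}", path: "{{speaker_path}}" } ]'
    '{% endif %}'
)

# both parameters set: the voices block is emitted
print(fragment.render(speaker_name="voice1", speaker_path="/models/speakers/voice1.bin"))
# either one missing: the block is skipped entirely
print(fragment.render(speaker_name="voice1"))
```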