Commit ea90796

Add speech processing use case
1 parent af0e449 commit ea90796

6 files changed: +266 −5 lines changed

site/docs/use-cases/speech-processing.md

Lines changed: 0 additions & 5 deletions
This file was deleted.
Lines changed: 19 additions & 0 deletions
import CodeBlock from '@theme/CodeBlock';

<CodeBlock language="cpp" showLineNumbers>
{`#include "openvino/genai/whisper_pipeline.hpp"
#include "audio_utils.hpp"
#include <filesystem>
#include <iostream>

int main(int argc, char* argv[]) {
    std::filesystem::path models_path = argv[1];
    std::string wav_file_path = argv[2];

    ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);

    ov::genai::WhisperPipeline pipe(models_path, "${props.device || 'CPU'}");
    auto result = pipe.generate(raw_speech, ov::genai::max_new_tokens(100));
    std::cout << result << std::endl;
}
`}
</CodeBlock>
Lines changed: 17 additions & 0 deletions
import CodeBlock from '@theme/CodeBlock';

<CodeBlock language="python" showLineNumbers>
{`import openvino_genai as ov_genai
import librosa

def read_wav(filepath):
    raw_speech, samplerate = librosa.load(filepath, sr=16000)
    return raw_speech.tolist()

raw_speech = read_wav('sample.wav')

pipe = ov_genai.WhisperPipeline(model_path, "${props.device || 'CPU'}")
result = pipe.generate(raw_speech, max_new_tokens=100)
print(result)
`}
</CodeBlock>
Lines changed: 41 additions & 0 deletions
import CodeExampleCPP from './_code_example_cpp.mdx';
import CodeExamplePython from './_code_example_python.mdx';

## Run Model Using OpenVINO GenAI

OpenVINO GenAI provides the [`WhisperPipeline`](https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.WhisperPipeline.html) for inference of speech processing Whisper models.
You can construct it directly from the folder containing the converted model; it automatically loads the model, tokenizer, detokenizer, and the default generation configuration.

:::info
`WhisperPipeline` expects normalized audio in WAV format with a 16 kHz sampling rate as input.
:::

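Audio at other sampling rates should be resampled before inference; one way to catch a mismatch early is to inspect the WAV header up front. A minimal sketch using only the Python standard library (the helper `check_whisper_input` is ours, not part of the GenAI API):

```python
import wave

def check_whisper_input(path):
    """Verify a WAV file matches the expected 16 kHz input; return its duration."""
    with wave.open(path, "rb") as wav:
        rate = wav.getframerate()
        duration = wav.getnframes() / rate
    if rate != 16000:
        raise ValueError(f"expected 16 kHz, got {rate} Hz: resample before inference")
    return duration  # length in seconds
```
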
<LanguageTabs>
  <TabItemPython>
    <Tabs groupId="device">
      <TabItem label="CPU" value="cpu">
        <CodeExamplePython device="CPU" />
      </TabItem>
      <TabItem label="GPU" value="gpu">
        <CodeExamplePython device="GPU" />
      </TabItem>
    </Tabs>
  </TabItemPython>
  <TabItemCpp>
    <Tabs groupId="device">
      <TabItem label="CPU" value="cpu">
        <CodeExampleCPP device="CPU" />
      </TabItem>
      <TabItem label="GPU" value="gpu">
        <CodeExampleCPP device="GPU" />
      </TabItem>
    </Tabs>
  </TabItemCpp>
</LanguageTabs>

:::tip
Switch between CPU and GPU devices without any other code changes.
:::
Lines changed: 168 additions & 0 deletions
import BasicGenerationConfiguration from '@site/docs/use-cases/_shared/_basic_generation_configuration.mdx';
import GenerationConfigurationWorkflow from '@site/docs/use-cases/_shared/_generation_configuration_workflow.mdx';

## Additional Usage Options

:::tip
Check out [Python](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/whisper_speech_recognition) and [C++](https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/cpp/whisper_speech_recognition) Whisper speech recognition samples.
:::

### Use Different Generation Parameters

<GenerationConfigurationWorkflow />

:::info
For the full list of generation parameters, refer to the [Whisper Generation Config API](https://docs.openvino.ai/2025/api/genai_api/_autosummary/openvino_genai.WhisperGenerationConfig.html).
:::

### Transcription

Whisper models can automatically detect the language of the input audio, or you can specify the language explicitly to improve accuracy:

<LanguageTabs>
  <TabItemPython>
    ```python
    pipe = ov_genai.WhisperPipeline(model_path, "CPU")

    # Automatic language detection
    raw_speech = read_wav("speech_sample.wav")
    result = pipe.generate(raw_speech)

    # Explicitly specify language (English)
    result = pipe.generate(raw_speech, language="<|en|>")

    # French speech sample
    raw_speech = read_wav("french_sample.wav")
    result = pipe.generate(raw_speech, language="<|fr|>")
    ```
  </TabItemPython>
  <TabItemCpp>
    ```cpp
    int main() {
        ov::genai::WhisperPipeline pipe(model_path, "CPU");

        // Automatic language detection
        auto raw_speech = utils::audio::read_wav("speech_sample.wav");
        auto result = pipe.generate(raw_speech);

        // Explicitly specify language (English)
        result = pipe.generate(raw_speech, ov::genai::language("<|en|>"));

        // French speech sample
        raw_speech = utils::audio::read_wav("french_sample.wav");
        result = pipe.generate(raw_speech, ov::genai::language("<|fr|>"));
    }
    ```
  </TabItemCpp>
</LanguageTabs>

### Translation

By default, Whisper performs transcription, keeping the output in the same language as the input.
To translate non-English speech to English, use the `translate` task:

<LanguageTabs>
  <TabItemPython>
    ```python
    pipe = ov_genai.WhisperPipeline(model_path, "CPU")

    # Translate French audio to English
    raw_speech = read_wav("french_sample.wav")
    result = pipe.generate(raw_speech, task="translate")
    ```
  </TabItemPython>
  <TabItemCpp>
    ```cpp
    int main() {
        ov::genai::WhisperPipeline pipe(model_path, "CPU");

        // Translate French audio to English
        auto raw_speech = utils::audio::read_wav("french_sample.wav");
        auto result = pipe.generate(raw_speech, ov::genai::task("translate"));
    }
    ```
  </TabItemCpp>
</LanguageTabs>

### Timestamps Prediction

Whisper can predict timestamps for each segment of speech, which is useful for synchronization or creating subtitles:

<LanguageTabs>
  <TabItemPython>
    ```python
    pipe = ov_genai.WhisperPipeline(model_path, "CPU")

    # Enable timestamp prediction
    result = pipe.generate(raw_speech, return_timestamps=True)

    # Print timestamps and text segments
    for chunk in result.chunks:
        print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
    ```
  </TabItemPython>
  <TabItemCpp>
    ```cpp
    int main() {
        ov::genai::WhisperPipeline pipe(model_path, "CPU");

        // Enable timestamp prediction
        auto result = pipe.generate(raw_speech, ov::genai::return_timestamps(true));

        // Print timestamps and text segments
        for (auto& chunk : *result.chunks) {
            std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts
                      << "] text: " << chunk.text << "\n";
        }
    }
    ```
  </TabItemCpp>
</LanguageTabs>

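Because each chunk carries start and end times in seconds, turning a result into subtitles is a short post-processing step. A sketch of an SRT formatter over chunk objects shaped like those above (`start_ts`, `end_ts`, `text`); the `to_srt` helper is ours, not part of the API:

```python
def to_srt(chunks):
    """Format timestamped chunks (start_ts/end_ts in seconds) as SRT subtitles."""
    def fmt(t):
        ms = round(t * 1000)
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1_000)
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    entries = []
    for i, chunk in enumerate(chunks, start=1):
        entries.append(f"{i}\n{fmt(chunk.start_ts)} --> {fmt(chunk.end_ts)}\n{chunk.text.strip()}\n")
    return "\n".join(entries)
```
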
### Long-Form Audio Processing

Whisper models are designed for audio segments up to 30 seconds in length.
For longer audio, the OpenVINO GenAI Whisper pipeline automatically handles the processing using a sequential chunking algorithm ("sliding window"):

1. The audio is divided into 30-second segments.
2. Each segment is processed sequentially.
3. Results are combined to produce the complete transcription.

This happens automatically when you input longer audio files.

### Using Initial Prompts and Hotwords

You can improve transcription quality and guide the model's output style by providing initial prompts or hotwords using the following parameters:

- `initial_prompt`: initial prompt tokens passed as a previous transcription (after the `<|startofprev|>` token) to the first processing window.
- `hotwords`: hotword tokens passed as a previous transcription (after the `<|startofprev|>` token) to all processing windows.

Whisper models can use this context to better understand the speech and maintain a consistent writing style.
However, prompts do not need to be genuine transcripts from prior audio segments; they can also steer the model toward particular spellings or styles:

<LanguageTabs>
  <TabItemPython>
    ```python
    pipe = ov_genai.WhisperPipeline(model_path, "CPU")

    result = pipe.generate(raw_speech)
    # He has gone and gone for good answered Paul Icrom who...

    result = pipe.generate(raw_speech, initial_prompt="Polychrome")
    # He has gone and gone for good answered Polychrome who...
    ```
  </TabItemPython>
  <TabItemCpp>
    ```cpp
    int main() {
        ov::genai::WhisperPipeline pipe(model_path, "CPU");

        auto result = pipe.generate(raw_speech);
        // He has gone and gone for good answered Paul Icrom who...

        result = pipe.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
        // He has gone and gone for good answered Polychrome who...
    }
    ```
  </TabItemCpp>
</LanguageTabs>
Lines changed: 21 additions & 0 deletions
---
sidebar_position: 3
---
import OptimumCLI from '@site/src/components/OptimumCLI';
import ConvertModelSection from '../_shared/_convert_model.mdx';
import RunModelSection from './_sections/_run_model/index.mdx';
import UsageOptionsSection from './_sections/_usage_options/index.mdx';

# Speech Processing Using Whisper

<ConvertModelSection>
  Download and convert a model (e.g. [openai/whisper-base](https://huggingface.co/openai/whisper-base)) to OpenVINO format from Hugging Face:

  <OptimumCLI model='openai/whisper-base' outputDir='whisper_ov' trustRemoteCode />

  See all supported [Speech Processing Models](/docs/supported-models/#speech-processing-models-whisper-based).
</ConvertModelSection>

<RunModelSection />

<UsageOptionsSection />
