Skip to content

Commit 3d50de7

Browse files
authored
[Docs]: Add whisper word level timestamps (openvinotoolkit#3231)
<!-- Keep your pull requests (PRs) as atomic as possible. That increases the likelihood that an individual PR won't be stuck because of adjacent problems, merge conflicts, or code review. Your merged PR is going to appear in the automatically generated release notes on GitHub. So the clearer the title the better. --> ## Description <!-- Please include a summary of the change. Also include relevant motivation and context. --> * Add documentation * Enable `word_timestamps` in samples * Adjust samples tests for C API which has no `word_timestamps` bindings * Remove transformers downgrade for mac workflow. As workflow now uses arm arch. <!-- Jira ticket number (e.g., 123). Delete if there's no ticket. --> CVS-179419 CVS-179417 ## Checklist: - [x] This patch fully addresses the ticket. <!--- If follow-up pull requests are needed, specify in description. --> - [x] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. -->
1 parent 7a34168 commit 3d50de7

File tree

6 files changed

+94
-14
lines changed

6 files changed

+94
-14
lines changed

.github/workflows/mac.yml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -600,13 +600,6 @@ jobs:
600600
requirements_files: "${{ env.SRC_DIR }}/samples/requirements.txt"
601601
local_wheel_dir: ${{ env.INSTALL_DIR }}/wheels
602602

603-
# transformers >= 4.52 require torch >= 2.6 and raise an error otherwise:
604-
# ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
605-
# See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434
606-
607-
# x86_64 macOS does not (and will not) support newer versions of torch > 2.2 which are used in the newer transformers versions. It's not possible to lower the transformers version in requirements.txt because that triggers a vulnerability alert: https://github.com/openvinotoolkit/openvino_tokenizers/security/dependabot/11
608-
- run: python -m pip install "transformers<4.52"
609-
610603
- name: Fix C++ samples permissions
611604
if: ${{ matrix.test.run_condition }}
612605
run: chmod +x ${{ env.INSTALL_DIR }}/samples_bin/*

samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,25 @@ int main(int argc, char* argv[]) try {
2020
std::string device = (argc == 4) ? argv[3] : "CPU"; // Default to CPU if no device is provided
2121

2222
ov::AnyMap ov_config;
23-
if (device == "NPU" || device.find("GPU") != std::string::npos) { // need to handle cases like "GPU", "GPU.0" and "GPU.1"
23+
if (device == "NPU" ||
24+
device.find("GPU") != std::string::npos) { // need to handle cases like "GPU", "GPU.0" and "GPU.1"
2425
// Cache compiled models on disk for GPU and NPU to save time on the
2526
// next run. It's not beneficial for CPU.
2627
ov_config = get_config_for_cache();
2728
}
2829

30+
// Word timestamps require decomposition of cross-attention decoder SDPA layers,
31+
// so word_timestamps must be passed to the pipeline constructor (not just in generation config)
32+
ov_config.insert(ov::genai::word_timestamps(true));
33+
2934
ov::genai::WhisperPipeline pipeline(models_path, device, ov_config);
3035

3136
ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config();
3237
// 'task' and 'language' parameters are supported for multilingual models only
3338
config.language = "<|en|>"; // can switch to <|zh|> for Chinese language
3439
config.task = "transcribe";
3540
config.return_timestamps = true;
41+
config.word_timestamps = true;
3642

3743
// Pipeline expects normalized audio with Sample Rate of 16kHz
3844
ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);
@@ -45,6 +51,10 @@ int main(int argc, char* argv[]) try {
4551
std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
4652
}
4753

54+
for (auto& word : *result.words) {
55+
std::cout << "[" << word.start_ts << ", " << word.end_ts << "]: " << word.word << "\n";
56+
}
57+
4858
} catch (const std::exception& error) {
4959
try {
5060
std::cerr << error.what() << '\n';

samples/python/whisper_speech_recognition/whisper_speech_recognition.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@ def read_wav(filepath):
1111
raw_speech, samplerate = librosa.load(filepath, sr=16000)
1212
return raw_speech.tolist()
1313

14+
1415
def get_config_for_cache():
1516
config_cache = dict()
1617
config_cache["CACHE_DIR"] = "whisper_cache"
1718
return config_cache
1819

20+
1921
def main():
2022
parser = argparse.ArgumentParser()
2123
parser.add_argument("model_dir", help="Path to the model directory")
@@ -24,18 +26,23 @@ def main():
2426
args = parser.parse_args()
2527

2628
ov_config = dict()
27-
if args.device == "NPU" or "GPU" in args.device: # need to handle cases like "GPU", "GPU.0" and "GPU.1"
29+
if args.device == "NPU" or "GPU" in args.device: # need to handle cases like "GPU", "GPU.0" and "GPU.1"
2830
# Cache compiled models on disk for GPU and NPU to save time on the
2931
# next run. It's not beneficial for CPU.
3032
ov_config = get_config_for_cache()
3133

34+
# Word timestamps require decomposition of cross-attention decoder SDPA layers,
35+
# so word_timestamps must be passed to the pipeline constructor (not just in generation config)
36+
ov_config["word_timestamps"] = True
37+
3238
pipe = openvino_genai.WhisperPipeline(args.model_dir, args.device, **ov_config)
3339

3440
config = pipe.get_generation_config()
3541
# 'task' and 'language' parameters are supported for multilingual models only
3642
config.language = "<|en|>" # can switch to <|zh|> for Chinese language
3743
config.task = "transcribe"
3844
config.return_timestamps = True
45+
config.word_timestamps = True
3946

4047
# Pipeline expects normalized audio with Sample Rate of 16kHz
4148
raw_speech = read_wav(args.wav_file_path)
@@ -47,6 +54,10 @@ def main():
4754
for chunk in result.chunks:
4855
print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
4956

57+
if result.words:
58+
for word in result.words:
59+
print(f"[{word.start_ts:.2f}, {word.end_ts:.2f}]: {word.word}")
60+
5061

5162
if "__main__" == __name__:
5263
main()

site/docs/use-cases/speech-recognition/_sections/_usage_options/index.mdx

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,50 @@ Whisper can predict timestamps for each segment of speech, which is useful for s
204204
</TabItemCpp>
205205
</LanguageTabs>
206206

207+
### Word-level Timestamps Prediction
208+
209+
Whisper can predict timestamps for each word of speech, which provides more granular timing information compared to segment-level timestamps.
210+
211+
<LanguageTabs>
212+
<TabItemPython>
213+
```python
214+
# Word timestamps require decomposition of cross-attention decoder SDPA layers,
215+
# so word_timestamps must be passed to the pipeline constructor (not just in generation config)
216+
pipe = openvino_genai.WhisperPipeline(model_path, "CPU", word_timestamps=True)
217+
218+
# Enable word-level timestamp prediction
219+
result = pipe.generate(raw_speech, word_timestamps=True)
220+
221+
# Print word-level timestamps
222+
for word in result.words:
223+
print(f"[{word.start_ts:.2f}, {word.end_ts:.2f}]: {word.word}")
224+
```
225+
</TabItemPython>
226+
<TabItemCpp>
227+
```cpp
228+
int main() {
229+
// Word timestamps require decomposition of cross-attention decoder SDPA layers,
230+
// so word_timestamps must be passed to the pipeline constructor (not just in generation config)
231+
ov::genai::WhisperPipeline pipeline(model_path, "CPU", ov::genai::word_timestamps(true));
232+
233+
// Enable word-level timestamp prediction
234+
auto result = pipeline.generate(raw_speech, ov::genai::word_timestamps(true));
235+
236+
// Print word-level timestamps
237+
std::cout << std::fixed << std::setprecision(2);
238+
for (auto& word : *result.words) {
239+
std::cout << "[" << word.start_ts << ", " << word.end_ts << "]: " << word.word << "\n";
240+
}
241+
}
242+
```
243+
</TabItemCpp>
244+
</LanguageTabs>
245+
246+
:::info
247+
The NPU device requires the `STATIC_PIPELINE=True` property to be passed to the `WhisperPipeline` constructor:
248+
`openvino_genai.WhisperPipeline(model_path, "NPU", word_timestamps=True, STATIC_PIPELINE=True)`
249+
:::
250+
207251
### Long-Form Audio Processing
208252

209253
Whisper models are designed for audio segments up to 30 seconds in length.

site/src/pages/_sections/UseCasesSection/components/speech-recognition.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ export const SpeechRecognition = () => (
1616
<UseCaseCard.Features>
1717
<li>Translate foreign language speech directly to English text</li>
1818
<li>Transcribe audio in multiple languages with automatic language detection</li>
19-
<li>Generate precise timestamps for synchronized subtitles and captions</li>
2019
<li>Process long-form audio content (&gt;30 seconds) efficiently</li>
20+
<li>Generate precise timestamps for synchronized subtitles and captions</li>
21+
<li>Generate word-level timestamps for detailed transcription</li>
2122
</UseCaseCard.Features>
2223
<UseCaseCard.Code>
2324
<LanguageTabs>
Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,38 @@
11
# Copyright (C) 2025 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import os
54
import pytest
65
import sys
6+
import re
77

88
from conftest import SAMPLES_PY_DIR, SAMPLES_CPP_DIR, SAMPLES_C_DIR
99
from test_utils import run_sample
1010

11+
12+
def filter_word_level_timestamps(text: str) -> str:
13+
"""
14+
example:
15+
How are you doing today?
16+
timestamps: [0.00, 2.00] text: How are you doing today?
17+
[0.00, 0.58]: How
18+
[0.58, 0.70]: are
19+
[0.70, 0.80]: you
20+
[0.80, 1.06]: doing
21+
[1.06, 1.40]: today?
22+
"""
23+
pattern = r"\[\d+\.\d{2}, \d+\.\d{2}\]:\s+\S+"
24+
filtered_text = re.sub(pattern, "", text).strip()
25+
return filtered_text
26+
27+
1128
class TestWhisperSpeechRecognition:
1229
@pytest.mark.whisper
1330
@pytest.mark.samples
1431
@pytest.mark.parametrize("convert_model", ["WhisperTiny"], indirect=True)
1532
@pytest.mark.parametrize("download_test_content", ["how_are_you_doing_today.wav"], indirect=True)
1633
def test_sample_whisper_speech_recognition(self, convert_model, download_test_content):
1734
# Run C++ sample
18-
cpp_sample = SAMPLES_CPP_DIR / 'whisper_speech_recognition'
35+
cpp_sample = SAMPLES_CPP_DIR / "whisper_speech_recognition"
1936
cpp_command = [cpp_sample, convert_model, download_test_content]
2037
cpp_result = run_sample(cpp_command)
2138

@@ -25,10 +42,14 @@ def test_sample_whisper_speech_recognition(self, convert_model, download_test_co
2542
py_result = run_sample(py_command)
2643

2744
# Run C sample
28-
c_sample = SAMPLES_C_DIR / 'whisper_speech_recognition_c'
45+
c_sample = SAMPLES_C_DIR / "whisper_speech_recognition_c"
2946
c_command = [c_sample, convert_model, download_test_content]
3047
c_result = run_sample(c_command)
3148

3249
# Compare results
3350
assert py_result.stdout == cpp_result.stdout, "Python and C++ results should match"
34-
assert py_result.stdout == c_result.stdout, "Python and C results should match"
51+
# The C API has no word-level timestamps support, which is enabled in the Python and C++ samples
52+
# ticket to enable C API: 180115
53+
assert filter_word_level_timestamps(py_result.stdout) == c_result.stdout.strip(), (
54+
"Python and C results should match without word-level timestamps"
55+
)

0 commit comments

Comments
 (0)