Skip to content

Commit 3d50de7

Browse files
authored
[Docs]: Add whisper word level timestamps (openvinotoolkit#3231)
<!-- Keep your pull requests (PRs) as atomic as possible. That increases the likelihood that an individual PR won't be stuck because of adjacent problems, merge conflicts, or code review. Your merged PR is going to appear in the automatically generated release notes on GitHub. So the clearer the title the better. --> ## Description <!-- Please include a summary of the change. Also include relevant motivation and context. --> * Add documentation * Enable `word_timestamps` in samples * Adjust samples tests for C API which has no `word_timestamps` bindings * Remove transformers downgrade for mac workflow. As workflow now uses arm arch. <!-- Jira ticket number (e.g., 123). Delete if there's no ticket. --> CVS-179419 CVS-179417 ## Checklist: - [x] This patch fully addresses the ticket. <!--- If follow-up pull requests are needed, specify in description. --> - [x] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. -->
1 parent 7a34168 commit 3d50de7

File tree

6 files changed

+94
-14
lines changed

6 files changed

+94
-14
lines changed

.github/workflows/mac.yml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -600,13 +600,6 @@ jobs:
600600
requirements_files: "${{ env.SRC_DIR }}/samples/requirements.txt"
601601
local_wheel_dir: ${{ env.INSTALL_DIR }}/wheels
602602

603-
# transformers >= 4.52 require torch >= 2.6 and raise an error otherwise:
604-
# ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
605-
# See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434
606-
607-
# x86_64 macOS does not (and will not) support newer versions of torch > 2.2 which are used in the newer transformers versions. It's not possible to lower the transformers version in requirements.txt because that triggers a vulnerability alert: https://github.com/openvinotoolkit/openvino_tokenizers/security/dependabot/11
608-
- run: python -m pip install "transformers<4.52"
609-
610603
- name: Fix C++ samples permissions
611604
if: ${{ matrix.test.run_condition }}
612605
run: chmod +x ${{ env.INSTALL_DIR }}/samples_bin/*

samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,25 @@ int main(int argc, char* argv[]) try {
2020
std::string device = (argc == 4) ? argv[3] : "CPU"; // Default to CPU if no device is provided
2121

2222
ov::AnyMap ov_config;
23-
if (device == "NPU" || device.find("GPU") != std::string::npos) { // need to handle cases like "GPU", "GPU.0" and "GPU.1"
23+
if (device == "NPU" ||
24+
device.find("GPU") != std::string::npos) { // need to handle cases like "GPU", "GPU.0" and "GPU.1"
2425
// Cache compiled models on disk for GPU and NPU to save time on the
2526
// next run. It's not beneficial for CPU.
2627
ov_config = get_config_for_cache();
2728
}
2829

30+
// Word timestamps require decomposition of cross-attention decoder SDPA layers,
31+
// so word_timestamps must be passed to the pipeline constructor (not just in generation config)
32+
ov_config.insert(ov::genai::word_timestamps(true));
33+
2934
ov::genai::WhisperPipeline pipeline(models_path, device, ov_config);
3035

3136
ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config();
3237
// 'task' and 'language' parameters are supported for multilingual models only
3338
config.language = "<|en|>"; // can switch to <|zh|> for Chinese language
3439
config.task = "transcribe";
3540
config.return_timestamps = true;
41+
config.word_timestamps = true;
3642

3743
// Pipeline expects normalized audio with Sample Rate of 16kHz
3844
ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);
@@ -45,6 +51,10 @@ int main(int argc, char* argv[]) try {
4551
std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
4652
}
4753

54+
for (auto& word : *result.words) {
55+
std::cout << "[" << word.start_ts << ", " << word.end_ts << "]: " << word.word << "\n";
56+
}
57+
4858
} catch (const std::exception& error) {
4959
try {
5060
std::cerr << error.what() << '\n';

samples/python/whisper_speech_recognition/whisper_speech_recognition.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@ def read_wav(filepath):
1111
raw_speech, samplerate = librosa.load(filepath, sr=16000)
1212
return raw_speech.tolist()
1313

14+
1415
def get_config_for_cache():
1516
config_cache = dict()
1617
config_cache["CACHE_DIR"] = "whisper_cache"
1718
return config_cache
1819

20+
1921
def main():
2022
parser = argparse.ArgumentParser()
2123
parser.add_argument("model_dir", help="Path to the model directory")
@@ -24,18 +26,23 @@ def main():
2426
args = parser.parse_args()
2527

2628
ov_config = dict()
27-
if args.device == "NPU" or "GPU" in args.device: # need to handle cases like "GPU", "GPU.0" and "GPU.1"
29+
if args.device == "NPU" or "GPU" in args.device: # need to handle cases like "GPU", "GPU.0" and "GPU.1"
2830
# Cache compiled models on disk for GPU and NPU to save time on the
2931
# next run. It's not beneficial for CPU.
3032
ov_config = get_config_for_cache()
3133

34+
# Word timestamps require decomposition of cross-attention decoder SDPA layers,
35+
# so word_timestamps must be passed to the pipeline constructor (not just in generation config)
36+
ov_config["word_timestamps"] = True
37+
3238
pipe = openvino_genai.WhisperPipeline(args.model_dir, args.device, **ov_config)
3339

3440
config = pipe.get_generation_config()
3541
# 'task' and 'language' parameters are supported for multilingual models only
3642
config.language = "<|en|>" # can switch to <|zh|> for Chinese language
3743
config.task = "transcribe"
3844
config.return_timestamps = True
45+
config.word_timestamps = True
3946

4047
# Pipeline expects normalized audio with Sample Rate of 16kHz
4148
raw_speech = read_wav(args.wav_file_path)
@@ -47,6 +54,10 @@ def main():
4754
for chunk in result.chunks:
4855
print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
4956

57+
if result.words:
58+
for word in result.words:
59+
print(f"[{word.start_ts:.2f}, {word.end_ts:.2f}]: {word.word}")
60+
5061

5162
if "__main__" == __name__:
5263
main()

site/docs/use-cases/speech-recognition/_sections/_usage_options/index.mdx

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,50 @@ Whisper can predict timestamps for each segment of speech, which is useful for s
204204
</TabItemCpp>
205205
</LanguageTabs>
206206

207+
### Word-level Timestamps Prediction
208+
209+
Whisper can predict timestamps for each word of speech, which provides more granular timing information compared to segment-level timestamps.
210+
211+
<LanguageTabs>
212+
<TabItemPython>
213+
```python
214+
# Word timestamps require decomposition of cross-attention decoder SDPA layers,
215+
# so word_timestamps must be passed to the pipeline constructor (not just in generation config)
216+
pipe = openvino_genai.WhisperPipeline(model_path, "CPU", word_timestamps=True)
217+
218+
# Enable word-level timestamp prediction
219+
result = pipe.generate(raw_speech, word_timestamps=True)
220+
221+
# Print word-level timestamps
222+
for word in result.words:
223+
print(f"[{word.start_ts:.2f}, {word.end_ts:.2f}]: {word.word}")
224+
```
225+
</TabItemPython>
226+
<TabItemCpp>
227+
```cpp
228+
int main() {
229+
// Word timestamps require decomposition of cross-attention decoder SDPA layers,
230+
// so word_timestamps must be passed to the pipeline constructor (not just in generation config)
231+
ov::genai::WhisperPipeline pipeline(model_path, "CPU", ov::genai::word_timestamps(true));
232+
233+
// Enable word-level timestamp prediction
234+
auto result = pipeline.generate(raw_speech, ov::genai::word_timestamps(true));
235+
236+
// Print word-level timestamps
237+
std::cout << std::fixed << std::setprecision(2);
238+
for (auto& word : *result.words) {
239+
std::cout << "[" << word.start_ts << ", " << word.end_ts << "]: " << word.word << "\n";
240+
}
241+
}
242+
```
243+
</TabItemCpp>
244+
</LanguageTabs>
245+
246+
:::info
247+
The NPU device requires the `STATIC_PIPELINE=True` property to be passed to the `WhisperPipeline` constructor:
248+
`openvino_genai.WhisperPipeline(model_path, "NPU", word_timestamps=True, STATIC_PIPELINE=True)`
249+
:::
250+
207251
### Long-Form Audio Processing
208252

209253
Whisper models are designed for audio segments up to 30 seconds in length.

site/src/pages/_sections/UseCasesSection/components/speech-recognition.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ export const SpeechRecognition = () => (
1616
<UseCaseCard.Features>
1717
<li>Translate foreign language speech directly to English text</li>
1818
<li>Transcribe audio in multiple languages with automatic language detection</li>
19-
<li>Generate precise timestamps for synchronized subtitles and captions</li>
2019
<li>Process long-form audio content (&gt;30 seconds) efficiently</li>
20+
<li>Generate precise timestamps for synchronized subtitles and captions</li>
21+
<li>Generate word-level timestamps for detailed transcription</li>
2122
</UseCaseCard.Features>
2223
<UseCaseCard.Code>
2324
<LanguageTabs>
Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,38 @@
11
# Copyright (C) 2025 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import os
54
import pytest
65
import sys
6+
import re
77

88
from conftest import SAMPLES_PY_DIR, SAMPLES_CPP_DIR, SAMPLES_C_DIR
99
from test_utils import run_sample
1010

11+
12+
def filter_word_level_timestamps(text: str) -> str:
13+
"""
14+
example:
15+
How are you doing today?
16+
timestamps: [0.00, 2.00] text: How are you doing today?
17+
[0.00, 0.58]: How
18+
[0.58, 0.70]: are
19+
[0.70, 0.80]: you
20+
[0.80, 1.06]: doing
21+
[1.06, 1.40]: today?
22+
"""
23+
pattern = r"\[\d+\.\d{2}, \d+\.\d{2}\]:\s+\S+"
24+
filtered_text = re.sub(pattern, "", text).strip()
25+
return filtered_text
26+
27+
1128
class TestWhisperSpeechRecognition:
1229
@pytest.mark.whisper
1330
@pytest.mark.samples
1431
@pytest.mark.parametrize("convert_model", ["WhisperTiny"], indirect=True)
1532
@pytest.mark.parametrize("download_test_content", ["how_are_you_doing_today.wav"], indirect=True)
1633
def test_sample_whisper_speech_recognition(self, convert_model, download_test_content):
1734
# Run C++ sample
18-
cpp_sample = SAMPLES_CPP_DIR / 'whisper_speech_recognition'
35+
cpp_sample = SAMPLES_CPP_DIR / "whisper_speech_recognition"
1936
cpp_command = [cpp_sample, convert_model, download_test_content]
2037
cpp_result = run_sample(cpp_command)
2138

@@ -25,10 +42,14 @@ def test_sample_whisper_speech_recognition(self, convert_model, download_test_co
2542
py_result = run_sample(py_command)
2643

2744
# Run C sample
28-
c_sample = SAMPLES_C_DIR / 'whisper_speech_recognition_c'
45+
c_sample = SAMPLES_C_DIR / "whisper_speech_recognition_c"
2946
c_command = [c_sample, convert_model, download_test_content]
3047
c_result = run_sample(c_command)
3148

3249
# Compare results
3350
assert py_result.stdout == cpp_result.stdout, "Python and C++ results should match"
34-
assert py_result.stdout == c_result.stdout, "Python and C results should match"
51+
# The C API has no word-level timestamps support, which is enabled in the Python and C++ samples
52+
# ticket to enable C API: 180115
53+
assert filter_word_level_timestamps(py_result.stdout) == c_result.stdout.strip(), (
54+
"Python and C results should match without word-level timestamps"
55+
)

0 commit comments

Comments
 (0)