tazz4843 · tazz4843 · Aug 28, 2023 · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/BUILDING.md b/BUILDING.md
@@ -57,3 +57,36 @@ brew install cmake
 ```
 
 CMake can also be installed from https://cmake.org/download/ but `cmake` binary needs to be in your PATH.
+
+# OpenVINO support
+
+## Development Tools
+OpenVINO support requires the OpenVINO Development Tools to be installed. You can find
+instructions for installing the OpenVINO Development Tools here:
+https://docs.openvino.ai/2023.0/openvino_docs_install_guides_install_dev_tools.html#for-c-developers
+
+On Arch Linux, you can install the OpenVINO Development Tools with the following command:
+```
+paru -S openvino
+```
+This build may take a significant amount of time, but can save massive headaches later on.
+
+## Building
+First, the `openvino` feature must be enabled in your Cargo.toml.
+
+Next, you must set the `OpenVINO_DIR` environment variable to the path where CMake can find
+`OpenVINOConfig.cmake`.
+This is usually in the `cmake` directory of the OpenVINO installation.
+
+If you used the AUR package to install OpenVINO, this file is located in the `/opt/intel/openvino/runtime/cmake` directory.
+
+```
+export OpenVINO_DIR=/opt/intel/openvino/runtime/cmake
+```
+
+Finally, you can build whisper-rs as normal.
+
+## Tested platforms
+- Arch Linux
+
+If you have successfully built whisper-rs with OpenVINO on another platform, please open a PR to document it here!
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [workspace]
 members = ["sys"]
-exclude = ["examples/full_usage"]
+exclude = ["examples/examples_common", "examples/full_usage", "examples/openvino_usage"]
 
 [package]
 name = "whisper-rs"
@@ -34,6 +34,7 @@ metal = ["whisper-rs-sys/metal", "_gpu"]
 vulkan = ["whisper-rs-sys/vulkan", "_gpu"]
 openmp = ["whisper-rs-sys/openmp"]
 _gpu = []
+openvino = ["whisper-rs-sys/openvino"]
 test-with-tiny-model = []
 
 # Bring logs into Rust via the log crate. *Warning*: not mutually exclusive with tracing_backend,
@@ -43,3 +44,7 @@ log_backend = ["dep:log"]
 # Bring logs into Rust via the tracing crate. *Warning*: not mutually exclusive with log_backend,
 # will result in duplicate logs if both are enabled and one consumes logs from the other.
 tracing_backend = ["dep:tracing"]
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
diff --git a/README.md b/README.md
@@ -80,7 +80,9 @@ All disabled by default unless otherwise specified.
 
 ## Building
 
-See [BUILDING.md](BUILDING.md) for instructions for building whisper-rs on Windows and OSX M1. Linux builds should just
+See [BUILDING.md](BUILDING.md) for instructions for building whisper-rs on Windows and OSX M1,
+or with OpenVINO on any OS.
+Besides OpenVINO, Linux builds should just
 work out of the box.
 
 ## Troubleshooting

diff --git a/examples/examples_common/Cargo.toml b/examples/examples_common/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "examples-common"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+hound = "3"
diff --git a/examples/examples_common/src/lib.rs b/examples/examples_common/src/lib.rs
@@ -0,0 +1,24 @@
+use hound::{SampleFormat, WavReader};
+use std::path::Path;
+
+/// Loads every sample of a mono, 16 kHz, 16-bit integer PCM WAV file.
+///
+/// # Panics
+/// Panics if the file cannot be opened, if it does not have exactly that format,
+/// or if any sample fails to decode.
+pub fn parse_wav_file(path: &Path) -> Vec<i16> {
+    let reader = WavReader::open(path).expect("failed to read file");
+    let spec = reader.spec();
+
+    // The first failing check decides the panic message.
+    assert!(spec.channels == 1, "expected mono audio file");
+    assert!(spec.sample_format == SampleFormat::Int, "expected integer sample format");
+    assert!(spec.sample_rate == 16000, "expected 16KHz sample rate");
+    assert!(spec.bits_per_sample == 16, "expected 16 bits per sample");
+
+    let samples = reader.into_samples::<i16>();
+    samples.map(|sample| sample.expect("sample")).collect()
+}
diff --git a/examples/full_usage/2830-3980-0043.wav b/examples/full_usage/2830-3980-0043.wav
diff --git a/examples/full_usage/Cargo.toml b/examples/full_usage/Cargo.toml
@@ -1,10 +1,8 @@
 [package]
 name = "full_usage"
 version = "0.1.0"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2024"
 
 [dependencies]
-hound = "3"
 whisper-rs = { path = "../.." }
+examples-common = { path = "../examples_common" }
diff --git a/examples/full_usage/src/main.rs b/examples/full_usage/src/main.rs
@@ -1,31 +1,8 @@
 #![allow(clippy::uninlined_format_args)]
 
-use hound::{SampleFormat, WavReader};
 use std::path::Path;
 use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
 
-fn parse_wav_file(path: &Path) -> Vec<i16> {
-    let reader = WavReader::open(path).expect("failed to read file");
-
-    if reader.spec().channels != 1 {
-        panic!("expected mono audio file");
-    }
-    if reader.spec().sample_format != SampleFormat::Int {
-        panic!("expected integer sample format");
-    }
-    if reader.spec().sample_rate != 16000 {
-        panic!("expected 16KHz sample rate");
-    }
-    if reader.spec().bits_per_sample != 16 {
-        panic!("expected 16 bits per sample");
-    }
-
-    reader
-        .into_samples::<i16>()
-        .map(|x| x.expect("sample"))
-        .collect::<Vec<_>>()
-}
-
 fn main() {
     let arg1 = std::env::args()
         .nth(1)
@@ -42,7 +19,7 @@ fn main() {
         panic!("whisper file doesn't exist")
     }
 
-    let original_samples = parse_wav_file(audio_path);
+    let original_samples = examples_common::parse_wav_file(audio_path);
     let mut samples = vec![0.0f32; original_samples.len()];
     whisper_rs::convert_integer_to_float_audio(&original_samples, &mut samples)
         .expect("failed to convert samples");

diff --git a/examples/openvino_usage/Cargo.toml b/examples/openvino_usage/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "openvino_usage"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+whisper-rs = { path = "../..", features = ["openvino"] }
+examples-common = { path = "../examples_common" }
diff --git a/examples/openvino_usage/README.md b/examples/openvino_usage/README.md
@@ -0,0 +1,41 @@
+# OpenVINO Usage Example
+
+Run `cargo build --release` in this directory,
+then `./target/release/openvino_usage ../examples_common/2830-3980-0043.wav /path/to/ggml-model.bin`
+
+There should be an OpenVINO file associated with the model next to it,
+otherwise you will get an error at runtime.
+
+## Getting your paws on OpenVINO data
+
+Unfortunately, there are no downloads of OpenVINO state available. The only way to get it is to generate it.
+
+Example for most Linux distros (run this from the current directory):
+
+```bash
+cd ../..
+
+# We need to pull in whisper.cpp.
+# This should've already been done when you cloned the repo, but let's be sure.
+git submodule update --init --recursive
+
+cd sys/whisper.cpp/models/
+
+# Generate a new venv and install the required things.
+# This might take a bit, grab a drink.
+# (yes this installs CUDA even if you don't have a Nvidia GPU, enjoy your 6GB venv setup)
+python3.12 -m venv venv
+source venv/bin/activate
+python3 -m pip install -U pip
+python3 -m pip install -r requirements-openvino.txt
+
+# This is the key line. Change base as necessary to the name of the model you want.
+python3 convert-whisper-to-openvino.py --model base
+```
+
+Note that a line in the output stating
+`assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"`
+is not fatal.
+The output file will still be generated normally.
+
+See upstream's README for more info: https://github.com/ggerganov/whisper.cpp/#openvino-support
diff --git a/examples/openvino_usage/src/main.rs b/examples/openvino_usage/src/main.rs
@@ -0,0 +1,66 @@
+#![allow(clippy::uninlined_format_args)]
+
+use std::path::Path;
+use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
+
+/// Transcribes a WAV file with a Whisper model, using OpenVINO for the encoder.
+///
+/// Usage: `openvino_usage <audio.wav> <ggml-model.bin>`.
+/// The audio file is loaded with `examples_common::parse_wav_file`.
+/// Every failure (missing argument, missing file, model error) aborts with a panic.
+fn main() {
+    let arg1 = std::env::args()
+        .nth(1)
+        .expect("first argument should be path to WAV file");
+    let audio_path = Path::new(&arg1);
+    if !audio_path.exists() {
+        panic!("audio file doesn't exist");
+    }
+    let arg2 = std::env::args()
+        .nth(2)
+        .expect("second argument should be path to Whisper model");
+    let whisper_path = Path::new(&arg2);
+    if !whisper_path.exists() {
+        panic!("whisper file doesn't exist")
+    }
+
+    // The model is fed f32 samples, so convert the 16-bit integer PCM first.
+    let original_samples = examples_common::parse_wav_file(audio_path);
+    let mut samples = vec![0.0f32; original_samples.len()];
+    whisper_rs::convert_integer_to_float_audio(&original_samples, &mut samples)
+        .expect("failed to convert samples");
+
+    let ctx = WhisperContext::new_with_params(
+        &whisper_path.to_string_lossy(),
+        WhisperContextParameters::default(),
+    )
+    .expect("failed to open model");
+    let mut state = ctx.create_state().expect("failed to create a model state");
+
+    // Enable OpenVINO now.
+    // Passing `None` for the model path and cache dir means we're expecting
+    // the OpenVINO file sitting right next to the model.
+    state
+        .init_openvino_encoder(None, "GPU", None)
+        .expect("failed to enable openvino");
+
+    let mut params = FullParams::new(SamplingStrategy::default());
+    params.set_initial_prompt("experience");
+    params.set_progress_callback_safe(|progress| println!("Progress callback: {}%", progress));
+
+    // Time only the inference itself, not the printing of the results below.
+    let st = std::time::Instant::now();
+    state.full(params, &samples).expect("failed to run model");
+    let elapsed = st.elapsed();
+
+    let num_segments = state
+        .full_n_segments()
+        .expect("failed to get number of segments");
+    for i in 0..num_segments {
+        let segment = state
+            .full_get_segment_text(i)
+            .expect("failed to get segment");
+        let start_timestamp = state
+            .full_get_segment_t0(i)
+            .expect("failed to get start timestamp");
+        let end_timestamp = state
+            .full_get_segment_t1(i)
+            .expect("failed to get end timestamp");
+        println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
+    }
+    println!("took {}ms", elapsed.as_millis());
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,5 +1,6 @@
 #![allow(clippy::uninlined_format_args)]
 #![cfg_attr(test, feature(test))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 
 mod common_logging;
 mod error;

diff --git a/src/whisper_ctx.rs b/src/whisper_ctx.rs
@@ -70,6 +70,8 @@ impl WhisperInnerContext {
         }
     }
 
+    // we don't implement `whisper_init()` here since i have zero clue what `whisper_model_loader` does
+
     /// Convert the provided text into tokens.
     ///
     /// # Arguments

diff --git a/src/whisper_state.rs b/src/whisper_state.rs
@@ -1,4 +1,4 @@
-use std::ffi::{c_int, CStr};
+use std::ffi::{c_int, CStr, CString};
 use std::sync::Arc;
 
 use crate::{FullParams, WhisperError, WhisperInnerContext, WhisperToken, WhisperTokenData};
@@ -30,6 +30,68 @@ impl WhisperState {
         Self { ctx, ptr }
     }
 
+    /// Enable use of OpenVINO for encoder inference on this state.
+    ///
+    /// # Arguments
+    /// * `model_path`: An optional path to the OpenVINO encoder IR model.
+    ///   Setting this to [`None`] will default to a file next to the path
+    ///   passed in to [`crate::WhisperContext::new_with_params`].
+    ///   For example, if the model path was `/path/to/ggml-base.en.bin`,
+    ///   then the OpenVINO IR model path will default to `/path/to/ggml-base.en-encoder-openvino.xml`.
+    ///
+    /// * `device`: The OpenVINO device to use for inference (e.g. `CPU`, `GPU`).
+    ///   This is a string, as OpenVINO can randomly add new devices,
+    ///   and having a hardcoded enum would result in major issues if one tried
+    ///   to use a new device with an outdated enum.
+    ///   Chances are unless you're doing something special, you just want `GPU` for this string.
+    ///
+    /// * `cache_dir`: Optional cache directory that can speed up init time,
+    ///   especially for GPU, by caching compiled 'blobs' in it.
+    ///   Setting this to [`None`] will default to placing this directory next to the model path.
+    ///   For example, if the model path was `/path/to/ggml-base.en.bin`,
+    ///   then the cache dir model path will default to `/path/to/ggml-base.en-encoder-openvino-cache`.
+    ///
+    /// **Note**: if you called [`crate::WhisperContext::new_from_buffer_with_params`], and either
+    /// `model_path` or `cache_dir` is None, this function will fail to initialize,
+    /// as there's no path for it to initialize from.
+    ///
+    /// # Returns
+    /// `Ok(())` if no error.
+    ///
+    /// `Err(WhisperError::GenericError(ret))` on error,
+    /// where ret is the return value from `whisper.cpp` (as of writing, this will always be 1).
+    /// Checking output logs for the actual error is more productive
+    /// than looking at the contained value.
+    ///
+    /// # Panics
+    /// Panics if `model_path`, `device` or `cache_dir` contains an interior NUL byte,
+    /// since such a string cannot be converted to a C string.
+    ///
+    /// # C++ equivalent
+    /// `int whisper_ctx_init_openvino_encoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const char * model_path, const char * device, const char * cache_dir);`
+    #[cfg(feature = "openvino")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "openvino")))]
+    pub fn init_openvino_encoder(
+        &mut self,
+        model_path: Option<&str>,
+        device: &str,
+        cache_dir: Option<&str>,
+    ) -> Result<(), WhisperError> {
+        // Keep the owned C strings in locals so they stay alive across the FFI call.
+        let model_path = model_path.map(|s| CString::new(s).unwrap());
+        let device = CString::new(device).unwrap();
+        let cache_dir = cache_dir.map(|s| CString::new(s).unwrap());
+
+        // Borrow the options instead of consuming them: moving a `CString` into the
+        // closure would drop it as soon as the closure returns, and the pointer
+        // handed to whisper.cpp would dangle.
+        let model_path_ptr = model_path
+            .as_ref()
+            .map_or(std::ptr::null(), |s| s.as_ptr());
+        let cache_dir_ptr = cache_dir
+            .as_ref()
+            .map_or(std::ptr::null(), |s| s.as_ptr());
+
+        // SAFETY: every string pointer is either null or points into a `CString` local
+        // that outlives this call. `self.ctx.ctx` and `self.ptr` are the handles this
+        // state holds; they are assumed to remain valid for as long as `self` is alive.
+        let ret = unsafe {
+            whisper_rs_sys::whisper_ctx_init_openvino_encoder_with_state(
+                self.ctx.ctx,
+                self.ptr,
+                model_path_ptr,
+                device.as_ptr(),
+                cache_dir_ptr,
+            )
+        };
+        if ret == 0 {
+            Ok(())
+        } else {
+            Err(WhisperError::GenericError(ret))
+        }
+    }
+
     /// Convert raw PCM audio (floating point 32 bit) to log mel spectrogram.
     /// The resulting spectrogram is stored in the context transparently.
     ///

diff --git a/sys/Cargo.toml b/sys/Cargo.toml
@@ -35,6 +35,9 @@ metal = []
 vulkan = []
 force-debug = []
 openmp = []
+openvino = []
+
+[dependencies]
 
 [build-dependencies]
 cmake = "0.1"
-Original file line number
+Diff line change
@@ Expand Up / @@ -70,6 +70,8 @@ impl WhisperInnerContext { @@
             }
         }
+        // we don't implement `whisper_init()` here since i have zero clue what `whisper_model_loader` does
         /// Convert the provided text into tokens.
         ///
         /// # Arguments
@@ Expand Down @@