diff --git a/BUILDING.md b/BUILDING.md index 0be1f8aa..35bff87d 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -57,3 +57,36 @@ brew install cmake ``` CMake can also be installed from https://cmake.org/download/ but `cmake` binary needs to be in your PATH. + +# OpenVINO support + +## Development Tools +OpenVINO support requires the OpenVINO Development Tools to be installed. You can find +instructions for installing the OpenVINO Development Tools here: +https://docs.openvino.ai/2023.0/openvino_docs_install_guides_install_dev_tools.html#for-c-developers + +On Arch Linux, you can install the OpenVINO Development Tools with the following command: +``` +paru -S openvino +``` +This build may take a significant amount of time, but can save massive headaches later on. + +## Building +First, the `openvino` feature must be enabled in your Cargo.toml. + +Next, you must set the `OpenVINO_DIR` environment variable to the path where CMake can find +`OpenVINOConfig.cmake`. +This is usually in the `cmake` directory of the OpenVINO installation. + +If you used the AUR package to install OpenVINO, the location of this file is `/opt/intel/openvino/runtime/cmake`. + +``` +export OpenVINO_DIR=/opt/intel/openvino/runtime/cmake +``` + +Finally, you can build whisper-rs as normal. + +## Tested platforms +- Arch Linux + +If you have successfully built whisper-rs with OpenVINO on another platform, please open a PR to document it here! 
\ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 3d170c1e..b0ab54f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] members = ["sys"] -exclude = ["examples/full_usage"] +exclude = ["examples/examples_common", "examples/full_usage", "examples/openvino_usage"] [package] name = "whisper-rs" @@ -34,6 +34,7 @@ metal = ["whisper-rs-sys/metal", "_gpu"] vulkan = ["whisper-rs-sys/vulkan", "_gpu"] openmp = ["whisper-rs-sys/openmp"] _gpu = [] +openvino = ["whisper-rs-sys/openvino"] test-with-tiny-model = [] # Bring logs into Rust via the log crate. *Warning*: not mutually exclusive with tracing_backend, @@ -43,3 +44,7 @@ log_backend = ["dep:log"] # Bring logs into Rust via the tracing crate. *Warning*: not mutually exclusive with log_backend, # will result in duplicate logs if both are enabled and one consumes logs from the other. tracing_backend = ["dep:tracing"] + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] diff --git a/README.md b/README.md index fb16a90f..f39fd53d 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,9 @@ All disabled by default unless otherwise specified. ## Building -See [BUILDING.md](BUILDING.md) for instructions for building whisper-rs on Windows and OSX M1. Linux builds should just +See [BUILDING.md](BUILDING.md) for instructions for building whisper-rs on Windows and OSX M1, +or with OpenVINO on any OS. +Besides OpenVINO, Linux builds should just work out of the box. 
## Troubleshooting diff --git a/examples/examples_common/Cargo.toml b/examples/examples_common/Cargo.toml new file mode 100644 index 00000000..65ecd47a --- /dev/null +++ b/examples/examples_common/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "examples-common" +version = "0.1.0" +edition = "2024" + +[dependencies] +hound = "3" diff --git a/examples/examples_common/src/lib.rs b/examples/examples_common/src/lib.rs new file mode 100644 index 00000000..147399fc --- /dev/null +++ b/examples/examples_common/src/lib.rs @@ -0,0 +1,24 @@ +use hound::{SampleFormat, WavReader}; +use std::path::Path; + +pub fn parse_wav_file(path: &Path) -> Vec<i16> { + let reader = WavReader::open(path).expect("failed to read file"); + + if reader.spec().channels != 1 { + panic!("expected mono audio file"); + } + if reader.spec().sample_format != SampleFormat::Int { + panic!("expected integer sample format"); + } + if reader.spec().sample_rate != 16000 { + panic!("expected 16KHz sample rate"); + } + if reader.spec().bits_per_sample != 16 { + panic!("expected 16 bits per sample"); + } + + reader + .into_samples::<i16>() + .map(|x| x.expect("sample")) + .collect::<Vec<_>>() +} diff --git a/examples/full_usage/2830-3980-0043.wav b/examples/full_usage/2830-3980-0043.wav deleted file mode 100644 index d6b84631..00000000 Binary files a/examples/full_usage/2830-3980-0043.wav and /dev/null differ diff --git a/examples/full_usage/Cargo.toml b/examples/full_usage/Cargo.toml index 97020b06..6b31bb57 100644 --- a/examples/full_usage/Cargo.toml +++ b/examples/full_usage/Cargo.toml @@ -1,10 +1,8 @@ [package] name = "full_usage" version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +edition = "2024" [dependencies] -hound = "3" whisper-rs = { path = "../.." 
} +examples-common = { path = "../examples_common" } diff --git a/examples/full_usage/src/main.rs b/examples/full_usage/src/main.rs index c3352fd5..9f09cfbc 100644 --- a/examples/full_usage/src/main.rs +++ b/examples/full_usage/src/main.rs @@ -1,31 +1,8 @@ #![allow(clippy::uninlined_format_args)] -use hound::{SampleFormat, WavReader}; use std::path::Path; use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; -fn parse_wav_file(path: &Path) -> Vec<i16> { - let reader = WavReader::open(path).expect("failed to read file"); - - if reader.spec().channels != 1 { - panic!("expected mono audio file"); - } - if reader.spec().sample_format != SampleFormat::Int { - panic!("expected integer sample format"); - } - if reader.spec().sample_rate != 16000 { - panic!("expected 16KHz sample rate"); - } - if reader.spec().bits_per_sample != 16 { - panic!("expected 16 bits per sample"); - } - - reader - .into_samples::<i16>() - .map(|x| x.expect("sample")) - .collect::<Vec<_>>() -} - fn main() { let arg1 = std::env::args() .nth(1) @@ -42,7 +19,7 @@ fn main() { panic!("whisper file doesn't exist") } - let original_samples = parse_wav_file(audio_path); + let original_samples = examples_common::parse_wav_file(audio_path); let mut samples = vec![0.0f32; original_samples.len()]; whisper_rs::convert_integer_to_float_audio(&original_samples, &mut samples) .expect("failed to convert samples"); diff --git a/examples/openvino_usage/Cargo.toml b/examples/openvino_usage/Cargo.toml new file mode 100644 index 00000000..4dbb3c6b --- /dev/null +++ b/examples/openvino_usage/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "openvino_usage" +version = "0.1.0" +edition = "2024" + +[dependencies] +whisper-rs = { path = "../..", features = ["openvino"] } +examples-common = { path = "../examples_common" } diff --git a/examples/openvino_usage/README.md b/examples/openvino_usage/README.md new file mode 100644 index 00000000..49eb69b0 --- /dev/null +++ b/examples/openvino_usage/README.md @@ -0,0 
+1,41 @@ +# OpenVINO Usage Example + +Run `cargo build --release` in this directory, +then `./target/release/openvino_usage ../examples_common/2830-3980-0043.wav /path/to/ggml-model.bin` + +There should be an OpenVINO file associated with the model next to it, +otherwise you will get an error at runtime. + +## Getting your paws on OpenVINO data + +Unfortunately there are no downloads of OpenVINO state available. The only way to get it is to generate it. + +Example for most Linux distros (run this from the current directory): + +```bash +cd ../.. + +# We need to pull in whisper.cpp. +# This should've already been done when you cloned the repo, but let's be sure. +git submodule update --init --recursive + +cd sys/whisper.cpp/models/ + +# Generate a new venv and install the required things. +# This might take a bit, grab a drink. +# (yes this installs CUDA even if you don't have an Nvidia GPU, enjoy your 6GB venv setup) +python3.12 -m venv venv +source venv/bin/activate +python3 -m pip install -U pip +python3 -m pip install -r requirements-openvino.txt + +# This is the key line. Change base as necessary to the name of the model you want. +python3 convert-whisper-to-openvino.py --model base +``` + +Do note that a line stating +`assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"` +is not fatal. +The output file will still be generated normally. 
+ +See upstream's README for more info: https://github.com/ggerganov/whisper.cpp/#openvino-support diff --git a/examples/openvino_usage/src/main.rs b/examples/openvino_usage/src/main.rs new file mode 100644 index 00000000..d24f3f4f --- /dev/null +++ b/examples/openvino_usage/src/main.rs @@ -0,0 +1,66 @@ +#![allow(clippy::uninlined_format_args)] + +use std::path::Path; +use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; + +fn main() { + let arg1 = std::env::args() + .nth(1) + .expect("first argument should be path to WAV file"); + let audio_path = Path::new(&arg1); + if !audio_path.exists() { + panic!("audio file doesn't exist"); + } + let arg2 = std::env::args() + .nth(2) + .expect("second argument should be path to Whisper model"); + let whisper_path = Path::new(&arg2); + if !whisper_path.exists() { + panic!("whisper file doesn't exist") + } + + let original_samples = examples_common::parse_wav_file(audio_path); + let mut samples = vec![0.0f32; original_samples.len()]; + whisper_rs::convert_integer_to_float_audio(&original_samples, &mut samples) + .expect("failed to convert samples"); + + let ctx = WhisperContext::new_with_params( + &whisper_path.to_string_lossy(), + WhisperContextParameters::default(), + ) + .expect("failed to open model"); + let mut state = ctx.create_state().expect("failed to create a model state"); + + // Enable OpenVINO now + // We're expecting the OpenVINO file sitting right next to the model + state + .init_openvino_encoder(None, "GPU", None) + .expect("failed to enable openvino"); + + let mut params = FullParams::new(SamplingStrategy::default()); + params.set_initial_prompt("experience"); + params.set_progress_callback_safe(|progress| println!("Progress callback: {}%", progress)); + + let st = std::time::Instant::now(); + state + .full(params, &samples) + .expect("failed to convert samples"); + let et = std::time::Instant::now(); + + let num_segments = state + .full_n_segments() + .expect("failed to 
get number of segments"); + for i in 0..num_segments { + let segment = state + .full_get_segment_text(i) + .expect("failed to get segment"); + let start_timestamp = state + .full_get_segment_t0(i) + .expect("failed to get start timestamp"); + let end_timestamp = state + .full_get_segment_t1(i) + .expect("failed to get end timestamp"); + println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment); + } + println!("took {}ms", (et - st).as_millis()); +} diff --git a/src/lib.rs b/src/lib.rs index a6632a94..e53c4852 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ #![allow(clippy::uninlined_format_args)] #![cfg_attr(test, feature(test))] +#![cfg_attr(docsrs, feature(doc_cfg))] mod common_logging; mod error; diff --git a/src/whisper_ctx.rs b/src/whisper_ctx.rs index 0e7be5cb..95afa445 100644 --- a/src/whisper_ctx.rs +++ b/src/whisper_ctx.rs @@ -70,6 +70,8 @@ impl WhisperInnerContext { } } + // we don't implement `whisper_init()` here since i have zero clue what `whisper_model_loader` does + /// Convert the provided text into tokens. /// /// # Arguments diff --git a/src/whisper_state.rs b/src/whisper_state.rs index 22418eee..0505bcde 100644 --- a/src/whisper_state.rs +++ b/src/whisper_state.rs @@ -1,4 +1,4 @@ -use std::ffi::{c_int, CStr}; +use std::ffi::{c_int, CStr, CString}; use std::sync::Arc; use crate::{FullParams, WhisperError, WhisperInnerContext, WhisperToken, WhisperTokenData}; @@ -30,6 +30,68 @@ impl WhisperState { Self { ctx, ptr } } + /// Using this context, enable use of OpenVINO for encoder inference. + /// + /// # Arguments + /// * `model_path`: An optional path to the OpenVINO encoder IR model. + /// Setting this to [`None`] will default to a file next to the path + /// passed in to [`crate::WhisperContext::new_with_params`]. + /// For example, if the model path was `/path/to/ggml-base.en.bin`, + /// then the OpenVINO IR model path will default to `/path/to/ggml-base.en-encoder-openvino.xml`. 
+ /// + /// * `device`: The OpenVINO device to use for inference (e.g. `CPU`, `GPU`). + /// This is a string, as OpenVINO can randomly add new devices, + /// and having a hardcoded enum would result in major issues if one tried + /// to use a new device with an outdated enum. + /// Chances are unless you're doing something special, you just want `GPU` for this string. + /// + /// * `cache_dir`: Optional cache directory that can speed up init time, + /// especially for GPU, by caching compiled 'blobs' in it. + /// Setting this to [`None`] will default to placing this directory next to the model path. + /// For example, if the model path was `/path/to/ggml-base.en.bin`, + /// then the cache dir model path will default to `/path/to/ggml-base.en-encoder-openvino-cache`. + /// + /// **Note**: if you called [`crate::WhisperContext::new_from_buffer_with_params`], and either + /// `model_path` or `cache_dir` is None, this function will fail to initialize, + /// as there's no path for it to initialize from. + /// + /// # Returns + /// `Ok(())` if no error. + /// + /// `Err(WhisperError::GenericError(ret))` on error, + /// where ret is the return value from `whisper.cpp` (as of writing, this will always be 1). + /// Checking output logs for the actual error is more productive + /// than looking at the contained value. 
+ /// + /// # C++ equivalent + /// `int whisper_ctx_init_openvino_encoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const char * model_path, const char * device, const char * cache_dir);` + #[cfg(feature = "openvino")] + #[cfg_attr(docsrs, doc(cfg(feature = "openvino")))] + pub fn init_openvino_encoder( + &mut self, + model_path: Option<&str>, + device: &str, + cache_dir: Option<&str>, + ) -> Result<(), WhisperError> { + let model_path = model_path.map(|s| CString::new(s).unwrap()); + let device = CString::new(device).unwrap(); + let cache_dir = cache_dir.map(|s| CString::new(s).unwrap()); + let ret = unsafe { + whisper_rs_sys::whisper_ctx_init_openvino_encoder_with_state( + self.ctx.ctx, + self.ptr, + model_path.as_ref().map_or_else(std::ptr::null, |s| s.as_ptr()), // borrow: the CString must outlive the FFI call + device.as_ptr(), + cache_dir.as_ref().map_or_else(std::ptr::null, |s| s.as_ptr()), // borrow: the CString must outlive the FFI call + ) + }; + if ret == 0 { + Ok(()) + } else { + Err(WhisperError::GenericError(ret)) + } + } + /// Convert raw PCM audio (floating point 32 bit) to log mel spectrogram. /// The resulting spectrogram is stored in the context transparently. /// diff --git a/sys/Cargo.toml b/sys/Cargo.toml index c97c9a28..78b376f7 100644 --- a/sys/Cargo.toml +++ b/sys/Cargo.toml @@ -35,6 +35,9 @@ metal = [] vulkan = [] force-debug = [] openmp = [] +openvino = [] + +[dependencies] [build-dependencies] cmake = "0.1" diff --git a/sys/build.rs b/sys/build.rs index bd3b0da3..719fb913 100644 --- a/sys/build.rs +++ b/sys/build.rs @@ -9,6 +9,25 @@ use std::io::{BufRead, BufReader}; use std::path::PathBuf; fn main() { + // Fail-fast test for OpenVINO + #[cfg(feature = "openvino")] + { + let openvino_dir = + env::var("OpenVINO_DIR").unwrap_or_else(|_| String::from("/usr/lib/cmake/openvino/")); + // see if we can find OpenVINOConfig.cmake + let openvino_config_path = PathBuf::from(&openvino_dir).join("OpenVINOConfig.cmake"); + if !openvino_config_path.exists() { + panic!( + "Couldn't find OpenVINOConfig.cmake in OpenVINO_DIR (currently set to {}). 
Please set it to the path where `OpenVINOConfig.cmake` can be found.\n\ + On Arch Linux, if you installed the AUR package, this path is `/usr/lib/cmake/openvino/`.", + openvino_dir + ); + } + + // exists so be sure to reexport it + unsafe { env::set_var("OpenVINO_DIR", openvino_dir) } + } + let target = env::var("TARGET").unwrap(); // Link C++ standard library if let Some(cpp_stdlib) = get_cpp_link_stdlib(&target) { @@ -186,6 +205,8 @@ fn main() { config.define("AMDGPU_TARGETS", gpu_targets); } } + #[cfg(feature = "openvino")] + config.define("WHISPER_OPENVINO", "1"); if cfg!(feature = "vulkan") { config.define("GGML_VULKAN", "ON");