Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ openai = ["dep:async-openai", "dep:tokio", "dep:async-trait"]
# Silero neural VAD (requires silero_vad_v4.onnx model file)
vad-silero = ["dep:ort", "dep:ndarray"]

# Neural punctuation restoration (CT-Transformer)
punct = ["dep:ort", "dep:ndarray"]

# --- ORT Accelerators ---
# Note: ort-cuda pulls in the CUDA execution provider, which adds ~800 MB+
# to the ORT binary and requires a CUDA toolkit / cuDNN installation at runtime.
Expand All @@ -44,7 +47,7 @@ ort-webgpu = ["onnx", "ort/webgpu"]
ort-accel = ["ort-cuda", "ort-directml", "ort-rocm", "ort-coreml", "ort-webgpu"]

# Convenience
all = ["onnx", "whisper-cpp", "whisperfile", "openai"]
all = ["onnx", "whisper-cpp", "whisperfile", "openai", "punct"]

[dependencies]
# Always required
Expand Down Expand Up @@ -115,6 +118,18 @@ required-features = ["whisperfile"]
name = "openai"
required-features = ["openai"]

[[example]]
name = "paraformer"
required-features = ["onnx"]

[[example]]
name = "zipformer_ctc"
required-features = ["onnx"]

[[example]]
name = "zipformer_transducer"
required-features = ["onnx"]

[dev-dependencies]
once_cell = "1.21.3"

Expand Down Expand Up @@ -158,3 +173,15 @@ required-features = ["onnx", "vad-silero"]
[[test]]
name = "vad_silero"
required-features = ["vad-silero"]

[[test]]
name = "paraformer"
required-features = ["onnx"]

[[test]]
name = "zipformer_ctc"
required-features = ["onnx"]

[[test]]
name = "zipformer_transducer"
required-features = ["onnx"]
87 changes: 87 additions & 0 deletions examples/paraformer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use std::path::PathBuf;
use std::time::Instant;

use transcribe_rs::onnx::paraformer::ParaformerModel;
use transcribe_rs::onnx::Quantization;
use transcribe_rs::SpeechModel;

fn get_audio_duration(path: &PathBuf) -> Result<f64, Box<dyn std::error::Error>> {
let reader = hound::WavReader::open(path)?;
let spec = reader.spec();
let duration = reader.duration() as f64 / spec.sample_rate as f64;
Ok(duration)
}

/// Entry point of the Paraformer example.
///
/// Arguments (all optional):
/// - first non-flag argument: model path (a built-in default is used otherwise)
/// - second non-flag argument: WAV file path (defaults to `samples/zh.wav`)
/// - `--int8`: load with `Quantization::Int8` instead of `Quantization::FP32`
///
/// Prints the audio duration, model load time, transcription time, the ratio
/// of audio duration to transcription time, the transcribed text and, when
/// the result carries them, the individual segments.
///
/// # Errors
///
/// Returns the first error raised while reading the WAV header, loading the
/// model, or transcribing the file.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    env_logger::init();

    let argv: Vec<String> = std::env::args().collect();
    let use_int8 = argv.iter().any(|arg| arg == "--int8");

    // Skip the program name; anything not starting with `--` is positional.
    let mut free_args = argv
        .iter()
        .skip(1)
        .filter(|arg| !arg.starts_with("--"))
        .map(String::as_str);
    let model_path = PathBuf::from(
        free_args
            .next()
            .unwrap_or("models/sherpa-onnx-paraformer-zh-2025-10-07"),
    );
    let wav_path = PathBuf::from(free_args.next().unwrap_or("samples/zh.wav"));

    let clip_secs = get_audio_duration(&wav_path)?;
    println!("Audio duration: {:.2}s", clip_secs);

    // Pick the quantization and its display label in one place.
    let (quantization, quant_label) = if use_int8 {
        (Quantization::Int8, "int8")
    } else {
        (Quantization::FP32, "fp32")
    };

    println!("Using Paraformer engine");
    println!(
        "Loading model: {:?} (quantization: {})",
        model_path, quant_label
    );

    let load_timer = Instant::now();
    let mut model = ParaformerModel::load(&model_path, &quantization)?;
    println!("Model loaded in {:.2?}", load_timer.elapsed());

    println!("Transcribing file: {:?}", wav_path);
    let transcribe_timer = Instant::now();

    let outcome = model.transcribe_file(&wav_path, &transcribe_rs::TranscribeOptions::default())?;
    let elapsed = transcribe_timer.elapsed();
    println!("Transcription completed in {:.2?}", elapsed);

    // Audio seconds processed per wall-clock second of transcription.
    println!(
        "Real-time speedup: {:.2}x faster than real-time",
        clip_secs / elapsed.as_secs_f64()
    );

    println!("Transcription result:");
    println!("{}", outcome.text);

    if let Some(pieces) = outcome.segments {
        println!("\nSegments:");
        for piece in pieces {
            println!(
                "[{:.2}s - {:.2}s]: {}",
                piece.start, piece.end, piece.text
            );
        }
    }

    Ok(())
}
87 changes: 87 additions & 0 deletions examples/zipformer_ctc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use std::path::PathBuf;
use std::time::Instant;

use transcribe_rs::onnx::zipformer_ctc::ZipformerCtcModel;
use transcribe_rs::onnx::Quantization;
use transcribe_rs::SpeechModel;

fn get_audio_duration(path: &PathBuf) -> Result<f64, Box<dyn std::error::Error>> {
let reader = hound::WavReader::open(path)?;
let spec = reader.spec();
let duration = reader.duration() as f64 / spec.sample_rate as f64;
Ok(duration)
}

/// Entry point of the Zipformer CTC example.
///
/// Arguments (all optional):
/// - first non-flag argument: model path (a built-in default is used otherwise)
/// - second non-flag argument: WAV file path (defaults to `samples/zh.wav`)
/// - `--int8`: load with `Quantization::Int8` instead of `Quantization::FP32`
///
/// Prints the audio duration, model load time, transcription time, the ratio
/// of audio duration to transcription time, the transcribed text and, when
/// the result carries them, the individual segments.
///
/// # Errors
///
/// Returns the first error raised while reading the WAV header, loading the
/// model, or transcribing the file.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    env_logger::init();

    let argv: Vec<String> = std::env::args().collect();
    let use_int8 = argv.iter().any(|arg| arg == "--int8");

    // Skip the program name; anything not starting with `--` is positional.
    let mut free_args = argv
        .iter()
        .skip(1)
        .filter(|arg| !arg.starts_with("--"))
        .map(String::as_str);
    let model_path = PathBuf::from(
        free_args
            .next()
            .unwrap_or("models/sherpa-onnx-zipformer-ctc-small-zh-int8-2025-07-16"),
    );
    let wav_path = PathBuf::from(free_args.next().unwrap_or("samples/zh.wav"));

    let clip_secs = get_audio_duration(&wav_path)?;
    println!("Audio duration: {:.2}s", clip_secs);

    // Pick the quantization and its display label in one place.
    let (quantization, quant_label) = if use_int8 {
        (Quantization::Int8, "int8")
    } else {
        (Quantization::FP32, "fp32")
    };

    println!("Using Zipformer CTC engine");
    println!(
        "Loading model: {:?} (quantization: {})",
        model_path, quant_label
    );

    let load_timer = Instant::now();
    let mut model = ZipformerCtcModel::load(&model_path, &quantization)?;
    println!("Model loaded in {:.2?}", load_timer.elapsed());

    println!("Transcribing file: {:?}", wav_path);
    let transcribe_timer = Instant::now();

    let outcome = model.transcribe_file(&wav_path, &transcribe_rs::TranscribeOptions::default())?;
    let elapsed = transcribe_timer.elapsed();
    println!("Transcription completed in {:.2?}", elapsed);

    // Audio seconds processed per wall-clock second of transcription.
    println!(
        "Real-time speedup: {:.2}x faster than real-time",
        clip_secs / elapsed.as_secs_f64()
    );

    println!("Transcription result:");
    println!("{}", outcome.text);

    if let Some(pieces) = outcome.segments {
        println!("\nSegments:");
        for piece in pieces {
            println!(
                "[{:.2}s - {:.2}s]: {}",
                piece.start, piece.end, piece.text
            );
        }
    }

    Ok(())
}
87 changes: 87 additions & 0 deletions examples/zipformer_transducer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use std::path::PathBuf;
use std::time::Instant;

use transcribe_rs::onnx::zipformer_transducer::ZipformerTransducerModel;
use transcribe_rs::onnx::Quantization;
use transcribe_rs::SpeechModel;

fn get_audio_duration(path: &PathBuf) -> Result<f64, Box<dyn std::error::Error>> {
let reader = hound::WavReader::open(path)?;
let spec = reader.spec();
let duration = reader.duration() as f64 / spec.sample_rate as f64;
Ok(duration)
}

/// Entry point of the Zipformer Transducer example.
///
/// Arguments (all optional):
/// - first non-flag argument: model path (a built-in default is used otherwise)
/// - second non-flag argument: WAV file path (defaults to `samples/zh.wav`)
/// - `--int8`: load with `Quantization::Int8` instead of `Quantization::FP32`
///
/// Prints the audio duration, model load time, transcription time, the ratio
/// of audio duration to transcription time, the transcribed text and, when
/// the result carries them, the individual segments.
///
/// # Errors
///
/// Returns the first error raised while reading the WAV header, loading the
/// model, or transcribing the file.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    env_logger::init();

    let argv: Vec<String> = std::env::args().collect();
    let use_int8 = argv.iter().any(|arg| arg == "--int8");

    // Skip the program name; anything not starting with `--` is positional.
    let mut free_args = argv
        .iter()
        .skip(1)
        .filter(|arg| !arg.starts_with("--"))
        .map(String::as_str);
    let model_path = PathBuf::from(
        free_args
            .next()
            .unwrap_or("models/sherpa-onnx-zipformer-zh-en-2023-11-22"),
    );
    let wav_path = PathBuf::from(free_args.next().unwrap_or("samples/zh.wav"));

    let clip_secs = get_audio_duration(&wav_path)?;
    println!("Audio duration: {:.2}s", clip_secs);

    // Pick the quantization and its display label in one place.
    let (quantization, quant_label) = if use_int8 {
        (Quantization::Int8, "int8")
    } else {
        (Quantization::FP32, "fp32")
    };

    println!("Using Zipformer Transducer engine");
    println!(
        "Loading model: {:?} (quantization: {})",
        model_path, quant_label
    );

    let load_timer = Instant::now();
    let mut model = ZipformerTransducerModel::load(&model_path, &quantization)?;
    println!("Model loaded in {:.2?}", load_timer.elapsed());

    println!("Transcribing file: {:?}", wav_path);
    let transcribe_timer = Instant::now();

    let outcome = model.transcribe_file(&wav_path, &transcribe_rs::TranscribeOptions::default())?;
    let elapsed = transcribe_timer.elapsed();
    println!("Transcription completed in {:.2?}", elapsed);

    // Audio seconds processed per wall-clock second of transcription.
    println!(
        "Real-time speedup: {:.2}x faster than real-time",
        clip_secs / elapsed.as_secs_f64()
    );

    println!("Transcription result:");
    println!("{}", outcome.text);

    if let Some(pieces) = outcome.segments {
        println!("\nSegments:");
        for piece in pieces {
            println!(
                "[{:.2}s - {:.2}s]: {}",
                piece.start, piece.end, piece.text
            );
        }
    }

    Ok(())
}
Loading