# voxtype/Cargo.toml at eb74b2440388e444b6775b1f13c3dee07efb6a85 (peteonrails/voxtype)
# Workspace root: this manifest's own package plus the `xtask` member crate.
[workspace]
members = ["xtask"]

# Crate metadata. Keys follow the Cargo convention: name, version, then the
# remaining keys alphabetically, with description last.
[package]
name = "voxtype"
version = "0.6.3"
# Order is preserved from the original list.
authors = [
    "Peter Jackson",
    "Jean-Paul van Tillo",
    "Máté Rémiás",
    "Rob Zolkos",
    "Dan Heuckeroth",
    "Igor Warzocha",
    "Julian Kaiser",
    "Kevin Miller",
    "konnsim",
    "reisset",
    "Zubair",
    "Loki Coyote",
    "Umesh",
    "Barrett Ruth",
    "André Silva",
    "Chmouel Boudjnah",
    "Christopher Albert",
    "Phuoc Thinh Vu",
    "Alexander Bosu-Kellett",
    "ayoahha",
    "Toizi",
    "kakapt",
]
categories = ["multimedia::audio", "accessibility"]
edition = "2021"
keywords = ["voice", "speech", "whisper", "wayland", "linux"]
license = "MIT"
readme = "README.md"
description = "Push-to-talk voice-to-text for Wayland"

# Runtime dependencies, grouped by purpose. Entries marked `optional = true`
# are only compiled when a feature in [features] enables them via `dep:<name>`.
[dependencies]
# Async runtime
# "full" already enables signal, sync, time, process and io-util, so they are
# not listed separately.
tokio = { version = "1", features = ["full"] }

# CLI
clap = { version = "4", features = ["derive"] }

# Configuration
serde = { version = "1", features = ["derive"] }
toml = "0.8"
directories = "5"

# Logging
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

# Error handling
thiserror = "1"
anyhow = "1"

# Text processing
regex = "1"

# Async traits
async-trait = "0.1"

# Input handling (evdev for kernel-level key events)
evdev = "0.12"
libc = "0.2"
inotify = "0.10"  # Watch /dev/input for device hotplug
nix = { version = "0.29", features = ["signal", "process"] }  # Unix signals for IPC

# Audio capture
cpal = "0.15"
hound = "3"  # WAV file reading/writing

# HTTP client for remote transcription
ureq = { version = "2", features = ["json"] }

# JSON parsing (for CLI backend)
serde_json = "1"

# CLI path resolution (for CLI backend)
which = "7"

# Temp files (for CLI backend audio)
tempfile = "3"

# Audio playback (for feedback sounds)
rodio = { version = "0.19", default-features = false, features = ["wav"] }

# Whisper speech-to-text
whisper-rs = "0.15.1"

# Parakeet speech-to-text (optional, ONNX-based)
# Plain "0.3.1" is the same requirement as "^0.3.1" (caret is Cargo's default).
parakeet-rs = { version = "0.3.1", optional = true }

# ONNX-based ASR engines (Moonshine, SenseVoice, Paraformer, Dolphin, Omnilingual)
# ort and ndarray are also enabled by the ml-diarization feature.
ort = { version = "2.0.0-rc.11", optional = true }
ndarray = { version = "0.16", optional = true }
tokenizers = { version = "0.20", optional = true, default-features = false, features = ["onig"] }
rustfft = { version = "6", optional = true }

# CPU count for thread detection
num_cpus = "1.16"

# File watching for status --follow
notify = "6"

# Single instance check
pidlock = "0.1"

# Meeting mode (Pro feature)
uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
rusqlite = { version = "0.32", features = ["bundled"] }

# Optional build features. Nothing is enabled by default.
# Naming pattern used below: each `<engine>-cuda` / `<engine>-tensorrt` feature
# enables the base `<engine>` feature plus the matching execution-provider
# feature on the underlying crate (`ort` or `parakeet-rs`).
[features]
default = []
# GPU acceleration for Whisper; each forwards to the whisper-rs feature of the same backend.
gpu-vulkan = ["whisper-rs/vulkan"]
gpu-cuda = ["whisper-rs/cuda"]
gpu-metal = ["whisper-rs/metal"]
gpu-hipblas = ["whisper-rs/hipblas"]
# ML-based speaker diarization (uses ONNX for embedding extraction)
ml-diarization = ["dep:ort", "dep:ndarray"]
# Parakeet backend (ONNX-based, alternative to Whisper)
parakeet = ["dep:parakeet-rs"]
parakeet-cuda = ["parakeet", "parakeet-rs/cuda"]
parakeet-tensorrt = ["parakeet", "parakeet-rs/tensorrt"]
# The ROCm variant maps to parakeet-rs's `migraphx` feature.
parakeet-rocm = ["parakeet", "parakeet-rs/migraphx"]
# Dynamic loading for system ONNX Runtime (used by Nix builds)
parakeet-load-dynamic = ["parakeet", "parakeet-rs/load-dynamic"]
# Shared ONNX dependencies for engines using fbank/CTC preprocessing
onnx-common = ["dep:ort", "dep:ndarray", "dep:rustfft"]
# Moonshine backend (ONNX-based, encoder-decoder ASR)
# The only engine below that also pulls in `tokenizers`.
moonshine = ["onnx-common", "dep:tokenizers"]
moonshine-cuda = ["moonshine", "ort/cuda"]
moonshine-tensorrt = ["moonshine", "ort/tensorrt"]
# SenseVoice backend (ONNX-based, CTC encoder-only ASR)
sensevoice = ["onnx-common"]
sensevoice-cuda = ["sensevoice", "ort/cuda"]
sensevoice-tensorrt = ["sensevoice", "ort/tensorrt"]
# Paraformer backend (FunASR ONNX-based CTC encoder)
paraformer = ["onnx-common"]
paraformer-cuda = ["paraformer", "ort/cuda"]
paraformer-tensorrt = ["paraformer", "ort/tensorrt"]
# Dolphin backend (ONNX-based CTC encoder, dictation-optimized)
dolphin = ["onnx-common"]
dolphin-cuda = ["dolphin", "ort/cuda"]
dolphin-tensorrt = ["dolphin", "ort/tensorrt"]
# Omnilingual backend (FunASR ONNX-based, 50+ languages)
omnilingual = ["onnx-common"]
omnilingual-cuda = ["omnilingual", "ort/cuda"]
omnilingual-tensorrt = ["omnilingual", "ort/tensorrt"]

# Dependencies available only to the build script.
# NOTE(review): clap + clap_mangen here suggest man pages are generated from
# the CLI definition at build time — confirm against build.rs.
[build-dependencies]
clap = { version = "4", features = ["derive"] }
clap_mangen = "0.2"

# Dependencies compiled only for tests, examples, and benchmarks.
[dev-dependencies]
dirs = "5.0"

# Release profile: trades longer compile times for a faster, smaller binary.
[profile.release]
# Whole-program link-time optimization across all crates.
lto = true
# A single codegen unit gives the optimizer the widest view, at the cost of parallel compilation.
codegen-units = 1
# Strip symbols from the final binary.
strip = true
opt-level = 3

# Dev profile: light optimization instead of a fully unoptimized build.
[profile.dev]
opt-level = 1  # Faster dev builds, whisper still usable

# Explicit binary target: the `voxtype` executable is built from src/main.rs.
[[bin]]
name = "voxtype"
path = "src/main.rs"

# Overrides for crates that would otherwise be resolved from crates.io.
[patch.crates-io]
# Fix MIGraphX provider options for ROCm support
# https://github.com/pykeio/ort/issues/509
# Pinned to an exact commit so builds stay reproducible.
# NOTE(review): drop this patch once a published ort release contains the fix — confirm upstream status.
ort = { git = "https://github.com/pykeio/ort", rev = "5913ae0a6f4468cf8329ba0da71b560de31481dc" }