Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2fef061
Add eager input processing (#70)
peteonrails Jan 29, 2026
a0eec37
feat: Add media key names and numeric keycode support
lokkju Jan 30, 2026
4c4a61b
feat: Add eitype output driver for libei protocol
lokkju Jan 30, 2026
fd7c33b
Merge feature/media-keys-and-numeric-keycodes
lokkju Jan 30, 2026
27154b2
Merge feature/eitype-output
lokkju Jan 30, 2026
be817e8
feat: Show eitype in setup check output
lokkju Jan 30, 2026
a80af56
Merge feature/eitype-output (setup check update)
lokkju Jan 30, 2026
327313c
Merge upstream feature/70-eager-processing-fresh
lokkju Jan 30, 2026
007311b
feat: Add Nemotron streaming transcription
lokkju Jan 30, 2026
010e896
docs: Add Nemotron streaming documentation
lokkju Jan 30, 2026
3f6f501
style: Format streaming transcription source files
lokkju Jan 30, 2026
6f46ddb
feat: Add Nemotron model download support
lokkju Jan 30, 2026
0776599
fix: Update validate_parakeet_model to recognize Nemotron file structure
lokkju Jan 30, 2026
1208b15
fix: Auto-detect Nemotron model type for streaming mode
lokkju Jan 30, 2026
fa62706
fix: Pre-load streaming transcriber at startup instead of per-recording
lokkju Jan 30, 2026
e316a7a
fix: Persistent streaming channels and proper flush collection
lokkju Jan 30, 2026
88d1717
fix: Remove redundant delta tracking from streaming transcriber
lokkju Jan 30, 2026
fb50de3
fix: Update ROCm execution provider to MIGraphX for parakeet-rs 0.3.x
lokkju Jan 30, 2026
d5ff505
fix: Collect all in-flight text on recording stop, not just flush
lokkju Jan 30, 2026
b96c3b2
feat: Add int8 and int4 Nemotron model entries
lokkju Jan 30, 2026
31ead15
fix: Detect all Parakeet models in setup check
lokkju Jan 30, 2026
c69fa0f
feat: Add cached output functions for streaming
lokkju Jan 31, 2026
4b98d2e
perf: Cache output chain in daemon for streaming
lokkju Jan 31, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ rodio = { version = "0.19", default-features = false, features = ["wav"] }
whisper-rs = "0.15.1"

# Parakeet speech-to-text (optional, ONNX-based)
parakeet-rs = { version = "0.2.9", optional = true }
parakeet-rs = { version = "0.3.1", optional = true }


# CPU count for thread detection
Expand All @@ -89,7 +89,7 @@ gpu-hipblas = ["whisper-rs/hipblas"]
parakeet = ["dep:parakeet-rs"]
parakeet-cuda = ["parakeet", "parakeet-rs/cuda"]
parakeet-tensorrt = ["parakeet", "parakeet-rs/tensorrt"]
parakeet-rocm = ["parakeet", "parakeet-rs/rocm"]
parakeet-rocm = ["parakeet", "parakeet-rs/migraphx"]
# Dynamic loading for system ONNX Runtime (used by Nix builds)
parakeet-load-dynamic = ["parakeet", "parakeet-rs/load-dynamic"]

Expand Down
46 changes: 46 additions & 0 deletions contrib/nemotron-streaming-test-config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Voxtype test config: Nemotron streaming transcription
#
# Purpose: a minimal configuration for manually exercising live (streaming)
# transcription with a Nemotron model through the Parakeet engine.
#
# Setup:
#   1. Build with parakeet support:
#        cargo build --features parakeet
#
#   2. Download the Nemotron model:
#        voxtype setup model
#        (select "nemotron-speech-streaming-en-0.6b")
#
#   3. Run the daemon:
#        voxtype -c contrib/nemotron-streaming-test-config.toml daemon

# Select the Parakeet (ONNX-based) speech-to-text engine. Nemotron models are
# configured through the [parakeet] table below.
engine = "parakeet"

[hotkey]
# NOTE(review): docs/CONFIGURATION.md describes `key` as a Linux evdev key
# name (e.g. PAUSE, RIGHTALT). "Super_R" looks like an XKB keysym; confirm it
# is accepted, or switch to the evdev name for the right Super key
# (presumably RIGHTMETA).
key = "Super_R"
enabled = true
# NOTE(review): confirm "push-to-talk" is the exact spelling the config
# parser accepts for this value.
mode = "push-to-talk"

[audio]
device = "default"
max_duration_secs = 30

[audio.feedback]
enabled = true

[parakeet]
# Downloaded via: voxtype setup model
model = "nemotron-speech-streaming-en-0.6b"

# Auto-detected from model files, but you can force it:
# model_type = "nemotron"

# Streaming is auto-enabled for Nemotron models: text is typed live during
# recording as the model produces output.
# Set to false to use batch transcription instead:
# streaming = false

[output]
# "type" sends text through the typing driver chain (see `driver_order` in
# docs/CONFIGURATION.md), which is the path streaming output exercises.
mode = "type"
# NOTE(review): confirm `state_file` is read from the [output] table rather
# than from the top level of the config.
state_file = "auto"

[output.notification]
on_recording_start = true
on_recording_stop = true
on_transcription = true
172 changes: 166 additions & 6 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ The main key to hold for recording. Must be a valid Linux evdev key name.
- `PAUSE` - Pause/Break key
- `RIGHTALT` - Right Alt key
- `F13` through `F24` - Extended function keys
- `MEDIA` - Media key (often a dedicated button on multimedia keyboards)
- `RECORD` - Record key
- `INSERT` - Insert key
- `HOME` - Home key
- `END` - End key
Expand All @@ -74,10 +76,25 @@ The main key to hold for recording. Must be a valid Linux evdev key name.
key = "PAUSE"
```

**Numeric keycodes:**

You can also specify keys by their numeric keycode if the key name isn't in the built-in list. Use a prefix to indicate the source tool, since different tools report different numbers for the same key:

- `WEV_234` or `X11_234` or `XEV_234` - XKB keycode as shown by `wev` or `xev` (the kernel keycode plus 8, so kernel keycode 226 is reported as 234)
- `EVTEST_226` - kernel keycode as shown by `evtest`
- Hex values are also accepted: `WEV_0xEA`, `EVTEST_0xE2`

Bare numeric values (e.g. `226`) are not accepted because `wev`/`xev` and `evtest` report different numbers for the same key.

**Finding key names:**
```bash
# Using evtest (shows kernel keycodes):
sudo evtest
# Select keyboard, press desired key, note KEY_XXXX name

# Using wev on Wayland (shows XKB keycodes):
wev
# Press the key, note the keycode number — use with WEV_ prefix
```

### modifiers
Expand Down Expand Up @@ -593,6 +610,100 @@ voxtype --whisper-context-optimization daemon

**Note:** This setting only applies when using the local whisper backend (`backend = "local"`). It has no effect with remote transcription.

### eager_processing

**Type:** Boolean
**Default:** `false`
**Required:** No

Enable eager input processing. When enabled, audio is split into chunks and transcribed in parallel with continued recording, reducing perceived latency on slower machines.

**Values:**
- `false` (default) - Traditional mode: record all audio, then transcribe
- `true` - Eager mode: transcribe chunks while recording continues

**How it works:**

1. While recording, audio is split into fixed-size chunks (default 5 seconds)
2. Each chunk is sent for transcription as soon as it's ready
3. Recording continues while earlier chunks are being transcribed
4. When recording stops, all chunk results are combined

**When to use eager processing:**
- You have a slower CPU where transcription takes several seconds
- You regularly dictate longer passages (10+ seconds)
- You want to minimize the delay between speaking and text output

**When to keep default (`false`):**
- You have a fast CPU or GPU acceleration
- Your recordings are typically short (under 5 seconds)
- You want maximum transcription accuracy (single-pass is more consistent)

**Example:**
```toml
[whisper]
model = "base.en"
eager_processing = true
eager_chunk_secs = 5.0 # 5 second chunks
eager_overlap_secs = 0.5 # 0.5 second overlap
```

**CLI override:**
```bash
voxtype --eager-processing daemon
```

**Note:** Eager processing is experimental. There may be occasional word duplications or omissions at chunk boundaries.

### eager_chunk_secs

**Type:** Float
**Default:** `5.0`
**Required:** No

Duration of each audio chunk in seconds when eager processing is enabled.

**Example:**
```toml
[whisper]
eager_processing = true
eager_chunk_secs = 3.0 # Shorter chunks for faster feedback
```

**CLI override:**
```bash
voxtype --eager-processing --eager-chunk-secs 3.0 daemon
```

**Trade-offs:**
- Shorter chunks: Faster feedback, but more boundary artifacts
- Longer chunks: Better accuracy, but less parallelism benefit

### eager_overlap_secs

**Type:** Float
**Default:** `0.5`
**Required:** No

Overlap duration in seconds between adjacent chunks when eager processing is enabled. Overlap helps catch words that span chunk boundaries.

**Example:**
```toml
[whisper]
eager_processing = true
eager_chunk_secs = 5.0
eager_overlap_secs = 1.0 # More overlap for better boundary handling
```

**CLI override:**
```bash
voxtype --eager-processing --eager-overlap-secs 1.0 daemon
```

**Trade-offs:**
- More overlap: Better word boundary handling, slightly more processing
- Less overlap: Faster processing, but may miss words at boundaries

### initial_prompt

**Type:** String
Expand Down Expand Up @@ -893,6 +1004,7 @@ The model architecture type. Usually auto-detected based on files present in the
**Values:**
- `tdt` - Token-Duration-Transducer (recommended, proper punctuation)
- `ctc` - Connectionist Temporal Classification (faster, character-level)
- `nemotron` - Nemotron streaming transducer (supports real-time streaming output)

**Example:**
```toml
Expand All @@ -901,6 +1013,42 @@ model = "parakeet-tdt-0.6b-v3"
model_type = "tdt"
```

**Auto-detection:**

The model type is detected from the files in the model directory:

| Model Type | Required Files |
|-----------|---------------|
| TDT | `encoder-model.onnx`, `decoder_joint-model.onnx`, `vocab.txt` |
| CTC | `model.onnx` (or `model_int8.onnx`), `tokenizer.json` |
| Nemotron | `encoder.onnx`, `decoder_joint.onnx`, `tokenizer.model` |

### streaming

**Type:** Boolean
**Default:** Auto (enabled for Nemotron, disabled for TDT/CTC)
**Required:** No

When enabled, text is typed live during recording as the model produces output. Each audio chunk (560ms) is processed incrementally, and the resulting text is typed immediately at the cursor position.

Streaming is automatically enabled when a Nemotron model is used. Set `streaming` explicitly to override this default.

**Example:**
```toml
[parakeet]
model = "/path/to/nemotron-model"
model_type = "nemotron"
streaming = true # This is the default for Nemotron
```

**To disable streaming for a Nemotron model (batch mode instead):**
```toml
[parakeet]
model = "/path/to/nemotron-model"
model_type = "nemotron"
streaming = false # Wait until recording stops, then transcribe all at once
```

### on_demand_loading

**Type:** Boolean
Expand All @@ -916,8 +1064,9 @@ model = "parakeet-tdt-0.6b-v3"
on_demand_loading = true # Free memory when not transcribing
```

### Complete Example
### Complete Examples

**TDT model (recommended for batch transcription):**
```toml
engine = "parakeet"

Expand All @@ -926,6 +1075,16 @@ model = "parakeet-tdt-0.6b-v3"
on_demand_loading = false # Keep model loaded for fast response
```

**Nemotron model (streaming transcription):**
```toml
engine = "parakeet"

[parakeet]
model = "/path/to/nemotron-0.6b"
model_type = "nemotron"
# streaming = true is the default for Nemotron
```

---

## [output]
Expand Down Expand Up @@ -1016,20 +1175,21 @@ fallback_to_clipboard = true # Use clipboard if typing drivers fail
### driver_order

**Type:** Array of strings
**Default:** `["wtype", "dotool", "ydotool", "clipboard", "xclip"]`
**Default:** `["wtype", "eitype", "dotool", "ydotool", "clipboard", "xclip"]`
**Required:** No

Custom order of output drivers to try when `mode = "type"`. Each driver is tried in sequence until one succeeds. This allows you to prefer specific drivers or exclude others entirely.

**Available drivers:**
- `wtype` - Wayland virtual keyboard (best CJK/Unicode support, wlroots compositors only)
- `wtype` - Wayland virtual keyboard protocol (best CJK/Unicode support, wlroots compositors only)
- `eitype` - Wayland via libei/EI protocol (works on GNOME, KDE, and compositors with libei support)
- `dotool` - uinput-based typing (supports keyboard layouts, works on X11/Wayland/TTY)
- `ydotool` - uinput-based typing (requires daemon, X11/Wayland/TTY)
- `clipboard` - Wayland clipboard via wl-copy
- `xclip` - X11 clipboard via xclip

**Default behavior (no driver_order set):**
The default chain is: wtype → dotool → ydotool → clipboard → xclip
The default chain is: wtype → eitype → dotool → ydotool → clipboard → xclip

**Examples:**

Expand All @@ -1046,8 +1206,8 @@ driver_order = ["dotool", "ydotool", "xclip"]
# Force single driver (no fallback)
driver_order = ["ydotool"]

# KDE/GNOME Wayland (wtype doesn't work)
driver_order = ["dotool", "ydotool", "clipboard"]
# GNOME/KDE Wayland (prefer eitype; wtype doesn't work on these desktops)
driver_order = ["eitype", "dotool", "clipboard"]
```

**CLI override:**
Expand Down
41 changes: 41 additions & 0 deletions docs/SMOKE_TESTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,47 @@ voxtype record start && sleep 3 && voxtype record stop
journalctl --user -u voxtype --since "1 minute ago" | grep -E "Loading|Unloading"
```

## Eager Processing

Tests parallel transcription of audio chunks during recording:

```bash
# 1. Enable eager processing in config.toml:
# [whisper]
# eager_processing = true
# eager_chunk_secs = 3.0 # Use short chunks for visible testing
# eager_overlap_secs = 0.5

# 2. Restart daemon
systemctl --user restart voxtype

# 3. Record for 10+ seconds (to generate multiple chunks)
voxtype record start
sleep 12
voxtype record stop

# 4. Check logs for chunk processing:
journalctl --user -u voxtype --since "1 minute ago" | grep -iE "eager|chunk"
# Expected: "Spawning eager transcription for chunk 0"
# "Spawning eager transcription for chunk 1"
# "Chunk 0 completed"
# "Combined eager chunks"

# 5. Verify combined output is coherent (no obvious word duplication)
# The final transcription should read naturally

# 6. Test cancellation during eager recording
voxtype record start
sleep 5
voxtype record cancel
journalctl --user -u voxtype --since "30 seconds ago" | grep -iE "cancel|abort"
# Expected: chunk tasks are cancelled, no transcription output

# 7. Restore default (disabled) when done testing:
# [whisper]
# eager_processing = false
```

## Model Switching

```bash
Expand Down
Loading