# Voxtral Wyoming STT - Environment Configuration
# Copy this file to .env and adjust values as needed
# ============================================================================
# ESSENTIAL CONFIGURATION - Start here!
# ============================================================================
# Voxtral model ID
#
# Generation 2 (recommended, requires transformers >= 5.2):
# - mistralai/Voxtral-Mini-4B-Realtime-2602 (13 languages, improved accuracy)
#
# Generation 1 (requires transformers >= 4.57):
# - mistralai/Voxtral-Mini-3B-2507 (8 languages)
# - mistralai/Voxtral-Small-24B-2507 (8 languages, larger/more accurate)
#
# Local paths also work (e.g. /models/Voxtral-Mini-4B-Realtime-2602 if mounted).
# The model class is auto-detected at startup — no code change needed when switching generations.
#
# Quantized variants for low-resource / CPU-only deployments:
# https://huggingface.co/models?other=base_model:quantized:mistralai/Voxtral-Mini-4B-Realtime-2602
# https://huggingface.co/models?other=base_model:quantized:mistralai/Voxtral-Mini-3B-2507
MODEL_ID=mistralai/Voxtral-Mini-4B-Realtime-2602
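# For instance, to use a Gen1 model instead:
#MODEL_ID=mistralai/Voxtral-Mini-3B-2507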
# Data type for model weights (default: auto-detect from model)
# Leave unset for automatic detection from model files (RECOMMENDED).
#
# Only specify to override, e.g., when loading fp32 models on GPU:
# - bf16: Best for modern GPUs (RTX 30xx+, A100+) - ~50% memory reduction
# - fp16: For older GPUs - ~50% memory reduction but less stable than bf16
# - fp32: Maximum accuracy on GPU
#DATA_TYPE=
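# For instance, to force bf16 on a modern GPU:
#DATA_TYPE=bf16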
# Transcription delay in milliseconds (Gen2 realtime models only)
# Controls the trade-off between transcription latency and accuracy.
# Leave unset to use the model's built-in default (480ms).
#
# Valid values: multiples of 80 from 80 to 1200, or exactly 2400.
#
# | Delay | WER | Use Case |
# |---------|--------|-------------------------------------|
# | 80ms | 12.60% | Lowest latency |
# | 480ms | 8.72% | Recommended balance (model default) |
# | 960ms | 7.70% | Higher accuracy |
# | 2400ms | 6.73% | Offline-level accuracy |
#
# More benchmarks:
# https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602#benchmark-results
#
# This parameter is ignored for Gen1 models.
#TRANSCRIPTION_DELAY_MS=
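# For instance, to trade a bit of latency for better accuracy:
#TRANSCRIPTION_DELAY_MS=960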
# Device for inference (default: auto)
# Options:
# - auto: Automatically detect the best available device (RECOMMENDED)
# Will use cuda (NVIDIA GPU) if available, mps (Apple Silicon) if available, or cpu as fallback
# - cuda: Force NVIDIA GPU usage
# - mps: Force Apple Silicon GPU usage
# - cpu: Force CPU usage
# Note: When manually specifying a device, it will automatically fall back to CPU if the specified device is unavailable
DEVICE=auto
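# For instance, to force inference onto an NVIDIA GPU:
#DEVICE=cuda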
# Use chat mode instead of transcribe-only mode (default: false)
# NOTE: Gen1 models only. Gen2 models do not support chat mode (the model lacks a chat template).
#
# Turn this on only if you're not satisfied with the results of the default
# transcribe-only mode and want to try improving accuracy with a system prompt
# (see SYSTEM_PROMPT below).
# This can drastically improve accuracy, especially for smart-home-specific commands. For instance, it may prevent
# the transcription from turning short imperative sentences (`turn on the lights`) into longer, more common
# ones (`I turn on the lights`). However, be aware that it may occasionally produce unexpected and unpredictable results.
# That's why I'd still call this an experimental feature for now. Tweaking the system prompt to your particular needs
# may produce very good results, though.
USE_CHAT_MODE=false
# System prompt for chat mode (Gen1 only, requires USE_CHAT_MODE=true)
# This prompt provides context to guide the model's transcription behavior.
# Default prompt emphasizes smart home commands and imperative sentences.
# The model is likely to respond in the same language the prompt is written in, so you should at least
# translate it into your desired language.
SYSTEM_PROMPT="You are a voice assistant for a smart home. Transcribe the user's voice command accurately. Commands are typically short, imperative sentences like 'turn on the lights' or 'set temperature to 20 degrees'. Focus on accuracy and be aware of smart home terminology."
# For instance, here is an alternative system prompt in German:
#SYSTEM_PROMPT="Es folgt eine Audio-Aufnahme, in welcher der Benutzer einen Sprachbefehl an sein Smart-Home-Assistenten gibt. Deine Aufgabe ist es diesen Sprachbefehl wortwörtlich zu transkribieren. Ignoriere Hintergrundgeräusche und andere Stimmen. Befehle sind in der Regel kurze eventuell unvollständige Sätze, welche im Imperativ formuliert sind. Du darfst nur den transkribierten Text aus der Audio-Aufnahme zurückgeben, ohne ihn in Anführungszeichen zu setzen oder Ähnliches. Übersetze die Benutzereingaben nicht, sondern behalte die ursprüngliche Eingabesprache (Deutsch) bei."
# ============================================================================
# Server Configuration
# ============================================================================
# Bind host (default: 0.0.0.0)
# Use 0.0.0.0 to listen on all interfaces, or 127.0.0.1 for localhost only
HOST=0.0.0.0
# Bind port (default: 10300)
# Wyoming protocol default port for STT services
PORT=10300
# ============================================================================
# Advanced Model Settings
# ============================================================================
# Maximum audio duration in seconds (default: 30)
# Audio longer than this will be clamped before processing.
# For Gen1 models and chat mode this also controls the generation limit:
# max_new_tokens is derived as max_seconds / 0.08 (1 token ≈ 80 ms of audio).
# For Gen2 realtime transcribe-only mode the model determines the output length automatically.
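# For instance, the default of 30 seconds allows up to 30 / 0.08 = 375 generated tokens.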
MAX_SECONDS=30
# ============================================================================
# Language & Audio Settings
# ============================================================================
# Language/locale hint (default: en-US)
# This is only a fallback; the language configured in your Home Assistant voice assistant takes precedence.
#
# Voxtral supported languages (depends on model generation):
#
# Generation 1 (8 languages): en-US, fr-FR, de-DE, es-ES, it-IT, pt-PT, nl-NL, hi-IN
#
# Generation 2 (13 languages): all of the above plus:
# ar-SA (Arabic), zh-CN (Chinese), ja-JP (Japanese), ko-KR (Korean), ru-RU (Russian)
#
# NOTE on Gen2 models: Gen2 realtime models always auto-detect the spoken language
# from the audio itself. The language hint is accepted but has NO effect on the
# transcription — the underlying streaming tokenizer does not use it. This is a
# limitation of the model architecture, not of this project.
# Gen1 models do use the language hint to improve transcription accuracy.
LANGUAGE_FALLBACK=en-US
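# For instance, for a German-language setup:
#LANGUAGE_FALLBACK=de-DE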
# Expected audio sample rate in Hz (default: 16000)
# Common values: 8000, 16000, 22050, 24000, 32000, 48000
# Again, this is only a fallback; Home Assistant provides the actual sample rate through the Wyoming protocol.
SAMPLE_RATE_FALLBACK=16000
# ============================================================================
# Post-Transcription Word Replacement
# ============================================================================
# Replace misheard words or phrases in the transcription output.
# Useful for fixing recurring STT mistakes (e.g., German "schaltet" vs "schalte").
# Matching is case-insensitive and respects word boundaries.
# Inline replacements (comma-separated, arrow notation)
# Example: WORD_REPLACEMENTS="schaltet -> schalte, Tor -> Tür"
#WORD_REPLACEMENTS=
# Path to a replacements file (one entry per line, # comments, blank lines ignored)
# See word_replacements.txt for the format. Edit that file to add your replacements.
# Docker Compose sets this automatically via docker-compose.yml; for non-Docker setups
# point it to your file (e.g. ./word_replacements.txt).
#WORD_REPLACEMENTS_FILE=
# When both are set, they are merged. Use the file approach for many entries
# or phrases containing commas.
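# For instance, a replacements file could look like this (arrow notation, one entry per line):
#   # fix common mishearings
#   schaltet -> schalte
#   Tor -> Tür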
# ============================================================================
# Logging & Debugging
# ============================================================================
# Logging level (default: INFO)
# Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
LOG_LEVEL=INFO
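# For instance, for more verbose output while troubleshooting:
#LOG_LEVEL=DEBUG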
# Enable saving of received audio input (default: false)
# When enabled, creates one WAV file per transcription request
# Options: true, false
# WARNING: Audio files may contain sensitive information. Ensure proper access controls.
SAVE_AUDIO=false
# Directory where audio files will be saved (default: /output/audio)
# The directory will be created automatically if it doesn't exist
# Each file is named with a timestamp and the first 100 characters of the transcribed text:
# audio_YYYYMMDD_HHMMSS_microseconds_transcribed_text.wav
# Special characters in the transcription are replaced with underscores for filesystem safety
#
# Docker: Use absolute path /output/audio (matches volume mount in docker-compose.yml)
# Non-Docker: Use relative path ./output/audio/ or any writable directory path
AUDIO_SAVE_DIR=/output/audio
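# For instance, for a non-Docker setup:
#AUDIO_SAVE_DIR=./output/audio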