-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathenv.jetson.example
More file actions
136 lines (97 loc) · 5.24 KB
/
env.jetson.example
File metadata and controls
136 lines (97 loc) · 5.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# ============================================================================
# NEMOTRON VOICE AGENT - ENVIRONMENT CONFIGURATION
# ============================================================================
# Copy this file to .env and configure the values for your deployment.
# Variables with values are defaults; commented variables are optional overrides.
# ----------------------------------------------------------------------------
# REQUIRED CREDENTIALS
# ----------------------------------------------------------------------------
# Your NVIDIA API key from https://build.nvidia.com
# Export it as an environment variable in your shell:
# export NVIDIA_API_KEY=<your-nvidia-api-key>
# Docker Compose will automatically use the environment variable if this is left empty.
NVIDIA_API_KEY=
# Huggingface token needed for LLM model download
# Export it as an environment variable in your shell:
# export HF_TOKEN=<your-huggingface-token>
# Docker Compose will automatically use the environment variable if this is left empty.
HF_TOKEN=
# ----------------------------------------------------------------------------
# TURN SERVER CREDENTIALS
# ----------------------------------------------------------------------------
TURN_SERVER_URL=
TURN_USERNAME=
TURN_PASSWORD=
# ----------------------------------------------------------------------------
# DOCKER IMAGE CONFIGURATION
# ----------------------------------------------------------------------------
# Version tag for the python-app and ui container image
IMAGE_VERSION=1.0.0-arm64
# ----------------------------------------------------------------------------
# PIPELINE CONFIGURATION
# ----------------------------------------------------------------------------
# Transport mode for the voice agent: "WEBSOCKET" or "WEBRTC"
# WEBSOCKET: Uses FastAPI WebSocket transport (pipeline_websocket.py)
# WEBRTC: Uses WebRTC transport (pipeline_webrtc.py) - Default
TRANSPORT=WEBRTC
# Path to the prompt catalog YAML file containing system prompts
PROMPT_FILE_PATH=./config/prompt.yaml
# Enable speculative speech processing for lower response latency
# When enabled, the bot starts generating responses before the user finishes speaking
ENABLE_SPECULATIVE_SPEECH=false
# Voice Activity Detection (VAD) engine: "ASR" (recommended) or "Silero"
VAD_PROFILE=ASR
# Maximum conversation turns to retain in context
# For multilingual or emotion-aware use cases, set the limit to 3-5 for best accuracy
CHAT_HISTORY_LIMIT=20
# Audio dump directory for debugging and analysis
AUDIO_DUMP_PATH=./audio_dumps
# JSON file containing word-to-IPA mappings for pronunciation correction
TTS_IPA_FILE_PATH=./config/ipa.json
# Number of 10ms audio chunks to buffer for output (controls audio latency)
# Default: 5 chunks (50ms buffer) for WebRTC - optimized for low latency
# WebSocket: 10 chunks (100ms buffer) - more stable for network variations
# High Concurrency: 10-40 chunks (100-400ms buffer) - prevents audio glitches under load
AUDIO_OUT_10MS_CHUNKS=5
# Number of workers for HTTP server (handles concurrent connections)
WORKERS=1
# ----------------------------------------------------------------------------
# OPENTELEMETRY TRACING
# ----------------------------------------------------------------------------
ENABLE_TRACING=false
OTEL_CONSOLE_EXPORT=false
# OTLP endpoint for the OTEL Collector.
# For Phoenix local deployment: `docker run -p 6006:6006 -p 4317:4317 -i -t arizephoenix/phoenix:latest`
# For gRPC (port 4317): Use host:port format (e.g., localhost:4317 or phoenix:4317)
# For HTTP (port 4318): Use http://host:port format (e.g., http://localhost:4318)
#OTEL_EXPORTER_OTLP_ENDPOINT=phoenix:4317
# ----------------------------------------------------------------------------
# ASR (AUTOMATIC SPEECH RECOGNITION) CONFIGURATION
# ----------------------------------------------------------------------------
# ASR endpoint URL
# Example: grpc.nvcf.nvidia.com:443 (cloud) or localhost:50051 (local NIM)
ASR_SERVER_URL=localhost:50051
# ASR model identifier
ASR_MODEL_NAME=parakeet-1.1b-en-US-asr-streaming
# ----------------------------------------------------------------------------
# TTS (TEXT-TO-SPEECH) CONFIGURATION
# ----------------------------------------------------------------------------
# TTS endpoint URL
# Example: grpc.nvcf.nvidia.com:443 (cloud) or localhost:50051 (local NIM)
TTS_SERVER_URL=localhost:50051
# Default voice identifier (format: Model.Language.VoiceName)
TTS_VOICE_ID=Magpie-Multilingual.EN-US.Aria
# TTS model identifier
TTS_MODEL_NAME=magpie_tts_ensemble-Magpie-Multilingual
# Language code for speech synthesis
TTS_LANGUAGE=en-US
# ============================================================================
# LLM (LARGE LANGUAGE MODEL) CONFIGURATION
# ============================================================================
# Models: nvidia/Nemotron-Mini-4B-Instruct, nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8, Qwen/Qwen3-4B-Instruct-2507, RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16
NVIDIA_LLM_URL=http://localhost:9000/v1
NVIDIA_LLM_MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16
# GPU memory utilization ratio for the LLM model (0.0 to 1.0)
# Controls how much GPU memory the model can use. Lower values leave more memory for other processes.
GPU_MEMORY_UTILIZATION=0.15
SYSTEM_PROMPT_SELECTOR=llama/flowershop