forked from Saidakmal02/Uzbek_TTS
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path.env.example
More file actions
178 lines (149 loc) · 5.65 KB
/
.env.example
File metadata and controls
178 lines (149 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# =============================================================================
# Uzbek TTS API Configuration
# Copy this file to .env and adjust the values as needed
# =============================================================================
# -----------------------------------------------------------------------------
# Application Settings
# -----------------------------------------------------------------------------
APP_NAME="Uzbek TTS API"
APP_VERSION="1.0.0"
DEBUG=false
# -----------------------------------------------------------------------------
# Server Configuration
# -----------------------------------------------------------------------------
HOST=0.0.0.0
PORT=8000
WORKERS=1
# -----------------------------------------------------------------------------
# Model Paths
# IMPORTANT: Download the model first using: python scripts/download_model.py
# -----------------------------------------------------------------------------
MODEL_PATH=./models/UZ.safetensors
VOCAB_PATH=./config/uz_vocab.txt
CONFIG_PATH=./config/UZTTS_conf.yaml
# -----------------------------------------------------------------------------
# GPU Configuration (NVIDIA A40 Optimized)
# -----------------------------------------------------------------------------
# Device: auto, cuda, cpu (auto will detect GPU automatically)
DEVICE=auto
# Number of parallel workers (6-8 optimal for A40 with 48GB VRAM)
NUM_WORKERS=6
# Enable TF32 for Ampere GPUs (A40, A100, RTX 30xx/40xx)
# Provides 20-30% speedup with minimal quality loss
ENABLE_TF32=true
# Enable torch.compile() for additional 15-25% speedup
# First request will be slower due to compilation
ENABLE_COMPILE=true
# CUDA device ID (0 = first GPU, 1 = second GPU, etc.)
CUDA_VISIBLE_DEVICES=0
# -----------------------------------------------------------------------------
# Inference Configuration
# Balance between speed and quality
# -----------------------------------------------------------------------------
# NFE Steps: Number of inference steps
# - 32: Original (best quality, slower)
# - 18-20: Optimized (good quality, 2x faster) ✅ RECOMMENDED for A40
# - 12-16: Fast (acceptable quality, 3x faster)
NFE_STEPS=18
# CFG Strength: Classifier-free guidance strength
# - 2.0: Original (best quality, slower)
# - 1.5: Optimized (good quality, faster) ✅ RECOMMENDED for A40
# - 1.0: Fast (lower quality)
CFG_STRENGTH=1.5
# Target RMS for audio normalization (0.0-1.0)
TARGET_RMS=0.1
# Default speech speed (0.5-2.0)
DEFAULT_SPEED=1.0
# -----------------------------------------------------------------------------
# Cache Configuration
# -----------------------------------------------------------------------------
# LRU cache size for reference audio embeddings
# Larger cache = more memory, but faster for repeated references
# A40 with 48GB can handle 1000-2000 easily
CACHE_SIZE=1000
# Cache directory
CACHE_DIR=./cache
# Enable caching
ENABLE_CACHE=true
# -----------------------------------------------------------------------------
# Request Queue Configuration
# -----------------------------------------------------------------------------
# Maximum number of requests in queue
MAX_QUEUE_SIZE=50
# Queue timeout in seconds
QUEUE_TIMEOUT=300
# Redis URL (optional, for distributed queue)
# REDIS_URL=redis://localhost:6379/0
USE_REDIS=false
# -----------------------------------------------------------------------------
# API Limits
# -----------------------------------------------------------------------------
# Maximum text length in characters
MAX_TEXT_LENGTH=5000
# Maximum audio duration in seconds
MAX_AUDIO_DURATION=300
# Maximum batch size for batch processing
MAX_BATCH_SIZE=8
# -----------------------------------------------------------------------------
# CORS Configuration
# -----------------------------------------------------------------------------
# Allowed origins as a JSON array of strings, or ["*"] to allow all
# Example: CORS_ORIGINS=["http://localhost:3000","https://yourdomain.com"]
CORS_ORIGINS=["*"]
# -----------------------------------------------------------------------------
# Monitoring
# -----------------------------------------------------------------------------
# Enable Prometheus metrics
ENABLE_METRICS=true
# Metrics port
METRICS_PORT=9090
# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
LOG_LEVEL=INFO
# Log directory
LOG_DIR=./logs
# -----------------------------------------------------------------------------
# File Upload
# -----------------------------------------------------------------------------
# Upload directory for reference audio
UPLOAD_DIR=./uploads
# Maximum upload size in bytes (10MB default)
MAX_UPLOAD_SIZE=10485760
# =============================================================================
# Performance Tuning Notes for NVIDIA A40
# =============================================================================
#
# A40 Specifications:
# - 48GB VRAM
# - 10,752 CUDA cores
# - Ampere architecture
# - 300W TDP
#
# Recommended Settings for Different Use Cases:
#
# 1. MAXIMUM QUALITY (slower)
# NFE_STEPS=32
# CFG_STRENGTH=2.0
# NUM_WORKERS=4
# Expected latency: 2-3s per 10s audio
#
# 2. BALANCED (recommended) ✅
# NFE_STEPS=18
# CFG_STRENGTH=1.5
# NUM_WORKERS=6
# Expected latency: 1-1.5s per 10s audio
#
# 3. MAXIMUM SPEED
# NFE_STEPS=12
# CFG_STRENGTH=1.0
# NUM_WORKERS=8
# Expected latency: 0.5-1s per 10s audio
#
# GPU Memory Usage:
# - Per worker: ~1.5-2GB VRAM
# - 6 workers: ~10-14GB VRAM (30% of A40)
# - Reserve: ~34GB for buffers and cache
#
# =============================================================================