Skip to content

Commit 9af0390

Browse files
marksverdheiclaude
andcommitted
fix(tts): address upstream review issues on voice upload API
Fixes security and logic issues flagged in upstream PR vllm-project#1201 review: Security: - Sanitize name/consent to alphanumeric/underscore/hyphen only - Validate resolved path stays within upload directory - Remove file_path from API responses (information disclosure) Logic bugs: - Base task validation now correctly requires ref_audio unless voice is specifically an uploaded voice (not just any voice name) - _get_uploaded_audio_data raises ValueError instead of returning None when audio file is missing, preventing silent failures Robustness: - Atomic metadata writes via tempfile + os.replace - File locking (fcntl.flock) on metadata.json reads and writes - Use Path().suffix for file extension extraction Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7ffb87a commit 9af0390

File tree

3 files changed

+88
-34
lines changed

3 files changed

+88
-34
lines changed

docs/serving/speech_api.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,6 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
142142
"voice": {
143143
"name": "custom_voice_1",
144144
"consent": "user_consent_id",
145-
"file_path": "/tmp/voice_samples/custom_voice_1_user_consent_id_1738660000.wav",
146145
"created_at": 1738660000,
147146
"mime_type": "audio/wav",
148147
"file_size": 1024000

examples/online_serving/qwen3_tts/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ Upload a new voice sample for voice cloning in Base task TTS requests.
126126
}
127127
```
128128

129+
> **Note:** Voice names are sanitized to alphanumeric characters, underscores, and hyphens. The `SPEECH_VOICE_SAMPLES` environment variable controls the storage directory (default: `/tmp/voice_samples`).
130+
129131
**Usage Example:**
130132
```bash
131133
curl -X POST http://localhost:8000/v1/audio/voices \

vllm_omni/entrypoints/openai/serving_speech.py

Lines changed: 86 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import asyncio
22
import base64
3+
import fcntl
34
import json
45
import os
6+
import re
57
import struct
8+
import tempfile
69
import time
710
from io import BytesIO
811
from pathlib import Path
@@ -83,59 +86,97 @@ def _load_supported_speakers(self) -> set[str]:
8386
return set()
8487

8588
def _load_uploaded_speakers(self) -> dict[str, dict]:
86-
"""Load uploaded speakers from metadata file."""
89+
"""Load uploaded speakers from metadata file (with shared file lock)."""
8790
if not self.metadata_file.exists():
8891
return {}
8992

9093
try:
9194
with open(self.metadata_file, 'r') as f:
92-
metadata = json.load(f)
95+
fcntl.flock(f, fcntl.LOCK_SH)
96+
try:
97+
metadata = json.load(f)
98+
finally:
99+
fcntl.flock(f, fcntl.LOCK_UN)
93100
return metadata.get("uploaded_speakers", {})
94101
except Exception as e:
95102
logger.warning(f"Could not load uploaded speakers metadata: {e}")
96103
return {}
97104

98105
def _save_uploaded_speakers(self) -> None:
99-
"""Save uploaded speakers to metadata file."""
106+
"""Save uploaded speakers to metadata file (atomic write with lock)."""
100107
try:
101108
metadata = {"uploaded_speakers": self.uploaded_speakers}
102-
with open(self.metadata_file, 'w') as f:
103-
json.dump(metadata, f, indent=2)
109+
# Write to a temp file then atomically rename to avoid corruption
110+
fd, tmp_path = tempfile.mkstemp(
111+
dir=self.uploaded_speakers_dir, suffix=".tmp"
112+
)
113+
try:
114+
with os.fdopen(fd, 'w') as f:
115+
fcntl.flock(f, fcntl.LOCK_EX)
116+
try:
117+
json.dump(metadata, f, indent=2)
118+
finally:
119+
fcntl.flock(f, fcntl.LOCK_UN)
120+
os.replace(tmp_path, self.metadata_file)
121+
except BaseException:
122+
# Clean up temp file on any failure
123+
try:
124+
os.unlink(tmp_path)
125+
except OSError:
126+
pass
127+
raise
104128
except Exception as e:
105129
logger.error(f"Could not save uploaded speakers metadata: {e}")
106130

107-
def _get_uploaded_audio_data(self, voice_name: str) -> str | None:
108-
"""Get base64 encoded audio data for uploaded voice."""
131+
def _get_uploaded_audio_data(self, voice_name: str) -> str:
132+
"""Get base64 encoded audio data for uploaded voice.
133+
134+
Raises:
135+
ValueError: If the voice is not an uploaded voice or the audio
136+
file is missing / unreadable.
137+
"""
109138
voice_name_lower = voice_name.lower()
110139
if voice_name_lower not in self.uploaded_speakers:
111-
return None
140+
raise ValueError(f"Voice '{voice_name}' is not an uploaded voice")
112141

113142
speaker_info = self.uploaded_speakers[voice_name_lower]
114143
file_path = Path(speaker_info["file_path"])
115144

116145
if not file_path.exists():
117-
logger.warning(f"Audio file not found for voice {voice_name}: {file_path}")
118-
return None
146+
raise ValueError(
147+
f"Audio file for uploaded voice '{voice_name}' is missing "
148+
f"(expected at {file_path}). Re-upload the voice."
149+
)
119150

120151
try:
121-
# Read audio file
122152
with open(file_path, 'rb') as f:
123153
audio_bytes = f.read()
124154

125-
# Encode to base64
126155
audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
127-
128-
# Get MIME type from file extension
129156
mime_type = speaker_info.get("mime_type", "audio/wav")
130-
131-
# Return as data URL
132157
return f"data:{mime_type};base64,{audio_b64}"
133158
except Exception as e:
134-
logger.error(f"Could not read audio file for voice {voice_name}: {e}")
135-
return None
159+
raise ValueError(
160+
f"Could not read audio file for voice '{voice_name}': {e}"
161+
) from e
162+
163+
@staticmethod
164+
def _sanitize_name(value: str) -> str:
165+
"""Sanitize a user-provided name to prevent path traversal.
166+
167+
Only allows alphanumeric characters, underscores, and hyphens.
168+
"""
169+
sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", value)
170+
if not sanitized:
171+
raise ValueError(f"Name contains no valid characters: {value!r}")
172+
return sanitized
136173

137174
async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> dict:
138175
"""Upload a new voice sample."""
176+
# Sanitize inputs to prevent path traversal
177+
safe_name = self._sanitize_name(name)
178+
safe_consent = self._sanitize_name(consent)
179+
139180
# Validate file size (max 10MB)
140181
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
141182
audio_file.file.seek(0, 2) # Seek to end
@@ -148,7 +189,6 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
148189
# Detect MIME type from filename if content_type is generic
149190
mime_type = audio_file.content_type
150191
if mime_type == "application/octet-stream":
151-
# Simple MIME type detection based on file extension
152192
filename_lower = audio_file.filename.lower()
153193
if filename_lower.endswith(".wav"):
154194
mime_type = "audio/wav"
@@ -183,12 +223,18 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
183223
if voice_name_lower in self.uploaded_speakers:
184224
raise ValueError(f"Voice '{name}' already exists")
185225

186-
# Generate filename
226+
# Generate filename using sanitized inputs
187227
timestamp = int(time.time())
188-
file_ext = audio_file.filename.split('.')[-1] if '.' in audio_file.filename else "wav"
189-
filename = f"{name}_{consent}_{timestamp}.{file_ext}"
228+
raw_filename = audio_file.filename or ""
229+
file_ext = Path(raw_filename).suffix.lstrip(".") or "wav"
230+
filename = f"{safe_name}_{safe_consent}_{timestamp}.{file_ext}"
190231
file_path = self.uploaded_speakers_dir / filename
191232

233+
# Validate resolved path stays within upload directory
234+
resolved = file_path.resolve()
235+
if not str(resolved).startswith(str(self.uploaded_speakers_dir.resolve())):
236+
raise ValueError("Invalid file path — possible path traversal")
237+
192238
# Save audio file
193239
try:
194240
with open(file_path, 'wb') as f:
@@ -216,10 +262,10 @@ async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) ->
216262

217263
logger.info(f"Uploaded new voice '{name}' with consent ID '{consent}'")
218264

265+
# Return response without internal file_path (information disclosure)
219266
return {
220267
"name": name,
221268
"consent": consent,
222-
"file_path": str(file_path),
223269
"created_at": timestamp,
224270
"mime_type": mime_type,
225271
"file_size": file_size
@@ -257,12 +303,19 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non
257303
return f"Invalid speaker '{request.voice}'. Supported: {', '.join(sorted(self.supported_speakers))}"
258304

259305
# Validate Base task requirements
260-
if task_type == "Base" and request.voice is None:
261-
if request.ref_audio is None:
306+
# ref_audio is required unless voice refers to an uploaded speaker
307+
# (uploaded voices auto-populate ref_audio from stored audio)
308+
if task_type == "Base":
309+
is_uploaded_voice = (
310+
request.voice is not None
311+
and request.voice.lower() in self.uploaded_speakers
312+
)
313+
if request.ref_audio is None and not is_uploaded_voice:
262314
return "Base task requires 'ref_audio' for voice cloning"
263-
# Validate ref_audio format
264-
if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")):
265-
return "ref_audio must be a URL (http/https) or base64 data URL (data:...)"
315+
if request.ref_audio is not None:
316+
# Validate ref_audio format when explicitly provided
317+
if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")):
318+
return "ref_audio must be a URL (http/https) or base64 data URL (data:...)"
266319

267320
# Validate cross-parameter dependencies
268321
if task_type != "Base":
@@ -319,13 +372,13 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
319372
if request.voice is not None:
320373
params["speaker"] = [request.voice]
321374

322-
# If voice is an uploaded speaker and no ref_audio provided, auto-set it
375+
# If voice is an uploaded speaker and no ref_audio provided, auto-set it.
376+
# _get_uploaded_audio_data raises ValueError if the file is missing.
323377
if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None:
324378
audio_data = self._get_uploaded_audio_data(request.voice)
325-
if audio_data:
326-
params["ref_audio"] = [audio_data]
327-
params["x_vector_only_mode"] = [True]
328-
logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}")
379+
params["ref_audio"] = [audio_data]
380+
params["x_vector_only_mode"] = [True]
381+
logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}")
329382
elif params["task_type"][0] == "CustomVoice":
330383
params["speaker"] = ["Vivian"] # Default for CustomVoice
331384

0 commit comments

Comments
 (0)