diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md new file mode 100644 index 0000000000..468cc8a3f8 --- /dev/null +++ b/docs/serving/speech_api.md @@ -0,0 +1,292 @@ +# Speech API + +vLLM-Omni provides an OpenAI-compatible API for text-to-speech (TTS) generation using Qwen3-TTS models. + +Each server instance runs a single model (specified at startup via `vllm serve --omni`). + +## Quick Start + +### Start the Server + +```bash +# CustomVoice model (predefined speakers) +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --omni --port 8000 --trust-remote-code --enforce-eager +``` + +### Generate Speech + +**Using curl:** + +```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello, how are you?", + "voice": "vivian", + "language": "English" + }' --output output.wav +``` + +**Using Python:** + +```python +import httpx + +response = httpx.post( + "http://localhost:8000/v1/audio/speech", + json={ + "input": "Hello, how are you?", + "voice": "vivian", + "language": "English", + }, + timeout=300.0, +) + +with open("output.wav", "wb") as f: + f.write(response.content) +``` + +**Using OpenAI SDK:** + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8000/v1", api_key="none") + +response = client.audio.speech.create( + model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + voice="vivian", + input="Hello, how are you?", +) + +response.stream_to_file("output.wav") +``` + +## API Reference + +### Endpoint + +``` +POST /v1/audio/speech +Content-Type: application/json +``` + +### Request Parameters + +#### OpenAI Standard Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input` | string | **required** | The text to synthesize into speech | +| `model` | string | server's model | Model to use (optional, should match server if specified) | +| `voice` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) | +| `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus | +| `speed` | float | 1.0 | Playback speed (0.25-4.0) | + +#### vLLM-Omni Extension Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `task_type` | string | "CustomVoice" | TTS task type: CustomVoice, VoiceDesign, or Base | +| `language` | string | "Auto" | Language (see supported languages below) | +| `instructions` | string | "" | Voice style/emotion instructions | +| `max_new_tokens` | integer | 2048 | Maximum tokens to generate | + +**Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian + +#### Voice Clone Parameters (Base task) + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `ref_audio` | string | null | Reference audio (URL or base64 data URL) | +| `ref_text` | string | null | Transcript of reference audio | +| `x_vector_only_mode` | bool | null | Use speaker embedding only (no ICL) | + +### Response Format +Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/wav`). +### Voices Endpoint +``` +GET /v1/audio/voices +``` + +Lists available voices for the loaded model. 
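A quick way to exercise it, assuming the server from the Quick Start is listening on port 8000:

```bash
curl http://localhost:8000/v1/audio/voices
```

The response lists the built-in speakers; once custom voices have been added via `POST /v1/audio/voices` (described below), an `uploaded_voices` array with their metadata is included as well.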
+ +```json +{ + "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian"] +} +``` + +``` +POST /v1/audio/voices +Content-Type: multipart/form-data +``` + +Upload a new voice sample for voice cloning in Base task TTS requests. + +**Form Parameters:** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `audio_sample` | file | Yes | Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) | +| `consent` | string | Yes | Consent recording ID | +| `name` | string | Yes | Name for the new voice | + +**Response Example:** + +```json +{ + "success": true, + "voice": { + "name": "custom_voice_1", + "consent": "user_consent_id", + "file_path": "/tmp/voice_samples/custom_voice_1_user_consent_id_1738660000.wav", + "created_at": 1738660000, + "mime_type": "audio/wav", + "file_size": 1024000 + } +} +``` + +**Usage Example:** + +```bash +curl -X POST http://localhost:8000/v1/audio/voices \ + -F "audio_sample=@/path/to/voice_sample.wav" \ + -F "consent=user_consent_id" \ + -F "name=custom_voice_1" +``` + + +## Examples + +### CustomVoice with Style Instruction + +```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "input": "I am so excited!", + "voice": "vivian", + "instructions": "Speak with great enthusiasm" + }' --output excited.wav +``` + +### VoiceDesign (Natural Language Voice Description) + +```bash +# Start server with VoiceDesign model first +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --omni --port 8000 --trust-remote-code --enforce-eager +``` + +```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello world", + "task_type": "VoiceDesign", + "instructions": "A warm, friendly female voice with a gentle tone" + }' --output designed.wav +``` + +### Base (Voice Cloning) + +```bash +# Start server with Base model first +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --omni --port 8000 --trust-remote-code --enforce-eager +``` + +```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello, this is a cloned voice", + "task_type": "Base", + "ref_audio": "https://example.com/reference.wav", + "ref_text": "Original transcript of the reference audio" + }' --output cloned.wav +``` + +## Supported Models + +| Model | Task Type | Description | +|-------|-----------|-------------| +| `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | CustomVoice | Predefined speaker voices with optional style control | +| `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | VoiceDesign | Natural language voice style description | +| `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | Base | Voice cloning from reference audio | +| `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` | CustomVoice | Smaller/faster variant | +| `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | Base | Smaller/faster variant for voice cloning | + +## Error Responses + +### 400 Bad Request + +Invalid parameters: + +```json +{ + "error": { + "message": "Input text cannot be empty", + "type": "BadRequestError", + "param": null, + "code": 400 + } +} +``` + +### 404 Not Found + +Model not found: + +```json +{ + "error": { + "message": "The model `xxx` does not exist.", + "type": "NotFoundError", + "param": "model", + "code": 404 + } +} +``` + +## 
Troubleshooting + +### "TTS model did not produce audio output" + +Ensure you're using the correct model variant for your task type: +- CustomVoice task → CustomVoice model +- VoiceDesign task → VoiceDesign model +- Base task → Base model + +### Server Not Running + +```bash +# Check if server is responding +curl http://localhost:8000/v1/audio/voices +``` + +### Out of Memory + +If you encounter OOM errors: +1. Use smaller model variant: `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` +2. Reduce `--gpu-memory-utilization` + +### Unsupported Speaker + +Use `/v1/audio/voices` to list available voices for the loaded model. + +## Development + +Enable debug logging: + +```bash +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --omni --uvicorn-log-level debug +``` \ No newline at end of file diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 1c9bd48203..021ffadb7b 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -82,12 +82,62 @@ curl http://localhost:8000/v1/audio/voices ## API Reference -### Endpoint +### Endpoints +#### GET /v1/audio/voices +List all available voices/speakers from the loaded model, including both built-in model voices and uploaded custom voices. + +**Response Example:** +```json +{ + "voices": ["vivian", "ryan", "custom_voice_1"], + "uploaded_voices": [ + { + "name": "custom_voice_1", + "consent": "user_consent_id", + "created_at": 1738660000, + "file_size": 1024000, + "mime_type": "audio/wav" + } + ] +} ``` -POST /v1/audio/speech + +#### POST /v1/audio/voices + +Upload a new voice sample for voice cloning in Base task TTS requests. + +**Form Parameters:** +- `audio_sample` (required): Audio file (max 10MB, supported formats: wav, mp3, flac, ogg, aac, webm, mp4) +- `consent` (required): Consent recording ID +- `name` (required): Name for the new voice + +**Response Example:** +```json +{ + "success": true, + "voice": { + "name": "custom_voice_1", + "consent": "user_consent_id", + "created_at": 1738660000, + "mime_type": "audio/wav", + "file_size": 1024000 + } +} +``` + +**Usage Example:** +```bash +curl -X POST http://localhost:8000/v1/audio/voices \ + -F "audio_sample=@/path/to/voice_sample.wav" \ + -F "consent=user_consent_id" \ + -F "name=custom_voice_1" ``` + +#### POST /v1/audio/speech + + This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/docs/api-reference/audio/createSpeech) format with additional Qwen3-TTS parameters. 
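Once a voice has been uploaded, a `Base` task request can reference it by `voice` name and the server supplies the stored sample as the cloning reference automatically. A minimal sketch, assuming a Base model is being served and `custom_voice_1` was uploaded as shown above:

```bash
curl -X POST http://localhost:8000/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
    "input": "Hello, this should sound like my uploaded sample",
    "task_type": "Base",
    "voice": "custom_voice_1"
  }' --output cloned_from_upload.wav
```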
### Request Body diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index b5e48ab959..0a82c8e818 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -815,8 +815,80 @@ async def list_voices(raw_request: Request): if handler is None: return base(raw_request).create_error_response(message="The model does not support Speech API") + # Get all speakers (both model built-in and uploaded) speakers = sorted(handler.supported_speakers) if handler.supported_speakers else [] - return JSONResponse(content={"voices": speakers}) + + # Get uploaded speakers details + uploaded_speakers = [] + if hasattr(handler, 'uploaded_speakers'): + for voice_name, info in handler.uploaded_speakers.items(): + uploaded_speakers.append({ + "name": info.get("name", voice_name), + "consent": info.get("consent", ""), + "created_at": info.get("created_at", 0), + "file_size": info.get("file_size", 0), + "mime_type": info.get("mime_type", "") + }) + + return JSONResponse(content={ + "voices": speakers, + "uploaded_voices": uploaded_speakers + }) + + +@router.post( + "/v1/audio/voices", + responses={ + HTTPStatus.OK.value: {"model": dict}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def upload_voice( + raw_request: Request, + audio_sample: UploadFile = File(...), + consent: str = Form(...), + name: str = Form(...), +): + """Upload a new voice sample for voice cloning. + + Uploads an audio file that can be used as a reference for voice cloning + in Base task TTS requests. The voice can then be referenced by name + in subsequent TTS requests. + + Args: + audio_sample: Audio file (max 10MB) + consent: Consent recording ID + name: Name for the new voice + raw_request: Raw FastAPI request + + Returns: + JSON response with voice information + """ + handler = Omnispeech(raw_request) + if handler is None: + return base(raw_request).create_error_response(message="The model does not support Speech API") + + try: + # Validate required parameters + if not consent: + return base(raw_request).create_error_response(message="consent is required") + if not name: + return base(raw_request).create_error_response(message="name is required") + + # Upload the voice + result = await handler.upload_voice(audio_sample, consent, name) + + return JSONResponse(content={ + "success": True, + "voice": result + }) + + except ValueError as e: + return base(raw_request).create_error_response(message=str(e)) + except Exception as e: + logger.exception(f"Failed to upload voice: {e}") + return base(raw_request).create_error_response(message=f"Failed to upload voice: {str(e)}") # Health and Model endpoints for diffusion mode diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index a8bae9e993..7dc631e746 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1,7 +1,13 @@ import asyncio +import json +import os +import re +import time +import base64 +from pathlib import Path from typing import Any -from fastapi import Request +from fastapi import Request, UploadFile from fastapi.responses import Response from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.logger import init_logger @@ -37,12 +43,59 @@ _TTS_MAX_NEW_TOKENS_MAX = 4096 +def _sanitize_filename(filename: str) -> str: + """Sanitize filename to prevent path traversal 
attacks. + + Only allows alphanumeric characters, underscores, hyphens, and dots. + Replaces any other characters with underscores. + """ + # Remove any path components + filename = os.path.basename(filename) + # Replace any non-alphanumeric, underscore, hyphen, or dot with underscore + sanitized = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) + # Ensure filename is not empty + if not sanitized: + sanitized = "file" + # Limit length to prevent potential issues + if len(sanitized) > 255: + sanitized = sanitized[:255] + return sanitized + + +def _validate_path_within_directory(file_path: Path, directory: Path) -> bool: + """Validate that file_path is within the specified directory. + + Prevents path traversal attacks by ensuring the resolved path + is within the target directory. + """ + try: + # Resolve both paths to absolute paths + file_path_resolved = file_path.resolve() + directory_resolved = directory.resolve() + # Check if file_path is within directory + return directory_resolved in file_path_resolved.parents or directory_resolved == file_path_resolved + except Exception: + return False + + class OmniOpenAIServingSpeech(OpenAIServing, AudioMixin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Initialize uploaded speakers storage + speech_voice_samples_dir = os.environ.get("SPEECH_VOICE_SAMPLES", "/tmp/voice_samples") + self.uploaded_speakers_dir = Path(speech_voice_samples_dir) + self.uploaded_speakers_dir.mkdir(parents=True, exist_ok=True) + self.metadata_file = self.uploaded_speakers_dir / "metadata.json" + # Load supported speakers self.supported_speakers = self._load_supported_speakers() + # Load uploaded speakers + self.uploaded_speakers = self._load_uploaded_speakers() + # Merge supported speakers with uploaded speakers + self.supported_speakers.update(self.uploaded_speakers.keys()) + logger.info(f"Loaded {len(self.supported_speakers)} supported speakers: {sorted(self.supported_speakers)}") + logger.info(f"Loaded {len(self.uploaded_speakers)} uploaded speakers") def _load_supported_speakers(self) -> set[str]: """Load supported speakers (case-insensitive) from the model configuration.""" @@ -62,6 +115,163 @@ def _load_supported_speakers(self) -> set[str]: return set() + def _load_uploaded_speakers(self) -> dict[str, dict]: + """Load uploaded speakers from metadata file.""" + if not self.metadata_file.exists(): + return {} + + try: + with open(self.metadata_file, 'r') as f: + metadata = json.load(f) + return metadata.get("uploaded_speakers", {}) + except Exception as e: + logger.warning(f"Could not load uploaded speakers metadata: {e}") + return {} + + def _save_uploaded_speakers(self) -> None: + """Save uploaded speakers to metadata file.""" + try: + metadata = {"uploaded_speakers": self.uploaded_speakers} + with open(self.metadata_file, 'w') as f: + json.dump(metadata, f, indent=2) + except Exception as e: + logger.error(f"Could not save uploaded speakers metadata: {e}") + + def _get_uploaded_audio_data(self, voice_name: str) -> str | None: + """Get base64 encoded audio data for uploaded voice.""" + voice_name_lower = voice_name.lower() + if voice_name_lower not in self.uploaded_speakers: + return None + + speaker_info = self.uploaded_speakers[voice_name_lower] + file_path = Path(speaker_info["file_path"]) + + if not file_path.exists(): + logger.warning(f"Audio file not found for voice {voice_name}: {file_path}") + return None + + try: + # Read audio file + with open(file_path, 'rb') as f: + audio_bytes = f.read() + + # Encode to base64 + audio_b64 = 
base64.b64encode(audio_bytes).decode('utf-8') + + # Get MIME type from file extension + mime_type = speaker_info.get("mime_type", "audio/wav") + + # Return as data URL + return f"data:{mime_type};base64,{audio_b64}" + except Exception as e: + logger.error(f"Could not read audio file for voice {voice_name}: {e}") + return None + + async def upload_voice(self, audio_file: UploadFile, consent: str, name: str) -> dict: + """Upload a new voice sample.""" + # Validate file size (max 10MB) + MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB + audio_file.file.seek(0, 2) # Seek to end + file_size = audio_file.file.tell() + audio_file.file.seek(0) # Reset to beginning + + if file_size > MAX_FILE_SIZE: + raise ValueError(f"File size exceeds maximum limit of 10MB. Got {file_size} bytes.") + + # Detect MIME type from filename if content_type is generic + mime_type = audio_file.content_type + if mime_type == "application/octet-stream": + # Simple MIME type detection based on file extension + filename_lower = audio_file.filename.lower() + if filename_lower.endswith(".wav"): + mime_type = "audio/wav" + elif filename_lower.endswith((".mp3", ".mpeg")): + mime_type = "audio/mpeg" + elif filename_lower.endswith(".flac"): + mime_type = "audio/flac" + elif filename_lower.endswith(".ogg"): + mime_type = "audio/ogg" + elif filename_lower.endswith(".aac"): + mime_type = "audio/aac" + elif filename_lower.endswith(".webm"): + mime_type = "audio/webm" + elif filename_lower.endswith(".mp4"): + mime_type = "audio/mp4" + else: + mime_type = "audio/wav" # Default + + # Validate MIME type + allowed_mime_types = { + "audio/mpeg", "audio/wav", "audio/x-wav", "audio/ogg", + "audio/aac", "audio/flac", "audio/webm", "audio/mp4" + } + + if mime_type not in allowed_mime_types: + raise ValueError(f"Unsupported MIME type: {mime_type}. 
Allowed: {allowed_mime_types}") + + # Normalize voice name + voice_name_lower = name.lower() + + # Check if voice already exists + if voice_name_lower in self.uploaded_speakers: + raise ValueError(f"Voice '{name}' already exists") + + # Sanitize name and consent to prevent path traversal + sanitized_name = _sanitize_filename(name) + sanitized_consent = _sanitize_filename(consent) + + # Generate filename with sanitized inputs + timestamp = int(time.time()) + file_suffix = Path(audio_file.filename).suffix + file_ext = file_suffix[1:] if file_suffix and len(file_suffix) > 1 else "wav" + # Sanitize file extension as well + sanitized_ext = _sanitize_filename(file_ext) + if not sanitized_ext or sanitized_ext == "file": + sanitized_ext = "wav" + + filename = f"{sanitized_name}_{sanitized_consent}_{timestamp}.{sanitized_ext}" + file_path = self.uploaded_speakers_dir / filename + + # Double-check that the path is within the upload directory + if not _validate_path_within_directory(file_path, self.uploaded_speakers_dir): + raise ValueError(f"Invalid file path: potential path traversal attack detected") + + # Save audio file + try: + with open(file_path, 'wb') as f: + content = await audio_file.read() + f.write(content) + except Exception as e: + raise ValueError(f"Failed to save audio file: {e}") + + # Update metadata + self.uploaded_speakers[voice_name_lower] = { + "name": name, + "consent": consent, + "file_path": str(file_path), + "created_at": timestamp, + "mime_type": mime_type, + "original_filename": audio_file.filename, + "file_size": file_size + } + + # Update supported speakers + self.supported_speakers.add(voice_name_lower) + + # Save metadata + self._save_uploaded_speakers() + + logger.info(f"Uploaded new voice '{name}' with consent ID '{consent}'") + + # Return voice information without exposing the server file path + return { + "name": name, + "consent": consent, + "created_at": timestamp, + "mime_type": mime_type, + "file_size": file_size + } + def _is_tts_model(self) -> bool: """Check if the current model is a supported TTS model.""" stage_list = getattr(self.engine_client, "stage_list", None) @@ -95,11 +305,21 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non # Validate Base task requirements if task_type == "Base": - if request.ref_audio is None: - return "Base task requires 'ref_audio' for voice cloning" - # Validate ref_audio format - if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")): - return "ref_audio must be a URL (http/https) or base64 data URL (data:...)" + if request.voice is None: + if request.ref_audio is None: + return "Base task requires 'ref_audio' for voice cloning" + # Validate ref_audio format + if not (request.ref_audio.startswith(("http://", "https://")) or request.ref_audio.startswith("data:")): + return "ref_audio must be a URL (http/https) or base64 data URL (data:...)" + else: + # voice is not None + voice_lower = request.voice.lower() + if voice_lower in self.uploaded_speakers: + pass + else: + # need ref_audio + if request.ref_audio is None: + return f"Base task with built-in speaker '{request.voice}' requires 'ref_audio' for voice cloning" # Validate cross-parameter dependencies if task_type != "Base": @@ -155,6 +375,17 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any # Speaker (voice) if request.voice is not None: params["speaker"] = [request.voice] + + # If voice is an uploaded speaker and no ref_audio provided, auto-set it + if 
request.voice.lower() in self.uploaded_speakers and request.ref_audio is None: + audio_data = self._get_uploaded_audio_data(request.voice) + if audio_data: + params["ref_audio"] = [audio_data] + params["x_vector_only_mode"] = [True] + logger.info(f"Auto-set ref_audio for uploaded voice: {request.voice}") + else: + raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted") + elif params["task_type"][0] == "CustomVoice": params["speaker"] = ["Vivian"] # Default for CustomVoice
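Together with the upload endpoint, the auto-set `ref_audio` branch above enables a simple two-step client flow: register a sample once, then synthesize by voice name. A client-side sketch (not part of this diff), assuming a Base model is served on `localhost:8000` and a local `sample.wav` exists:

```python
import httpx

base = "http://localhost:8000/v1/audio"

# Step 1: register a reference sample under a new voice name (multipart form upload)
with open("sample.wav", "rb") as f:
    upload = httpx.post(
        f"{base}/voices",
        files={"audio_sample": ("sample.wav", f, "audio/wav")},
        data={"consent": "user_consent_id", "name": "custom_voice_1"},
        timeout=60.0,
    )
upload.raise_for_status()

# Step 2: Base-task TTS by voice name; the server injects the stored sample as ref_audio
speech = httpx.post(
    f"{base}/speech",
    json={
        "input": "Hello from my uploaded voice",
        "task_type": "Base",
        "voice": "custom_voice_1",
    },
    timeout=300.0,
)
speech.raise_for_status()

with open("cloned.wav", "wb") as out:
    out.write(speech.content)
```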