dbccccccc
diff --git a/‎CHANGELOG.md‎
Lines changed: 13 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 7 additions & 0 deletions b/‎README.md‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎README.zh.md‎
Lines changed: 7 additions & 0 deletions b/‎README.zh.md‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/test_clients.py‎
Lines changed: 45 additions & 0 deletions b/‎tests/test_clients.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎tests/test_web_app.py‎
Lines changed: 2 additions & 7 deletions b/‎tests/test_web_app.py‎
Lines changed: 2 additions & 7 deletions
diff --git a/‎ttsfm-web/app.py‎
Lines changed: 49 additions & 24 deletions b/‎ttsfm-web/app.py‎
Lines changed: 49 additions & 24 deletions
diff --git a/‎ttsfm-web/static/js/playground-enhanced-fixed.js‎
Lines changed: 10 additions & 1 deletion b/‎ttsfm-web/static/js/playground-enhanced-fixed.js‎
Lines changed: 10 additions & 1 deletion
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.3.0-alpha4] - 2025-09-19
+
+### Changed
+- Enforced MP3 pass-through while mapping all other requested formats to WAV so the service returns predictable audio without failing compatibility checks.
+- Python clients now normalise outbound `response_format` payloads to the supported set and surface fallback metadata when a WAV result is returned.
+- Docker build workflow tags only `v*` image aliases to avoid duplicate semver tags without the `v` prefix.
+
+### Removed
+- Web playground and WebSocket demo no longer expose manual format selectors, reducing confusion around unavoidable WAV fallbacks.
+
+### Documentation
+- README (EN/ZH) clarifies the MP3-only guarantee and WAV fallback, and the UI copy was refreshed accordingly.
+
 ## [3.3.0-alpha3] - 2025-09-18
 
 ### Added
 
@@ -21,6 +21,7 @@ TTSFM provides both synchronous and asynchronous Python clients for text-to-spee
 - ⚡ **Async & Sync** - Both `asyncio` and synchronous clients available
 - 🗣️ **11 Voices** - All OpenAI-compatible voices (alloy, echo, fable, onyx, nova, shimmer, etc.)
 - 🎵 **6 Audio Formats** - MP3, WAV, OPUS, AAC, FLAC, PCM support
+- 🎼 **Format Fallback** - MP3 requests yield MP3; other OpenAI formats map cleanly to WAV for reliable playback
 - 🐳 **Docker Ready** - One-command deployment with web interface
 - 🌐 **Web Interface** - Interactive playground for testing voices and formats
 - 🔧 **CLI Tool** - Command-line interface for quick TTS generation
@@ -184,6 +185,9 @@ combined = client.generate_speech_long_text(
 )
 
 combined.save_to_file("long_text")  # Saves as long_text.mp3
+
+# Note: Only MP3 requests return MP3 data. Other formats (OPUS/AAC/FLAC/WAV/PCM)
+# are delivered as WAV while remaining API-compatible.
 ```
 
 #### OpenAI Python Client Compatibility
@@ -270,6 +274,9 @@ ttsfm "Hello, world!" --url http://localhost:7000 --output hello.mp3
 # Auto-combine long text into a single file
 ttsfm --text-file article.txt --output article.mp3 --split-long-text --auto-combine
 
+# Heads-up: The CLI accepts all OpenAI-compatible format options, but anything
+# other than `mp3` will be delivered as WAV by the free upstream service.
+
 # List available voices
 ttsfm --list-voices
 
 
@@ -21,6 +21,7 @@ TTSFM为文本转语音生成提供同步和异步Python客户端，使用逆向
 - ⚡ **异步和同步** - 提供`asyncio`和同步客户端
 - 🗣️ **11种声音** - 所有OpenAI兼容的声音（alloy、echo、fable、onyx、nova、shimmer等）
 - 🎵 **6种音频格式** - 支持MP3、WAV、OPUS、AAC、FLAC、PCM
+- 🎼 **格式回退** - 请求MP3时输出MP3；其他OpenAI格式会安全回退为WAV，保证兼容性
 - 🐳 **Docker就绪** - 一键部署，包含Web界面
 - 🌐 **Web界面** - 用于测试声音和格式的交互式试用平台
 - 🔧 **CLI工具** - 用于快速TTS生成的命令行界面
@@ -173,6 +174,9 @@ combined = client.generate_speech_long_text(
 )
 
 combined.save_to_file("long_text")  # 保存为 long_text.mp3
+
+# 提示：只有 MP3 请求会返回 MP3 数据，其余格式（OPUS/AAC/FLAC/WAV/PCM）
+# 会回退为 WAV，以确保兼容免费上游服务。
 ```
 
 #### OpenAI Python客户端兼容性
@@ -264,6 +268,9 @@ ttsfm --list-voices
 
 # 获取帮助
 ttsfm --help
+
+# 提示：CLI 仍然接受所有 OpenAI 兼容格式参数，但除了 `mp3` 之外的选项都会
+# 回退为 WAV，这与免费上游服务的行为一致。
 ```
 
 ## ⚙️ 配置
 
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
 version_scheme = "no-guess-dev"
 local_scheme = "no-local-version"
 
-fallback_version = "3.3.0-alpha3"
+fallback_version = "3.3.0-alpha4"
 [tool.setuptools]
 packages = ["ttsfm"]
 
 
@@ -1,4 +1,5 @@
 import pytest
+import types
 
 from ttsfm.client import TTSClient
 from ttsfm.async_client import AsyncTTSClient
@@ -14,6 +15,50 @@ def _mk_response(data: bytes) -> TTSResponse:
     )
 
 
+class _DummyResponse:  # test double returned by the patched session.post in the tests below
+    def __init__(self, content_type: str, content: bytes, url: str = "https://example.test/audio"):
+        self.status_code = 200  # every instance reports a 200 status
+        self.headers = {"content-type": content_type}  # the only header this double carries
+        self.content = content  # raw body bytes supplied by the test
+        self.url = url
+        self.text = ""  # textual body is always empty
+
+    def json(self):  # pragma: no cover - not used on success path
+        return {}
+
+
+def test_sync_request_normalizes_non_mp3_format(monkeypatch):  # a FLAC request must be sent and reported as WAV
+    client = TTSClient()
+    captured = {}  # receives the `data` argument that fake_post is called with
+
+    def fake_post(self, url, data=None, headers=None, timeout=None, verify=None):  # stand-in for session.post
+        captured["data"] = data
+        return _DummyResponse("audio/wav", b"RIFF" + b"\x00" * 64, url)  # RIFF-prefixed bytes labelled audio/wav
+
+    monkeypatch.setattr(client.session, "post", types.MethodType(fake_post, client.session))  # bound so `self` is the session
+
+    response = client.generate_speech(text="hello", voice="alloy", response_format=AudioFormat.FLAC)
+
+    assert captured["data"]["response_format"] == "wav"  # outbound payload must carry wav, not flac
+    assert response.format is AudioFormat.WAV  # returned response is tagged as WAV
+
+
+def test_sync_request_preserves_mp3_format(monkeypatch):  # an MP3 request must stay MP3 end to end
+    client = TTSClient()
+    captured = {}  # receives the `data` argument that fake_post is called with
+
+    def fake_post(self, url, data=None, headers=None, timeout=None, verify=None):  # stand-in for session.post
+        captured["data"] = data
+        return _DummyResponse("audio/mpeg", b"ID3" + b"\x00" * 64, url)  # ID3-prefixed bytes labelled audio/mpeg
+
+    monkeypatch.setattr(client.session, "post", types.MethodType(fake_post, client.session))  # bound so `self` is the session
+
+    response = client.generate_speech(text="hello", voice="alloy", response_format=AudioFormat.MP3)
+
+    assert captured["data"]["response_format"] == "mp3"  # outbound payload keeps mp3 unchanged
+    assert response.format is AudioFormat.MP3  # returned response is tagged as MP3
+
+
 def test_sync_long_text_auto_combine(monkeypatch):
     client = TTSClient()
 
 
@@ -74,17 +74,12 @@ def from_wav(cls, buffer):
             cls.formats.append("wav")
             return DummySegment("wav")
 
-        @classmethod
-        def from_file(cls, buffer, format: str):
-            cls.formats.append(format)
-            return DummySegment(format)
-
     monkeypatch.setattr(audio_module, "AudioSegment", DummyAudioSegment)
 
     output = audio_module.combine_audio_chunks([b"one", b"two"], "opus")
 
-    assert output == b"opus:opusopus"
-    assert DummyAudioSegment.formats == ["opus", "opus"]
+    assert output == b"wav:wavwav"
+    assert DummyAudioSegment.formats == ["wav", "wav"]
 
 
 @pytest.mark.parametrize('header_name, header_value', [
 
@@ -29,6 +29,7 @@
 # Import the TTSFM package
 try:
     from ttsfm import TTSClient, Voice, AudioFormat, TTSException
+    from ttsfm.models import get_supported_format
     from ttsfm.audio import combine_audio_chunks
     from ttsfm.exceptions import APIException, NetworkException, ValidationException
     from ttsfm.utils import validate_text_length, split_text_by_length
@@ -37,6 +38,7 @@
     import sys
     sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
     from ttsfm import TTSClient, Voice, AudioFormat, TTSException
+    from ttsfm.models import get_supported_format
     from ttsfm.audio import combine_audio_chunks
     from ttsfm.exceptions import APIException, NetworkException, ValidationException
     from ttsfm.utils import validate_text_length, split_text_by_length
@@ -488,7 +490,12 @@ def generate_speech():
                 "error": f"Invalid format: {response_format}. Must be one of: {[f.value for f in AudioFormat]}"
             }), 400
 
-        logger.info(f"Generating speech: text='{text[:50]}...', voice={voice}, format={response_format}")
+        effective_format = get_supported_format(format_enum)
+
+        logger.info(
+            "Generating speech: text='%s...', voice=%s, requested_format=%s (effective=%s)",
+            text[:50], voice, response_format, effective_format.value
+        )
 
         client = create_tts_client()
         response = client.generate_speech(
@@ -503,7 +510,9 @@ def generate_speech():
         headers = {
             'Content-Disposition': f'attachment; filename="speech.{response.format.value}"',
             'X-Audio-Format': response.format.value,
-            'X-Audio-Size': str(response.size)
+            'X-Audio-Size': str(response.size),
+            'X-Requested-Format': format_enum.value,
+            'X-Effective-Format': effective_format.value
         }
 
         return Response(
@@ -559,16 +568,17 @@ def generate_speech_combined():
         if not text:
             return jsonify({"error": "Text is required"}), 400
 
+        try:
+            voice_enum = Voice(voice.lower())
+            format_enum = AudioFormat(response_format.lower())
+        except ValueError as e:
+            logger.warning(f"Invalid voice or format: {e}")
+            return jsonify({"error": "Invalid voice or format specified"}), 400
+
+        effective_format = get_supported_format(format_enum)
+
         # Check if text needs splitting
         if len(text) <= max_length:
-            # Text is short enough, use regular generation
-            try:
-                voice_enum = Voice(voice.lower())
-                format_enum = AudioFormat(response_format.lower())
-            except ValueError as e:
-                logger.warning(f"Invalid voice or format: {e}")
-                return jsonify({"error": "Invalid voice or format specified"}), 400
-
             client = create_tts_client()
 
             response = client.generate_speech(
@@ -584,7 +594,9 @@ def generate_speech_combined():
                 'Content-Disposition': f'attachment; filename="combined_speech.{response.format.value}"',
                 'X-Audio-Format': response.format.value,
                 'X-Audio-Size': str(response.size),
-                'X-Chunks-Combined': '1'
+                'X-Chunks-Combined': '1',
+                'X-Requested-Format': format_enum.value,
+                'X-Effective-Format': effective_format.value
             }
 
             return Response(
@@ -626,11 +638,12 @@ def generate_speech_combined():
         logger.info(f"Generated {len(responses)} chunks, combining into single audio file")
 
         # Extract audio data from responses
-        audio_chunks = [response.audio_data for response in responses]
+        audio_chunks = [resp.audio_data for resp in responses]
 
         # Combine audio chunks
         try:
-            combined_audio = combine_audio_chunks(audio_chunks, format_enum.value)
+            actual_format = responses[0].format
+            combined_audio = combine_audio_chunks(audio_chunks, actual_format.value)
         except Exception as e:
             logger.error(f"Failed to combine audio chunks: {e}")
             return jsonify({"error": "Failed to combine audio chunks"}), 500
@@ -644,11 +657,13 @@ def generate_speech_combined():
         logger.info(f"Successfully combined {len(responses)} chunks into single audio file ({len(combined_audio)} bytes)")
 
         combined_headers = {
-            'Content-Disposition': f'attachment; filename="combined_speech.{format_enum.value}"',
-            'X-Audio-Format': format_enum.value,
+            'Content-Disposition': f'attachment; filename="combined_speech.{actual_format.value}"',
+            'X-Audio-Format': actual_format.value,
             'X-Audio-Size': str(len(combined_audio)),
             'X-Chunks-Combined': str(len(responses)),
-            'X-Original-Text-Length': str(len(text))
+            'X-Original-Text-Length': str(len(text)),
+            'X-Requested-Format': format_enum.value,
+            'X-Effective-Format': get_supported_format(format_enum).value
         }
 
         return Response(
@@ -699,7 +714,7 @@ def get_status():
         return jsonify({
             "status": "online",
             "tts_service": "openai.fm (free)",
-            "package_version": "3.3.0-alpha3",
+            "package_version": "3.3.0-alpha4",
             "timestamp": datetime.now().isoformat()
         })
 
@@ -717,7 +732,7 @@ def health_check():
     """Simple health check endpoint."""
     return jsonify({
         "status": "healthy",
-        "package_version": "3.3.0-alpha3",
+        "package_version": "3.3.0-alpha4",
         "timestamp": datetime.now().isoformat()
     })
 
@@ -818,7 +833,12 @@ def openai_speech():
                 }
             }), 400
 
-        logger.info(f"OpenAI API: Generating speech: text='{input_text[:50]}...', voice={voice}, format={response_format}, auto_combine={auto_combine}")
+        effective_format = get_supported_format(format_enum)
+
+        logger.info(
+            "OpenAI API: Generating speech: text='%s...', voice=%s, requested_format=%s (effective=%s), auto_combine=%s",
+            input_text[:50], voice, response_format, effective_format.value, auto_combine
+        )
 
         client = create_tts_client()
 
@@ -847,8 +867,9 @@ def openai_speech():
                 }), 400
 
             # Extract audio data and combine
-            audio_chunks = [response.audio_data for response in responses]
-            combined_audio = combine_audio_chunks(audio_chunks, format_enum.value)
+            audio_chunks = [resp.audio_data for resp in responses]
+            actual_format = responses[0].format
+            combined_audio = combine_audio_chunks(audio_chunks, actual_format.value)
 
             if not combined_audio:
                 return jsonify({
@@ -865,12 +886,14 @@ def openai_speech():
 
             headers = {
                 'Content-Type': content_type,
-                'X-Audio-Format': format_enum.value,
+                'X-Audio-Format': actual_format.value,
                 'X-Audio-Size': str(len(combined_audio)),
                 'X-Chunks-Combined': str(len(responses)),
                 'X-Original-Text-Length': str(len(input_text)),
                 'X-Auto-Combine': 'true',
-                'X-Powered-By': 'TTSFM-OpenAI-Compatible'
+                'X-Powered-By': 'TTSFM-OpenAI-Compatible',
+                'X-Requested-Format': format_enum.value,
+                'X-Effective-Format': effective_format.value
             }
 
             return Response(
@@ -908,7 +931,9 @@ def openai_speech():
                 'X-Audio-Size': str(response.size),
                 'X-Chunks-Combined': '1',
                 'X-Auto-Combine': str(auto_combine).lower(),
-                'X-Powered-By': 'TTSFM-OpenAI-Compatible'
+                'X-Powered-By': 'TTSFM-OpenAI-Compatible',
+                'X-Requested-Format': format_enum.value,
+                'X-Effective-Format': effective_format.value
             }
 
             return Response(
 
@@ -39,7 +39,12 @@ const PlaygroundApp = (() => {
 
         checkAuthStatus();
         loadVoices();
-        loadFormats();
+
+        if (document.getElementById('format-select')) {
+            loadFormats();
+        } else {
+            state.format = 'mp3';
+        }
         updateCharCount();
         updateAudioSummary();
         updateActionButtons(false);
@@ -50,6 +55,10 @@ const PlaygroundApp = (() => {
         els.textInput = document.getElementById('text-input');
         els.voiceSelect = document.getElementById('voice-select');
         els.formatSelect = document.getElementById('format-select');
+        if (!els.formatSelect) {
+            els.formatSelect = document.createElement('select');
+            els.formatSelect.value = state.format;
+        }
         els.instructionsInput = document.getElementById('instructions-input');
         els.apiKeyInput = document.getElementById('api-key-input');
         els.maxLengthInput = document.getElementById('max-length-input');