Skip to content

Commit 4d661c6

Browse files
committed
improve long-text MP3 handling
Bump version to 3.3.0-beta1 across code, docs, and UI. Update long-text auto-combine flows to force WAV output when MP3 would require ffmpeg, preventing runtime errors on stock deployments. Add metadata to combined responses to indicate actual and requested formats. Update changelog and .gitignore for new artifacts and scripts.
1 parent c7ee15d commit 4d661c6

File tree

9 files changed

+100
-19
lines changed

9 files changed

+100
-19
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,5 @@ local_config.py
158158
# Claude
159159
.claude/
160160
VERSION_BUMP_GUIDE.md
161+
scripts/test_audio_generation.py
162+
/artifacts

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,19 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.3.0-beta1] - 2025-09-22
9+
10+
### Changed
11+
- Promoted the release to **beta1** now that HTTP endpoints are stable and long-text combining no longer depends on external ffmpeg tooling.
12+
- No additional functional changes beyond the alpha5 hotfixes; this is a stability re-tag for broader testing.
13+
814
## [3.3.0-alpha5] - 2025-09-19
915

1016
### Fixed
1117
- Restored lint compliance across the repo (flake8, import hygiene, line wrapping) so the release pipeline can publish successfully.
1218
- Hardened Docker smoke test to manage container lifecycle and surface logs when health checks fail.
19+
- Ensured Eventlet monkey patching happens before Flask imports to stop recursion-depth crashes and restore HTTP endpoint health.
20+
- Forced long-text auto-combine flows to request WAV when MP3 would require `ffmpeg`, avoiding runtime errors on stock deployments.
1321

1422
### Documentation
1523
- Updated release notes to point to the `alpha5` build.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
8686
version_scheme = "no-guess-dev"
8787
local_scheme = "no-local-version"
8888

89-
fallback_version = "3.3.0-alpha5"
89+
fallback_version = "3.3.0-beta1"
9090
[tool.setuptools]
9191
packages = ["ttsfm"]
9292

ttsfm-web/app.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,10 @@ def generate_speech():
531531
), 400
532532

533533
effective_format = get_supported_format(format_enum)
534+
if len(input_text) > max_length and auto_combine and effective_format is AudioFormat.MP3:
535+
effective_format = AudioFormat.WAV
536+
if len(text) > max_length and effective_format is AudioFormat.MP3:
537+
effective_format = AudioFormat.WAV
534538

535539
logger.info(
536540
"Generating speech: text='%s...', voice=%s, requested_format=%s (effective=%s)",
@@ -656,8 +660,9 @@ def generate_speech_combined():
656660
return jsonify({"error": "Invalid voice or format specified"}), 400
657661

658662
logger.info(
659-
"Generating combined speech for long text: %s characters, splitting into chunks",
663+
"Combining long text (%s chars) using format %s",
660664
len(text),
665+
effective_format.value,
661666
)
662667

663668
# Generate speech chunks
@@ -667,7 +672,7 @@ def generate_speech_combined():
667672
responses = client.generate_speech_long_text(
668673
text=text,
669674
voice=voice_enum,
670-
response_format=format_enum,
675+
response_format=effective_format,
671676
instructions=instructions,
672677
max_length=max_length,
673678
preserve_words=preserve_words,
@@ -717,8 +722,10 @@ def generate_speech_combined():
717722
'X-Audio-Size': str(len(combined_audio)),
718723
'X-Chunks-Combined': str(len(responses)),
719724
'X-Original-Text-Length': str(len(text)),
725+
'X-Auto-Combine': 'true',
726+
'X-Powered-By': 'TTSFM-OpenAI-Compatible',
720727
'X-Requested-Format': format_enum.value,
721-
'X-Effective-Format': get_supported_format(format_enum).value
728+
'X-Effective-Format': effective_format.value
722729
}
723730

724731
return Response(
@@ -770,7 +777,7 @@ def get_status():
770777
return jsonify({
771778
"status": "online",
772779
"tts_service": "openai.fm (free)",
773-
"package_version": "3.3.0-alpha5",
780+
"package_version": "3.3.0-beta1",
774781
"timestamp": datetime.now().isoformat()
775782
})
776783

@@ -789,7 +796,7 @@ def health_check():
789796
"""Simple health check endpoint."""
790797
return jsonify({
791798
"status": "healthy",
792-
"package_version": "3.3.0-alpha5",
799+
"package_version": "3.3.0-beta1",
793800
"timestamp": datetime.now().isoformat()
794801
})
795802

@@ -924,15 +931,16 @@ def openai_speech():
924931
if len(input_text) > max_length and auto_combine:
925932
# Long text with auto-combine enabled: split and combine
926933
logger.info(
927-
"Long text detected (%s chars), auto-combining enabled",
934+
"Long text detected (%s chars); auto-combining with format %s",
928935
len(input_text),
936+
effective_format.value,
929937
)
930938

931939
# Generate speech chunks
932940
responses = client.generate_speech_long_text(
933941
text=input_text,
934942
voice=voice_enum,
935-
response_format=format_enum,
943+
response_format=effective_format,
936944
instructions=instructions,
937945
max_length=max_length,
938946
preserve_words=True

ttsfm-web/run.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,9 @@
55

66
import eventlet
77

8-
# MUST be the first imports for eventlet to work properly
9-
from app import DEBUG, HOST, PORT, app, socketio
10-
118
eventlet.monkey_patch()
129

13-
# Now import the app
10+
from app import DEBUG, HOST, PORT, app, socketio
1411

1512
if __name__ == '__main__':
1613
print(f"Starting TTSFM with WebSocket support on {HOST}:{PORT}")

ttsfm-web/templates/base.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
<a class="navbar-brand" href="{{ url_for('index') }}">
8989
<i class="fas fa-microphone-alt me-2"></i>
9090
<span class="fw-bold">TTSFM</span>
91-
<span class="badge bg-primary ms-2 small">v3.3.0-alpha5</span>
91+
<span class="badge bg-primary ms-2 small">v3.3.0-beta1</span>
9292
</a>
9393

9494
<button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
159159
<div class="d-flex align-items-center">
160160
<i class="fas fa-microphone-alt me-2 text-primary"></i>
161161
<strong class="text-dark">TTSFM</strong>
162-
<span class="ms-2 text-muted">v3.3.0-alpha5</span>
162+
<span class="ms-2 text-muted">v3.3.0-beta1</span>
163163
</div>
164164
</div>
165165
<div class="col-md-6 text-md-end">

ttsfm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
)
6161
from .utils import split_text_by_length, validate_text_length
6262

63-
__version__ = "3.3.0-alpha5"
63+
__version__ = "3.3.0-beta1"
6464
__author__ = "dbcccc"
6565
__email__ = "[email protected]"
6666
__description__ = "Text-to-Speech API Client with OpenAI compatibility"

ttsfm/async_client.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,13 +218,15 @@ async def generate_speech_long_text(
218218
if not chunks:
219219
raise ValueError("No valid text chunks found after processing")
220220

221+
send_format = self._resolve_long_text_format(response_format, auto_combine)
222+
221223
# Create requests for all chunks
222224
requests = []
223225
for chunk in chunks:
224226
request = TTSRequest(
225227
input=chunk,
226228
voice=voice,
227-
response_format=response_format,
229+
response_format=send_format,
228230
instructions=instructions,
229231
max_length=max_length,
230232
validate_length=False, # We already split the text
@@ -236,10 +238,42 @@ async def generate_speech_long_text(
236238
responses = await self.generate_speech_batch(requests=requests)
237239

238240
if auto_combine:
239-
return combine_responses(responses)
241+
combined = combine_responses(responses)
242+
original_format = self._normalise_format_value(response_format)
243+
if combined.metadata is None:
244+
combined.metadata = {}
245+
combined.metadata.setdefault("actual_format", combined.format.value)
246+
if original_format != combined.format.value:
247+
combined.metadata["original_requested_format"] = original_format
248+
return combined
240249

241250
return responses
242251

252+
@staticmethod
253+
def _normalise_format_value(response_format: Union[AudioFormat, str]) -> str:
254+
if isinstance(response_format, AudioFormat):
255+
return response_format.value
256+
return str(response_format).lower()
257+
258+
def _resolve_long_text_format(
259+
self,
260+
response_format: Union[AudioFormat, str],
261+
auto_combine: bool,
262+
) -> Union[AudioFormat, str]:
263+
if not auto_combine:
264+
return response_format
265+
266+
fmt_value = self._normalise_format_value(response_format)
267+
try:
268+
fmt_enum = AudioFormat(fmt_value)
269+
except ValueError:
270+
return AudioFormat.WAV
271+
272+
if fmt_enum is AudioFormat.MP3:
273+
return AudioFormat.WAV
274+
275+
return response_format
276+
243277
async def generate_speech_from_long_text(
244278
self,
245279
text: str,

ttsfm/client.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,18 +317,50 @@ def generate_speech_long_text(
317317
responses = self.generate_speech_batch(
318318
text=text,
319319
voice=voice,
320-
response_format=response_format,
320+
response_format=self._resolve_long_text_format(response_format, auto_combine),
321321
instructions=instructions,
322322
max_length=max_length,
323323
preserve_words=preserve_words,
324324
**kwargs
325325
)
326326

327327
if auto_combine:
328-
return combine_responses(responses)
328+
combined = combine_responses(responses)
329+
original_format = self._normalise_format_value(response_format)
330+
if combined.metadata is None:
331+
combined.metadata = {}
332+
combined.metadata.setdefault("actual_format", combined.format.value)
333+
if original_format != combined.format.value:
334+
combined.metadata["original_requested_format"] = original_format
335+
return combined
329336

330337
return responses
331338

339+
@staticmethod
340+
def _normalise_format_value(response_format: Union[AudioFormat, str]) -> str:
341+
if isinstance(response_format, AudioFormat):
342+
return response_format.value
343+
return str(response_format).lower()
344+
345+
def _resolve_long_text_format(
346+
self,
347+
response_format: Union[AudioFormat, str],
348+
auto_combine: bool,
349+
) -> Union[AudioFormat, str]:
350+
if not auto_combine:
351+
return response_format
352+
353+
fmt_value = self._normalise_format_value(response_format)
354+
try:
355+
fmt_enum = AudioFormat(fmt_value)
356+
except ValueError:
357+
return AudioFormat.WAV
358+
359+
if fmt_enum is AudioFormat.MP3:
360+
return AudioFormat.WAV
361+
362+
return response_format
363+
332364
def _make_request(self, request: TTSRequest) -> TTSResponse:
333365
"""
334366
Make the actual HTTP request to the openai.fm TTS service.

0 commit comments

Comments (0)