Skip to content

Commit 4d661c6

Browse files
committed
improve long-text MP3 handling
Bump version to 3.3.0-beta1 across code, docs, and UI. Update long-text auto-combine flows to force WAV output when MP3 would require ffmpeg, preventing runtime errors on stock deployments. Add metadata to combined responses to indicate actual and requested formats. Update changelog and .gitignore for new artifacts and scripts.
1 parent c7ee15d commit 4d661c6

File tree

9 files changed

+100
-19
lines changed

9 files changed

+100
-19
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,5 @@ local_config.py
158158
# Claude
159159
.claude/
160160
VERSION_BUMP_GUIDE.md
161+
scripts/test_audio_generation.py
162+
/artifacts

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,19 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.3.0-beta1] - 2025-09-22
9+
10+
### Changed
11+
- Promoted the release to **beta1** now that HTTP endpoints are stable and long-text combining no longer depends on external ffmpeg tooling.
12+
- No additional functional changes beyond the alpha5 hotfixes; this is a stability re-tag for broader testing.
13+
814
## [3.3.0-alpha5] - 2025-09-19
915

1016
### Fixed
1117
- Restored lint compliance across the repo (flake8, import hygiene, line wrapping) so the release pipeline can publish successfully.
1218
- Hardened Docker smoke test to manage container lifecycle and surface logs when health checks fail.
19+
- Ensured Eventlet monkey patching happens before Flask imports to stop recursion-depth crashes and restore HTTP endpoint health.
20+
- Forced long-text auto-combine flows to request WAV when MP3 would require `ffmpeg`, avoiding runtime errors on stock deployments.
1321

1422
### Documentation
1523
- Updated release notes to point to the `alpha5` build.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
8686
version_scheme = "no-guess-dev"
8787
local_scheme = "no-local-version"
8888

89-
fallback_version = "3.3.0-alpha5"
89+
fallback_version = "3.3.0-beta1"
9090
[tool.setuptools]
9191
packages = ["ttsfm"]
9292

ttsfm-web/app.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,10 @@ def generate_speech():
531531
), 400
532532

533533
effective_format = get_supported_format(format_enum)
534+
if len(input_text) > max_length and auto_combine and effective_format is AudioFormat.MP3:
535+
effective_format = AudioFormat.WAV
536+
if len(text) > max_length and effective_format is AudioFormat.MP3:
537+
effective_format = AudioFormat.WAV
534538

535539
logger.info(
536540
"Generating speech: text='%s...', voice=%s, requested_format=%s (effective=%s)",
@@ -656,8 +660,9 @@ def generate_speech_combined():
656660
return jsonify({"error": "Invalid voice or format specified"}), 400
657661

658662
logger.info(
659-
"Generating combined speech for long text: %s characters, splitting into chunks",
663+
"Combining long text (%s chars) using format %s",
660664
len(text),
665+
effective_format.value,
661666
)
662667

663668
# Generate speech chunks
@@ -667,7 +672,7 @@ def generate_speech_combined():
667672
responses = client.generate_speech_long_text(
668673
text=text,
669674
voice=voice_enum,
670-
response_format=format_enum,
675+
response_format=effective_format,
671676
instructions=instructions,
672677
max_length=max_length,
673678
preserve_words=preserve_words,
@@ -717,8 +722,10 @@ def generate_speech_combined():
717722
'X-Audio-Size': str(len(combined_audio)),
718723
'X-Chunks-Combined': str(len(responses)),
719724
'X-Original-Text-Length': str(len(text)),
725+
'X-Auto-Combine': 'true',
726+
'X-Powered-By': 'TTSFM-OpenAI-Compatible',
720727
'X-Requested-Format': format_enum.value,
721-
'X-Effective-Format': get_supported_format(format_enum).value
728+
'X-Effective-Format': effective_format.value
722729
}
723730

724731
return Response(
@@ -770,7 +777,7 @@ def get_status():
770777
return jsonify({
771778
"status": "online",
772779
"tts_service": "openai.fm (free)",
773-
"package_version": "3.3.0-alpha5",
780+
"package_version": "3.3.0-beta1",
774781
"timestamp": datetime.now().isoformat()
775782
})
776783

@@ -789,7 +796,7 @@ def health_check():
789796
"""Simple health check endpoint."""
790797
return jsonify({
791798
"status": "healthy",
792-
"package_version": "3.3.0-alpha5",
799+
"package_version": "3.3.0-beta1",
793800
"timestamp": datetime.now().isoformat()
794801
})
795802

@@ -924,15 +931,16 @@ def openai_speech():
924931
if len(input_text) > max_length and auto_combine:
925932
# Long text with auto-combine enabled: split and combine
926933
logger.info(
927-
"Long text detected (%s chars), auto-combining enabled",
934+
"Long text detected (%s chars); auto-combining with format %s",
928935
len(input_text),
936+
effective_format.value,
929937
)
930938

931939
# Generate speech chunks
932940
responses = client.generate_speech_long_text(
933941
text=input_text,
934942
voice=voice_enum,
935-
response_format=format_enum,
943+
response_format=effective_format,
936944
instructions=instructions,
937945
max_length=max_length,
938946
preserve_words=True

ttsfm-web/run.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,9 @@
55

66
import eventlet
77

8-
# MUST be the first imports for eventlet to work properly
9-
from app import DEBUG, HOST, PORT, app, socketio
10-
118
eventlet.monkey_patch()
129

13-
# Now import the app
10+
from app import DEBUG, HOST, PORT, app, socketio
1411

1512
if __name__ == '__main__':
1613
print(f"Starting TTSFM with WebSocket support on {HOST}:{PORT}")

ttsfm-web/templates/base.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
<a class="navbar-brand" href="{{ url_for('index') }}">
8989
<i class="fas fa-microphone-alt me-2"></i>
9090
<span class="fw-bold">TTSFM</span>
91-
<span class="badge bg-primary ms-2 small">v3.3.0-alpha5</span>
91+
<span class="badge bg-primary ms-2 small">v3.3.0-beta1</span>
9292
</a>
9393

9494
<button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
159159
<div class="d-flex align-items-center">
160160
<i class="fas fa-microphone-alt me-2 text-primary"></i>
161161
<strong class="text-dark">TTSFM</strong>
162-
<span class="ms-2 text-muted">v3.3.0-alpha5</span>
162+
<span class="ms-2 text-muted">v3.3.0-beta1</span>
163163
</div>
164164
</div>
165165
<div class="col-md-6 text-md-end">

ttsfm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
)
6161
from .utils import split_text_by_length, validate_text_length
6262

63-
__version__ = "3.3.0-alpha5"
63+
__version__ = "3.3.0-beta1"
6464
__author__ = "dbcccc"
6565
__email__ = "[email protected]"
6666
__description__ = "Text-to-Speech API Client with OpenAI compatibility"

ttsfm/async_client.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,13 +218,15 @@ async def generate_speech_long_text(
218218
if not chunks:
219219
raise ValueError("No valid text chunks found after processing")
220220

221+
send_format = self._resolve_long_text_format(response_format, auto_combine)
222+
221223
# Create requests for all chunks
222224
requests = []
223225
for chunk in chunks:
224226
request = TTSRequest(
225227
input=chunk,
226228
voice=voice,
227-
response_format=response_format,
229+
response_format=send_format,
228230
instructions=instructions,
229231
max_length=max_length,
230232
validate_length=False, # We already split the text
@@ -236,10 +238,42 @@ async def generate_speech_long_text(
236238
responses = await self.generate_speech_batch(requests=requests)
237239

238240
if auto_combine:
239-
return combine_responses(responses)
241+
combined = combine_responses(responses)
242+
original_format = self._normalise_format_value(response_format)
243+
if combined.metadata is None:
244+
combined.metadata = {}
245+
combined.metadata.setdefault("actual_format", combined.format.value)
246+
if original_format != combined.format.value:
247+
combined.metadata["original_requested_format"] = original_format
248+
return combined
240249

241250
return responses
242251

252+
@staticmethod
253+
def _normalise_format_value(response_format: Union[AudioFormat, str]) -> str:
254+
if isinstance(response_format, AudioFormat):
255+
return response_format.value
256+
return str(response_format).lower()
257+
258+
def _resolve_long_text_format(
259+
self,
260+
response_format: Union[AudioFormat, str],
261+
auto_combine: bool,
262+
) -> Union[AudioFormat, str]:
263+
if not auto_combine:
264+
return response_format
265+
266+
fmt_value = self._normalise_format_value(response_format)
267+
try:
268+
fmt_enum = AudioFormat(fmt_value)
269+
except ValueError:
270+
return AudioFormat.WAV
271+
272+
if fmt_enum is AudioFormat.MP3:
273+
return AudioFormat.WAV
274+
275+
return response_format
276+
243277
async def generate_speech_from_long_text(
244278
self,
245279
text: str,

ttsfm/client.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,18 +317,50 @@ def generate_speech_long_text(
317317
responses = self.generate_speech_batch(
318318
text=text,
319319
voice=voice,
320-
response_format=response_format,
320+
response_format=self._resolve_long_text_format(response_format, auto_combine),
321321
instructions=instructions,
322322
max_length=max_length,
323323
preserve_words=preserve_words,
324324
**kwargs
325325
)
326326

327327
if auto_combine:
328-
return combine_responses(responses)
328+
combined = combine_responses(responses)
329+
original_format = self._normalise_format_value(response_format)
330+
if combined.metadata is None:
331+
combined.metadata = {}
332+
combined.metadata.setdefault("actual_format", combined.format.value)
333+
if original_format != combined.format.value:
334+
combined.metadata["original_requested_format"] = original_format
335+
return combined
329336

330337
return responses
331338

339+
@staticmethod
340+
def _normalise_format_value(response_format: Union[AudioFormat, str]) -> str:
341+
if isinstance(response_format, AudioFormat):
342+
return response_format.value
343+
return str(response_format).lower()
344+
345+
def _resolve_long_text_format(
346+
self,
347+
response_format: Union[AudioFormat, str],
348+
auto_combine: bool,
349+
) -> Union[AudioFormat, str]:
350+
if not auto_combine:
351+
return response_format
352+
353+
fmt_value = self._normalise_format_value(response_format)
354+
try:
355+
fmt_enum = AudioFormat(fmt_value)
356+
except ValueError:
357+
return AudioFormat.WAV
358+
359+
if fmt_enum is AudioFormat.MP3:
360+
return AudioFormat.WAV
361+
362+
return response_format
363+
332364
def _make_request(self, request: TTSRequest) -> TTSResponse:
333365
"""
334366
Make the actual HTTP request to the openai.fm TTS service.

0 commit comments

Comments (0)