Skip to content

Commit d81a04a

Browse files
committed
Improve audio combining and text splitting logic
Updated audio combining in app.py to use format-aware decoding for non-WAV formats, fixing OPUS/AAC/FLAC/PCM support. Enhanced split_text_by_length in utils.py to handle oversized sentences and extremely long words with a new fallback splitter. Ensured WebSocketTTSHandler properly closes TTSClient to prevent resource leaks. Clarified auto-combine feature in README files and added tests for new text splitting and audio combining behaviors.
1 parent eb38073 commit d81a04a

File tree

7 files changed

+126
-7
lines changed

7 files changed

+126
-7
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ TTSFM provides both synchronous and asynchronous Python clients for text-to-spee
2626
- 🔧 **CLI Tool** - Command-line interface for quick TTS generation
2727
- 📦 **Type Hints** - Full type annotation support for better IDE experience
2828
- 🛡️ **Error Handling** - Comprehensive exception hierarchy with retry logic
29-
-**Auto-Combine** - Automatically handles long text with seamless audio combining
29+
-**Auto-Combine** (Web API) - Docker/OpenAI-compatible endpoint can split and merge long text for you
3030
- 📊 **Text Validation** - Automatic text length validation and splitting
3131
- 🔐 **API Key Protection** - Optional OpenAI-compatible authentication for secure deployments
3232

README.zh.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ TTSFM为文本转语音生成提供同步和异步Python客户端,使用逆向
2626
- 🔧 **CLI工具** - 用于快速TTS生成的命令行界面
2727
- 📦 **类型提示** - 完整的类型注解支持,提供更好的IDE体验
2828
- 🛡️ **错误处理** - 全面的异常层次结构和重试逻辑
29-
-**自动合并** - 自动处理长文本,无缝音频合并
29+
-**自动合并(Web API)** - Docker / OpenAI 兼容端点可自动分割并合并长文本
3030
- 📊 **文本验证** - 自动文本长度验证和分割
3131
- 🔐 **API密钥保护** - 可选的OpenAI兼容身份验证,用于安全部署
3232

tests/test_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,25 @@ def test_split_text_preserves_sentence_punctuation():
1414
assert chunks[2].endswith("."), chunks
1515

1616

17+
def test_split_text_handles_oversized_sentence():
    """A 600-word text with no sentence punctuation must still be chunked.

    Every chunk has to respect the 120-character limit, and no word may be
    lost or duplicated across the chunks.
    """
    word_count = 600
    limit = 120
    text = " ".join("word" for _ in range(word_count))

    pieces = utils.split_text_by_length(text, max_length=limit)

    # Size guarantee: nothing longer than the requested limit.
    oversized = [piece for piece in pieces if len(piece) > limit]
    assert not oversized

    # Content guarantee: the total number of words is unchanged.
    words_seen = 0
    for piece in pieces:
        words_seen += len(piece.split())
    assert words_seen == word_count
23+
24+
25+
def test_split_text_handles_extremely_long_word():
    """A single 140-character word must be hard-split to fit a 50-char limit."""
    limit = 50
    giant = "a" * 140
    sample = "start " + giant + " end"

    pieces = utils.split_text_by_length(sample, max_length=limit)

    # The first ``limit`` characters of the giant word must show up in some chunk.
    expected_head = giant[:limit]
    holders = [piece for piece in pieces if expected_head in piece]
    assert holders

    # No chunk is allowed to exceed the limit.
    too_long = [piece for piece in pieces if len(piece) > limit]
    assert not too_long
34+
35+
1736
def test_sanitize_text_retains_ampersands():
1837
text = "R&D and Fish & Chips &amp; Co. <b>Bold</b>"
1938
sanitized = utils.sanitize_text(text)

tests/test_web_app.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,46 @@ def test_voices_endpoint_returns_data(monkeypatch):
4545
assert payload['count'] == len(payload['voices'])
4646

4747

48+
def test_combine_audio_chunks_uses_format_hint(monkeypatch):
    """``combine_audio_chunks`` must decode OPUS chunks with an explicit format.

    ``AudioSegment`` is swapped for a fake that records which decoder entry
    point was used for each chunk, so the test can verify that the ``"opus"``
    hint was passed through instead of an MP3/WAV-specific loader.
    """
    web_app = load_web_app(monkeypatch, REQUIRE_API_KEY='false', TTSFM_API_KEY=None)

    class FakeSegment:
        """Stand-in for a decoded segment; concatenation appends tags."""

        def __init__(self, tag: str):
            self.tag = tag

        def __iadd__(self, other: "FakeSegment"):
            self.tag = self.tag + other.tag
            return self

        def export(self, buffer, format: str):
            # Encode both the export format and the accumulated tags so the
            # final bytes reveal exactly what was combined.
            payload = f"{format}:{self.tag}"
            buffer.write(payload.encode())

    class FakeAudioSegment:
        """Stand-in for ``AudioSegment`` that logs the format of every decode."""

        formats = []

        @classmethod
        def _decode(cls, fmt: str):
            cls.formats.append(fmt)
            return FakeSegment(fmt)

        @classmethod
        def from_mp3(cls, buffer):
            return cls._decode("mp3")

        @classmethod
        def from_wav(cls, buffer):
            return cls._decode("wav")

        @classmethod
        def from_file(cls, buffer, format: str):
            return cls._decode(format)

    monkeypatch.setattr(web_app, "AudioSegment", FakeAudioSegment)

    combined = web_app.combine_audio_chunks([b"one", b"two"], "opus")

    # Both chunks were decoded as OPUS and exported as OPUS.
    assert combined == b"opus:opusopus"
    assert FakeAudioSegment.formats == ["opus", "opus"]
86+
87+
4888
@pytest.mark.parametrize('header_name, header_value', [
4989
('Authorization', 'Bearer super-secret'),
5090
('X-API-Key', 'super-secret'),

ttsfm-web/app.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -292,10 +292,9 @@ def combine_audio_chunks(audio_chunks: List[bytes], format_type: str = "mp3") ->
292292
segment = AudioSegment.from_mp3(buffer)
293293
elif fmt == "wav":
294294
segment = AudioSegment.from_wav(buffer)
295-
elif fmt == "opus":
296-
segment = AudioSegment.from_wav(buffer)
297295
else:
298-
segment = AudioSegment.from_file(buffer)
296+
# OPUS/FLAC/AAC/PCM all require an explicit decoder hint
297+
segment = AudioSegment.from_file(buffer, format=fmt)
299298
audio_segments.append(segment)
300299

301300
combined = audio_segments[0]

ttsfm-web/websocket_handler.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,10 @@ def _generate_stream(self, session_id: str, request_id: str, data: Dict[str, Any
235235
logger.error(f"Stream generation failed: {str(e)}")
236236
self._emit_error(session_id, request_id, str(e))
237237
finally:
238+
try:
239+
client.close()
240+
except Exception as exc: # pragma: no cover - defensive cleanup
241+
logger.debug("Failed to close TTS client cleanly: %s", exc)
238242
self._remove_task(session_id, request_id)
239243

240244
def _emit_error(self, session_id: str, request_id: str, error_message: str):

ttsfm/utils.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,61 @@ def _split_into_sentences(text: str) -> List[str]:
227227
return sentences
228228

229229

230+
def _split_long_segment(segment: str, max_length: int) -> List[str]:
231+
"""Fallback splitter for oversized segments."""
232+
if len(segment) <= max_length:
233+
return [segment]
234+
235+
parts: List[str] = []
236+
words = segment.split()
237+
238+
if not words:
239+
for i in range(0, len(segment), max_length):
240+
chunk = segment[i:i + max_length]
241+
if chunk.strip():
242+
parts.append(chunk)
243+
return parts
244+
245+
current_words: List[str] = []
246+
current_len = 0
247+
248+
for word in words:
249+
word_len = len(word)
250+
251+
if word_len > max_length:
252+
if current_words:
253+
parts.append(' '.join(current_words))
254+
current_words = []
255+
current_len = 0
256+
257+
for i in range(0, word_len, max_length):
258+
chunk = word[i:i + max_length]
259+
if chunk.strip():
260+
parts.append(chunk)
261+
continue
262+
263+
separator = 1 if current_words else 0
264+
proposed = current_len + word_len + separator
265+
266+
if proposed <= max_length:
267+
if separator:
268+
current_len += 1
269+
current_words.append(word)
270+
current_len += word_len
271+
continue
272+
273+
if current_words:
274+
parts.append(' '.join(current_words))
275+
276+
current_words = [word]
277+
current_len = word_len
278+
279+
if current_words:
280+
parts.append(' '.join(current_words))
281+
282+
return parts
283+
284+
230285
def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool = True) -> List[str]:
231286
"""Split text into chunks no longer than ``max_length`` characters."""
232287
if not text:
@@ -236,6 +291,8 @@ def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool
236291
return [text]
237292

238293
chunks: List[str] = []
294+
effective_max = max(1, max_length)
295+
tolerance = min(32, max(8, effective_max // 10))
239296

240297
if preserve_words:
241298
sentences = _split_into_sentences(text)
@@ -257,8 +314,8 @@ def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool
257314
if current_segment:
258315
chunks.append(' '.join(current_segment))
259316

260-
if len(sentence) > max_length:
261-
chunks.append(sentence)
317+
if len(sentence) > effective_max + tolerance:
318+
chunks.extend(_split_long_segment(sentence, max_length))
262319
current_segment = []
263320
current_length = 0
264321
continue

0 commit comments

Comments
 (0)