Improve audio combining and text splitting logic

dbccccccc · dbccccccc · commit d81a04a3d907 · 2025-09-18T09:59:02.000+08:00
Updated audio combining in app.py to pass an explicit format hint to the decoder for formats other than MP3 and WAV (which keep their dedicated loaders), fixing OPUS/AAC/FLAC/PCM support. Enhanced split_text_by_length in utils.py to handle oversized sentences and extremely long words with a new fallback splitter. Ensured WebSocketTTSHandler properly closes TTSClient to prevent resource leaks. Clarified the auto-combine feature in the README files and added tests for the new text splitting and audio combining behaviors.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ TTSFM provides both synchronous and asynchronous Python clients for text-to-spee
 - 🔧 **CLI Tool** - Command-line interface for quick TTS generation
 - 📦 **Type Hints** - Full type annotation support for better IDE experience
 - 🛡️ **Error Handling** - Comprehensive exception hierarchy with retry logic
-- ✨ **Auto-Combine** - Automatically handles long text with seamless audio combining
+- ✨ **Auto-Combine** (Web API) - Docker/OpenAI-compatible endpoint can split and merge long text for you
 - 📊 **Text Validation** - Automatic text length validation and splitting
 - 🔐 **API Key Protection** - Optional OpenAI-compatible authentication for secure deployments
 
diff --git a/README.zh.md b/README.zh.md
@@ -26,7 +26,7 @@ TTSFM为文本转语音生成提供同步和异步Python客户端，使用逆向
 - 🔧 **CLI工具** - 用于快速TTS生成的命令行界面
 - 📦 **类型提示** - 完整的类型注解支持，提供更好的IDE体验
 - 🛡️ **错误处理** - 全面的异常层次结构和重试逻辑
-- ✨ **自动合并** - 自动处理长文本，无缝音频合并
+- ✨ **自动合并（Web API）** - Docker / OpenAI 兼容端点可自动分割并合并长文本
 - 📊 **文本验证** - 自动文本长度验证和分割
 - 🔐 **API密钥保护** - 可选的OpenAI兼容身份验证，用于安全部署
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -14,6 +14,25 @@ def test_split_text_preserves_sentence_punctuation():
     assert chunks[2].endswith("."), chunks
 
 
+def test_split_text_handles_oversized_sentence():
+    long_sentence = " ".join(["word"] * 600)
+    chunks = utils.split_text_by_length(long_sentence, max_length=120)
+
+    assert all(len(chunk) <= 120 for chunk in chunks)
+    assert sum(len(chunk.split()) for chunk in chunks) == 600
+
+
+def test_split_text_handles_extremely_long_word():
+    max_length = 50
+    painful_word = "a" * 140
+    text = f"start {painful_word} end"
+
+    chunks = utils.split_text_by_length(text, max_length=max_length)
+
+    assert any(painful_word[:max_length] in chunk for chunk in chunks)
+    assert all(len(chunk) <= max_length for chunk in chunks)
+
+
 def test_sanitize_text_retains_ampersands():
     text = "R&D and Fish & Chips &amp; Co. <b>Bold</b>"
     sanitized = utils.sanitize_text(text)
diff --git a/tests/test_web_app.py b/tests/test_web_app.py
@@ -45,6 +45,46 @@ def test_voices_endpoint_returns_data(monkeypatch):
     assert payload['count'] == len(payload['voices'])
 
 
+def test_combine_audio_chunks_uses_format_hint(monkeypatch):
+    module = load_web_app(monkeypatch, REQUIRE_API_KEY='false', TTSFM_API_KEY=None)
+
+    class DummySegment:
+        def __init__(self, tag: str):
+            self.tag = tag
+
+        def __iadd__(self, other: "DummySegment"):
+            self.tag += other.tag
+            return self
+
+        def export(self, buffer, format: str):
+            buffer.write(f"{format}:{self.tag}".encode())
+
+    class DummyAudioSegment:
+        formats = []
+
+        @classmethod
+        def from_mp3(cls, buffer):
+            cls.formats.append("mp3")
+            return DummySegment("mp3")
+
+        @classmethod
+        def from_wav(cls, buffer):
+            cls.formats.append("wav")
+            return DummySegment("wav")
+
+        @classmethod
+        def from_file(cls, buffer, format: str):
+            cls.formats.append(format)
+            return DummySegment(format)
+
+    monkeypatch.setattr(module, "AudioSegment", DummyAudioSegment)
+
+    output = module.combine_audio_chunks([b"one", b"two"], "opus")
+
+    assert output == b"opus:opusopus"
+    assert DummyAudioSegment.formats == ["opus", "opus"]
+
+
 @pytest.mark.parametrize('header_name, header_value', [
     ('Authorization', 'Bearer super-secret'),
     ('X-API-Key', 'super-secret'),
diff --git a/ttsfm-web/app.py b/ttsfm-web/app.py
@@ -292,10 +292,9 @@ def combine_audio_chunks(audio_chunks: List[bytes], format_type: str = "mp3") ->
                 segment = AudioSegment.from_mp3(buffer)
             elif fmt == "wav":
                 segment = AudioSegment.from_wav(buffer)
-            elif fmt == "opus":
-                segment = AudioSegment.from_wav(buffer)
             else:
-                segment = AudioSegment.from_file(buffer)
+                # OPUS/FLAC/AAC/PCM all require an explicit decoder hint
+                segment = AudioSegment.from_file(buffer, format=fmt)
             audio_segments.append(segment)
 
         combined = audio_segments[0]
diff --git a/ttsfm-web/websocket_handler.py b/ttsfm-web/websocket_handler.py
@@ -235,6 +235,10 @@ def _generate_stream(self, session_id: str, request_id: str, data: Dict[str, Any
             logger.error(f"Stream generation failed: {str(e)}")
             self._emit_error(session_id, request_id, str(e))
         finally:
+            try:
+                client.close()
+            except Exception as exc:  # pragma: no cover - defensive cleanup
+                logger.debug("Failed to close TTS client cleanly: %s", exc)
             self._remove_task(session_id, request_id)
 
     def _emit_error(self, session_id: str, request_id: str, error_message: str):
diff --git a/ttsfm/utils.py b/ttsfm/utils.py
@@ -227,6 +227,61 @@ def _split_into_sentences(text: str) -> List[str]:
     return sentences
 
 
+def _split_long_segment(segment: str, max_length: int) -> List[str]:
+    """Fallback splitter for oversized segments."""
+    if len(segment) <= max_length:
+        return [segment]
+
+    parts: List[str] = []
+    words = segment.split()
+
+    if not words:
+        for i in range(0, len(segment), max_length):
+            chunk = segment[i:i + max_length]
+            if chunk.strip():
+                parts.append(chunk)
+        return parts
+
+    current_words: List[str] = []
+    current_len = 0
+
+    for word in words:
+        word_len = len(word)
+
+        if word_len > max_length:
+            if current_words:
+                parts.append(' '.join(current_words))
+                current_words = []
+                current_len = 0
+
+            for i in range(0, word_len, max_length):
+                chunk = word[i:i + max_length]
+                if chunk.strip():
+                    parts.append(chunk)
+            continue
+
+        separator = 1 if current_words else 0
+        proposed = current_len + word_len + separator
+
+        if proposed <= max_length:
+            if separator:
+                current_len += 1
+            current_words.append(word)
+            current_len += word_len
+            continue
+
+        if current_words:
+            parts.append(' '.join(current_words))
+
+        current_words = [word]
+        current_len = word_len
+
+    if current_words:
+        parts.append(' '.join(current_words))
+
+    return parts
+
+
 def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool = True) -> List[str]:
     """Split text into chunks no longer than ``max_length`` characters."""
     if not text:
@@ -236,6 +291,8 @@ def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool
         return [text]
 
     chunks: List[str] = []
+    effective_max = max(1, max_length)
+    tolerance = min(32, max(8, effective_max // 10))
 
     if preserve_words:
         sentences = _split_into_sentences(text)
@@ -257,8 +314,8 @@ def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool
             if current_segment:
                 chunks.append(' '.join(current_segment))
 
-            if len(sentence) > max_length:
-                chunks.append(sentence)
+            if len(sentence) > effective_max + tolerance:
+                chunks.extend(_split_long_segment(sentence, max_length))
                 current_segment = []
                 current_length = 0
                 continue