Add centralized audio combining and auto-combine

dbccccccc · dbccccccc · commit 9f64f300b2a4 · 2025-09-18T11:01:36.000+08:00
Introduces ttsfm/audio.py with reusable audio chunk combining logic and a new combine_responses helper. Adds auto_combine support to both sync and async clients and CLI, enabling single-file output for long text. Updates documentation and tests to cover the new behavior, and bumps version to 3.3.0-alpha3.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.3.0-alpha3] - 2025-09-18
+
+### Added
+- Centralised audio chunk combining in `ttsfm/audio.py`, including the reusable `combine_responses` helper for both core and web flows.
+- `auto_combine=True` support in the synchronous/asynchronous clients and CLI delivers a single audio file for long text (pydub is required only for non-WAV output; WAV is combined without it).
+- Regression tests (`tests/test_clients.py`) covering the new combination paths.
+
+### Changed
+- Long-text splitting now falls back to word-level chunks with a small tolerance so punctuation stays intact while respecting `max_length` limits.
+
+### Documentation
+- README (EN/ZH) highlights the Python auto-combine option and CLI flag; `AI_NOTES.md` captures the refreshed test instructions.
+
+### Testing
+- Added regression coverage for the audio helper refactor and client auto-combine behaviour; `pytest` commands documented for follow-up runs.
+
 ## [3.3.0-alpha2] - 2025-09-18
 
 ### Changed
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ TTSFM provides both synchronous and asynchronous Python clients for text-to-spee
 - 🔧 **CLI Tool** - Command-line interface for quick TTS generation
 - 📦 **Type Hints** - Full type annotation support for better IDE experience
 - 🛡️ **Error Handling** - Comprehensive exception hierarchy with retry logic
-- ✨ **Auto-Combine** (Web API) - Docker/OpenAI-compatible endpoint can split and merge long text for you
+- ✨ **Auto-Combine** - Web/OpenAI endpoints merge long text automatically; Python client can opt in with `auto_combine=True`
 - 📊 **Text Validation** - Automatic text length validation and splitting
 - 🔐 **API Key Protection** - Optional OpenAI-compatible authentication for secure deployments
 
@@ -168,11 +168,22 @@ responses = client.generate_speech_long_text(
     preserve_words=True
 )
 
-# Save each chunk as separate files
 for i, response in enumerate(responses, 1):
-    response.save_to_file(f"part_{i:03d}")  # Saves as part_001.mp3, part_002.mp3, etc.
+    response.save_to_file(f"part_{i:03d}")
 
 print(f"Generated {len(responses)} audio files from long text")
+
+# Or combine everything into a single response (requires pydub for non-WAV formats)
+combined = client.generate_speech_long_text(
+    text="Very long text that exceeds 4096 characters...",
+    voice=Voice.ALLOY,
+    response_format=AudioFormat.MP3,
+    max_length=2000,
+    preserve_words=True,
+    auto_combine=True,
+)
+
+combined.save_to_file("long_text")  # Saves as long_text.mp3
 ```
 
 #### OpenAI Python Client Compatibility
@@ -256,6 +267,9 @@ ttsfm --text-file input.txt --output speech.mp3
 # Custom service URL
 ttsfm "Hello, world!" --url http://localhost:7000 --output hello.mp3
 
+# Auto-combine long text into a single file
+ttsfm --text-file article.txt --output article.mp3 --split-long-text --auto-combine
+
 # List available voices
 ttsfm --list-voices
 
diff --git a/README.zh.md b/README.zh.md
@@ -26,7 +26,7 @@ TTSFM为文本转语音生成提供同步和异步Python客户端，使用逆向
 - 🔧 **CLI工具** - 用于快速TTS生成的命令行界面
 - 📦 **类型提示** - 完整的类型注解支持，提供更好的IDE体验
 - 🛡️ **错误处理** - 全面的异常层次结构和重试逻辑
-- ✨ **自动合并（Web API）** - Docker / OpenAI 兼容端点可自动分割并合并长文本
+- ✨ **自动合并** - Web/OpenAI 端点自动处理长文本；Python 客户端可通过 `auto_combine=True` 合并音频
 - 📊 **文本验证** - 自动文本长度验证和分割
 - 🔐 **API密钥保护** - 可选的OpenAI兼容身份验证，用于安全部署
 
@@ -161,6 +161,18 @@ for i, response in enumerate(responses, 1):
     response.save_to_file(f"part_{i:03d}")  # 保存为part_001.mp3、part_002.mp3等
 
 print(f"从长文本生成了 {len(responses)} 个音频文件")
+
+# 或合并为单个音频（非WAV格式需要安装pydub）
+combined = client.generate_speech_long_text(
+    text="超过4096字符的很长文本...",
+    voice=Voice.ALLOY,
+    response_format=AudioFormat.MP3,
+    max_length=2000,
+    preserve_words=True,
+    auto_combine=True
+)
+
+combined.save_to_file("long_text")  # 保存为 long_text.mp3
 ```
 
 #### OpenAI Python客户端兼容性
@@ -244,6 +256,9 @@ ttsfm --text-file input.txt --output speech.mp3
 # 自定义服务URL
 ttsfm "你好，世界！" --url http://localhost:7000 --output hello.mp3
 
+# 自动合并长文本并生成单个音频
+ttsfm --text-file article.txt --output article.mp3 --split-long-text --auto-combine
+
 # 列出可用声音
 ttsfm --list-voices
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
 version_scheme = "no-guess-dev"
 local_scheme = "no-local-version"
 
-fallback_version = "3.3.0-alpha2"
+fallback_version = "3.3.0-alpha3"
 [tool.setuptools]
 packages = ["ttsfm"]
 
diff --git a/tests/test_clients.py b/tests/test_clients.py
@@ -0,0 +1,75 @@
+import pytest
+
+from ttsfm.client import TTSClient
+from ttsfm.async_client import AsyncTTSClient
+from ttsfm.models import TTSResponse, AudioFormat
+
+
+def _mk_response(data: bytes) -> TTSResponse:
+    return TTSResponse(
+        audio_data=data,
+        content_type="audio/mpeg",
+        format=AudioFormat.MP3,
+        size=len(data),
+    )
+
+
+def test_sync_long_text_auto_combine(monkeypatch):
+    client = TTSClient()
+
+    monkeypatch.setattr(
+        client,
+        "generate_speech_batch",
+        lambda **kwargs: [_mk_response(b"one"), _mk_response(b"two")],
+    )
+
+    combined_flag = {}
+
+    def fake_combine(responses):
+        combined_flag["called"] = True
+        return _mk_response(b"onetwo")
+
+    monkeypatch.setattr("ttsfm.client.combine_responses", fake_combine)
+
+    result = client.generate_speech_long_text(
+        text="dummy",
+        auto_combine=True,
+    )
+
+    assert combined_flag["called"] is True
+    assert isinstance(result, TTSResponse)
+    assert result.audio_data == b"onetwo"
+
+
+def test_sync_long_text_returns_list_without_auto_combine(monkeypatch):
+    client = TTSClient()
+
+    responses = [_mk_response(b"one")]
+    monkeypatch.setattr(client, "generate_speech_batch", lambda **_: responses)
+
+    result = client.generate_speech_long_text(text="dummy", auto_combine=False)
+
+    assert result is responses
+
+
+@pytest.mark.asyncio
+async def test_async_long_text_auto_combine(monkeypatch):
+    client = AsyncTTSClient()
+
+    async def fake_batch(**kwargs):
+        return [_mk_response(b"one"), _mk_response(b"two")]
+
+    monkeypatch.setattr(client, "generate_speech_batch", fake_batch)
+
+    def fake_combine(responses):
+        return _mk_response(b"onetwo")
+
+    monkeypatch.setattr("ttsfm.async_client.combine_responses", fake_combine)
+
+    result = await client.generate_speech_long_text(
+        text="dummy",
+        auto_combine=True,
+    )
+
+    assert isinstance(result, TTSResponse)
+    assert result.audio_data == b"onetwo"
diff --git a/tests/test_web_app.py b/tests/test_web_app.py
@@ -46,7 +46,9 @@ def test_voices_endpoint_returns_data(monkeypatch):
 
 
 def test_combine_audio_chunks_uses_format_hint(monkeypatch):
-    module = load_web_app(monkeypatch, REQUIRE_API_KEY='false', TTSFM_API_KEY=None)
+    load_web_app(monkeypatch, REQUIRE_API_KEY='false', TTSFM_API_KEY=None)
+
+    from ttsfm import audio as audio_module
 
     class DummySegment:
         def __init__(self, tag: str):
@@ -77,9 +79,9 @@ def from_file(cls, buffer, format: str):
             cls.formats.append(format)
             return DummySegment(format)
 
-    monkeypatch.setattr(module, "AudioSegment", DummyAudioSegment)
+    monkeypatch.setattr(audio_module, "AudioSegment", DummyAudioSegment)
 
-    output = module.combine_audio_chunks([b"one", b"two"], "opus")
+    output = audio_module.combine_audio_chunks([b"one", b"two"], "opus")
 
     assert output == b"opus:opusopus"
     assert DummyAudioSegment.formats == ["opus", "opus"]
diff --git a/ttsfm-web/app.py b/ttsfm-web/app.py
@@ -29,13 +29,15 @@
 # Import the TTSFM package
 try:
     from ttsfm import TTSClient, Voice, AudioFormat, TTSException
+    from ttsfm.audio import combine_audio_chunks
     from ttsfm.exceptions import APIException, NetworkException, ValidationException
     from ttsfm.utils import validate_text_length, split_text_by_length
 except ImportError:
     # Fallback for development when package is not installed
     import sys
     sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
     from ttsfm import TTSClient, Voice, AudioFormat, TTSException
+    from ttsfm.audio import combine_audio_chunks
     from ttsfm.exceptions import APIException, NetworkException, ValidationException
     from ttsfm.utils import validate_text_length, split_text_by_length
 
@@ -265,96 +267,6 @@ def _chunk_bytes(data: bytes, chunk_size: int = 64 * 1024) -> Iterator[bytes]:
         yield bytes(view[offset:offset + chunk_size])
 
 
-try:
-    from pydub import AudioSegment  # type: ignore
-except ImportError:  # pragma: no cover - optional dependency
-    AudioSegment = None  # type: ignore
-
-
-def combine_audio_chunks(audio_chunks: List[bytes], format_type: str = "mp3") -> bytes:
-    """Combine multiple audio chunks into a single audio file."""
-    if not audio_chunks:
-        return b''
-
-    fmt = format_type.lower()
-
-    if AudioSegment is None and fmt != "wav":
-        raise RuntimeError("Combining audio requires pydub for non-WAV formats. Install ttsfm[web].")
-
-    try:
-        if AudioSegment is None:
-            return _simple_wav_concatenation(audio_chunks)
-
-        audio_segments = []
-        for chunk in audio_chunks:
-            buffer = io.BytesIO(chunk)
-            if fmt == "mp3":
-                segment = AudioSegment.from_mp3(buffer)
-            elif fmt == "wav":
-                segment = AudioSegment.from_wav(buffer)
-            else:
-                # OPUS/FLAC/AAC/PCM all require an explicit decoder hint
-                segment = AudioSegment.from_file(buffer, format=fmt)
-            audio_segments.append(segment)
-
-        combined = audio_segments[0]
-        for segment in audio_segments[1:]:
-            combined += segment
-
-        output_buffer = io.BytesIO()
-        export_format = fmt if fmt in {"mp3", "wav", "aac", "flac", "opus", "pcm"} else "wav"
-        combined.export(output_buffer, format=export_format)
-        return output_buffer.getvalue()
-    except Exception as exc:
-        logger.error("Error combining audio chunks: %s", exc)
-        raise
-
-def _simple_wav_concatenation(wav_chunks: List[bytes]) -> bytes:
-    """
-    Simple WAV file concatenation without external dependencies.
-    This is a basic implementation that works for simple WAV files.
-    """
-    if not wav_chunks:
-        return b''
-
-    if len(wav_chunks) == 1:
-        return wav_chunks[0]
-
-    try:
-        # For WAV files, we can do a simple concatenation by:
-        # 1. Taking the header from the first file
-        # 2. Concatenating all the audio data
-        # 3. Updating the file size in the header
-
-        first_wav = wav_chunks[0]
-        if len(first_wav) < 44:  # WAV header is at least 44 bytes
-            return b''.join(wav_chunks)
-
-        # Extract header from first file (first 44 bytes)
-        header = bytearray(first_wav[:44])
-
-        # Collect all audio data (skip headers for subsequent files)
-        audio_data = first_wav[44:]  # Audio data from first file
-
-        for wav_chunk in wav_chunks[1:]:
-            if len(wav_chunk) > 44:
-                audio_data += wav_chunk[44:]  # Skip header, append audio data
-
-        # Update file size in header (bytes 4-7)
-        total_size = len(header) + len(audio_data) - 8
-        header[4:8] = total_size.to_bytes(4, byteorder='little')
-
-        # Update data chunk size in header (bytes 40-43)
-        data_size = len(audio_data)
-        header[40:44] = data_size.to_bytes(4, byteorder='little')
-
-        return bytes(header) + audio_data
-
-    except Exception as e:
-        logger.error(f"Error in simple WAV concatenation: {e}")
-        # Ultimate fallback
-        return b''.join(wav_chunks)
-
 def _is_safe_url(target: Optional[str]) -> bool:
     """Validate that a target URL is safe for redirection.
 
@@ -787,7 +699,7 @@ def get_status():
         return jsonify({
             "status": "online",
             "tts_service": "openai.fm (free)",
-            "package_version": "3.3.0-alpha2",
+            "package_version": "3.3.0-alpha3",
             "timestamp": datetime.now().isoformat()
         })
         
@@ -805,7 +717,7 @@ def health_check():
     """Simple health check endpoint."""
     return jsonify({
         "status": "healthy",
-        "package_version": "3.3.0-alpha2",
+        "package_version": "3.3.0-alpha3",
         "timestamp": datetime.now().isoformat()
     })
 
@@ -1109,4 +1021,3 @@ def internal_error(error):
     finally:
         logger.info("TTSFM web application shut down")
 
-
diff --git a/ttsfm-web/templates/base.html b/ttsfm-web/templates/base.html
@@ -88,7 +88,7 @@
             <a class="navbar-brand" href="{{ url_for('index') }}">
                 <i class="fas fa-microphone-alt me-2"></i>
                 <span class="fw-bold">TTSFM</span>
-                <span class="badge bg-primary ms-2 small">v3.3.0-alpha2</span>
+                <span class="badge bg-primary ms-2 small">v3.3.0-alpha3</span>
             </a>
 
             <button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
                     <div class="d-flex align-items-center">
                         <i class="fas fa-microphone-alt me-2 text-primary"></i>
                         <strong class="text-dark">TTSFM</strong>
-                        <span class="ms-2 text-muted">v3.3.0-alpha2</span>
+                        <span class="ms-2 text-muted">v3.3.0-alpha3</span>
                     </div>
                 </div>
                 <div class="col-md-6 text-md-end">
diff --git a/ttsfm/__init__.py b/ttsfm/__init__.py
@@ -57,12 +57,13 @@
     QuotaExceededException,
     AudioProcessingException
 )
+from .audio import combine_audio_chunks, combine_responses
 from .utils import (
     validate_text_length,
     split_text_by_length
 )
 
-__version__ = "3.3.0-alpha2"
+__version__ = "3.3.0-alpha3"
 __author__ = "dbcccc"
 __email__ = "120614547+dbccccccc@users.noreply.github.com"
 __description__ = "Text-to-Speech API Client with OpenAI compatibility"
@@ -124,7 +125,7 @@ def generate_speech(text: str, voice: str = "alloy", **kwargs) -> bytes:
 
     return default_client.generate_speech(text=text, voice=voice, **kwargs)
 
-def generate_speech_long_text(text: str, voice: str = "alloy", **kwargs) -> list:
+def generate_speech_long_text(text: str, voice: str = "alloy", **kwargs):
     """
     Convenience function to generate speech from long text using the default client.
 
@@ -183,6 +184,8 @@ def generate_speech_long_text(text: str, voice: str = "alloy", **kwargs) -> list
     # Utility functions
     "validate_text_length",
     "split_text_by_length",
+    "combine_audio_chunks",
+    "combine_responses",
     
     # Package metadata
     "__version__",
diff --git a/ttsfm/async_client.py b/ttsfm/async_client.py
diff --git a/ttsfm/audio.py b/ttsfm/audio.py
diff --git a/ttsfm/cli.py b/ttsfm/cli.py
diff --git a/ttsfm/client.py b/ttsfm/client.py