Project-N-E-K-O · rophec · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/.gitignore b/.gitignore
@@ -197,3 +197,10 @@ tests/testbench/docs/dev_note.txt
 
 # yui-origin Live2D 模型源码打包在 assets/yui-origin.tar.gz；build_frontend 解压到此
 /static/yui-origin/
+
+# Local lightweight TTS model files are large runtime assets. Keep directory
+# placeholders in git, but do not commit downloaded model weights / voice data files.
+local_server/local_tts_server/kokoro_models/*
+!local_server/local_tts_server/kokoro_models/.gitkeep
+.uv-cache-local/
+.venv-local-tts/
diff --git a/local_server/local_tts_server/README.md b/local_server/local_tts_server/README.md
@@ -0,0 +1,106 @@
+# NEKO Local Lightweight TTS
+
+This service is the first-phase local TTS bridge for NEKO. It intentionally
+keeps the same WebSocket protocol expected by `local_cosyvoice_worker`, so NEKO
+can use it without changing the main TTS pipeline.
+
+## Protocol
+
+Endpoint:
+
+```text
+ws://127.0.0.1:50000/v1/audio/speech/stream
+```
+
+Client messages:
+
+```json
+{"voice":"kokoro:zf_001","speed":1.0}
+{"text":"Hello from NEKO."}
+{"text":"Local Kokoro TTS test."}
+{"event":"end"}
+```
+
+Server response:
+
+```text
+binary PCM s16le chunks, mono, 22050 Hz
+```
+
+NEKO's existing `local_cosyvoice_worker` then resamples this audio to 48 kHz.
+
+## Start
+
+From the repository root:
+
+```bash
+uv run python local_server/local_tts_server/server.py --host 127.0.0.1 --port 50000
+```
+
+In NEKO settings, use the existing local custom TTS path:
+
+```text
+ws://127.0.0.1:50000
+```
+
+Enable custom TTS and set the URL to a `ws://` or `wss://` address. WebSocket
+custom TTS URLs route to `local_cosyvoice_worker` directly; the HTTP GPT-SoVITS
+path is only used for `http://` or `https://` custom TTS URLs.
+
+## Voice Selector
+
+The service accepts a model prefix in `voice`:
+
+```text
+kokoro:<voice>
+melotts:<voice>
+melo:<voice>
+chattts:<voice>
+```
+
+If the prefix is missing, `LOCAL_TTS_DEFAULT_MODEL` is used. The default is
+`kokoro`.
+
+## Kokoro / MeloTTS / ChatTTS
+
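+These are exposed through command adapters for now. Each command template uses
+the `{text_file}`, `{out_file}`, `{voice}` and `{speed}` placeholders shown in
+the examples below and must write a 16-bit WAV file to `{out_file}`.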
+
+The Kokoro launcher defaults to the Chinese-enhanced
+`hexgrad/Kokoro-82M-v1.1-zh` model and voice `zf_001`.
+It expects a local model directory under
+`local_server/local_tts_server/kokoro_models/Kokoro-82M-v1.1-zh` or an explicit
+`LOCAL_TTS_KOKORO_MODEL_DIR`. Hugging Face auto-download is disabled by the
+launcher so model provenance stays user-managed.
+
+Kokoro voices are static `.pt` files under the selected model directory. This
+server exposes `/v1/voices` for discovery and the shared
+`/v1/audio/speech/stream` WebSocket for synthesis, but it does not implement
+CosyVoice-style speaker cloning or registration. Calls to
+`/v1/speakers/register` return a clear unsupported response instead of creating
+local voice metadata.
+
+### Windows examples
+
+```bat
+set LOCAL_TTS_KOKORO_MODEL_DIR=F:\models\Kokoro-82M-v1.1-zh
+set LOCAL_TTS_KOKORO_REPO_ID=hexgrad/Kokoro-82M-v1.1-zh
+set LOCAL_TTS_KOKORO_DEFAULT_VOICE=zf_001
+set LOCAL_TTS_KOKORO_CMD=python F:\tts_wrappers\kokoro_cli.py "{text_file}" "{out_file}" "{voice}" {speed}
+set LOCAL_TTS_MELOTTS_CMD=python F:\tts_wrappers\melotts_cli.py --text-file "{text_file}" --out "{out_file}" --voice "{voice}" --speed {speed}
+set LOCAL_TTS_CHATTTS_CMD=python F:\tts_wrappers\chattts_cli.py --text-file "{text_file}" --out "{out_file}" --voice "{voice}" --speed {speed}
+```
+
+### Linux / macOS examples
+
+```bash
+export LOCAL_TTS_KOKORO_MODEL_DIR=/models/Kokoro-82M-v1.1-zh
+export LOCAL_TTS_KOKORO_REPO_ID=hexgrad/Kokoro-82M-v1.1-zh
+export LOCAL_TTS_KOKORO_DEFAULT_VOICE=zf_001
+export LOCAL_TTS_KOKORO_CMD='python /opt/tts_wrappers/kokoro_cli.py "{text_file}" "{out_file}" "{voice}" {speed}'
+export LOCAL_TTS_MELOTTS_CMD='python /opt/tts_wrappers/melotts_cli.py --text-file "{text_file}" --out "{out_file}" --voice "{voice}" --speed {speed}'
+export LOCAL_TTS_CHATTTS_CMD='python /opt/tts_wrappers/chattts_cli.py --text-file "{text_file}" --out "{out_file}" --voice "{voice}" --speed {speed}'
+```
+
+ChatTTS is AGPL-3.0. Keep it as an optional external backend until the product
+licensing story is settled.
diff --git a/local_server/local_tts_server/kokoro_cli.py b/local_server/local_tts_server/kokoro_cli.py
@@ -0,0 +1,172 @@
+"""Minimal Kokoro v1.1-zh CLI wrapper for local_tts_server.
+
+Usage:
+    python kokoro_cli.py <text_file> <out_file> <voice> <speed>
+
+Reads text from <text_file>, synthesizes with kokoro, writes WAV to <out_file>.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import wave
+from pathlib import Path
+
+import numpy as np
+
+from local_tts_profiles import (
+    DEFAULT_KOKORO_REPO_ID,
+    DEFAULT_KOKORO_VOICE,
+    available_kokoro_voices,
+    find_kokoro_model_file,
+    resolve_kokoro_model_dir,
+    resolve_kokoro_voice_file,
+)
+
+# Frame rate (Hz) written into the header of the generated WAV file.
+SAMPLE_RATE = 24000
+
+
+def _audio_from_result(result):
+    """Pull the audio payload out of one pipeline result.
+
+    An object exposing an ``audio`` attribute yields that attribute, a
+    non-empty tuple yields its last element, and anything else yields ``None``.
+    """
+    if not hasattr(result, "audio"):
+        is_filled_tuple = isinstance(result, tuple) and len(result) > 0
+        return result[-1] if is_filled_tuple else None
+    return result.audio
+
+
+def _infer_lang_code(voice: str) -> str:
+    """Map a Kokoro voice name to the pipeline's single-letter language code.
+
+    Names beginning with ``a`` select ``"a"`` (en-us) and names beginning with
+    ``b`` select ``"b"`` (en-gb). Every other name, including the empty
+    string, selects ``"z"`` (zh).
+    """
+    # Only the first character decides: prefixes such as "af"/"am" or
+    # "bf"/"bm" already start with the matching single letter.
+    initial = voice[:1]
+    return initial if initial in ("a", "b") else "z"
+
+
+def _speed_callable(base_speed: float):
+    """Build a length-dependent speed function for the zh pipeline.
+
+    The returned callable takes a phoneme count and returns the speed to use.
+    The factor is 1.0 up to 83 phonemes, shrinks linearly between 83 and 183,
+    and is 0.8 from 183 on. It is multiplied by the base speed and the result
+    is never below 0.5. A non-positive ``base_speed`` is treated as 1.0. This
+    mitigates rushed long Chinese phoneme sequences in v1.1-zh.
+    """
+    multiplier = base_speed if base_speed > 0 else 1.0
+
+    def speed_by_len(len_ps: int) -> float:
+        if len_ps >= 183:
+            factor = 0.8
+        elif len_ps > 83:
+            factor = 1.0 - (len_ps - 83) / 500.0
+        else:
+            factor = 1.0
+        return max(0.5, factor * multiplier)
+
+    return speed_by_len
+
+
+def synthesize(text_path: str, out_path: str, voice: str, speed: float) -> int:
+    """Synthesize the text in ``text_path`` with Kokoro and write a WAV file.
+
+    Args:
+        text_path: UTF-8 text file holding the utterance.
+        out_path: Destination of the mono 16-bit WAV file.
+        voice: Kokoro voice name; blank selects the configured default voice.
+        speed: Base speed multiplier handed to the pipeline.
+
+    Returns:
+        ``0`` on success. ``1`` when the kokoro dependencies are missing, the
+        text is empty, the local model directory is incomplete, or the
+        pipeline produced no audio. File-system errors from reading the text
+        or writing the WAV are not caught here.
+    """
+    try:
+        import torch
+        from kokoro import KModel, KPipeline
+    except ImportError:
+        print(
+            'kokoro v1.1-zh deps missing. Run: uv pip install "kokoro>=0.8.2" "misaki[zh]>=0.8.2"',
+            file=sys.stderr,
+        )
+        return 1
+
+    text = Path(text_path).read_text(encoding="utf-8").strip()
+    if not text:
+        print("Empty text file", file=sys.stderr)
+        return 1
+
+    configured_dir = os.getenv("LOCAL_TTS_KOKORO_MODEL_DIR", "").strip()
+    model_dir = resolve_kokoro_model_dir()
+    if configured_dir and model_dir is None:
+        # An explicitly configured directory that does not resolve would
+        # otherwise be dropped without any hint about the bad path.
+        print(
+            f"LOCAL_TTS_KOKORO_MODEL_DIR is set to '{configured_dir}' but is not a directory; ignoring it.",
+            file=sys.stderr,
+        )
+    repo_id = os.getenv("LOCAL_TTS_KOKORO_REPO_ID", DEFAULT_KOKORO_REPO_ID).strip() or DEFAULT_KOKORO_REPO_ID
+    # Resolve the default once, stripped and with a hard fallback, so a blank
+    # or whitespace-only env value can never become the voice name.
+    default_voice = os.getenv("LOCAL_TTS_KOKORO_DEFAULT_VOICE", DEFAULT_KOKORO_VOICE).strip() or DEFAULT_KOKORO_VOICE
+    voice = (voice or "").strip() or default_voice
+    available_voices = set(available_kokoro_voices(model_dir))
+    if available_voices and voice not in available_voices:
+        fallback_voice = default_voice
+        if fallback_voice not in available_voices:
+            # Deterministic last resort: the first voice in sorted order.
+            fallback_voice = min(available_voices)
+        print(
+            f"Kokoro voice '{voice}' not found in local model dir; falling back to '{fallback_voice}'.",
+            file=sys.stderr,
+        )
+        voice = fallback_voice
+    pipeline_voice = resolve_kokoro_voice_file(voice, model_dir)
+    lang = _infer_lang_code(voice)
+    device = os.getenv("LOCAL_TTS_KOKORO_DEVICE", "").strip()
+    if not device:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if model_dir:
+        config_path = model_dir / "config.json"
+        model_path = find_kokoro_model_file(model_dir)
+        if not config_path.is_file() or model_path is None:
+            print(
+                f"Invalid LOCAL_TTS_KOKORO_MODEL_DIR: {model_dir} "
+                "(expected config.json and a .pth model file)",
+                file=sys.stderr,
+            )
+            return 1
+        model = KModel(repo_id=repo_id, config=str(config_path), model=str(model_path)).to(device).eval()
+    else:
+        # No usable local directory: let KModel resolve the repo id itself.
+        model = KModel(repo_id=repo_id).to(device).eval()
+
+    en_pipeline = None
+    en_callable = None
+    if lang == "z":
+        # Model-less en-us pipeline; only its phonemes are read below.
+        en_pipeline = KPipeline(lang_code="a", repo_id=repo_id, model=False)
+
+        def en_callable(text_part: str):
+            # Fixed phonemes for two names, otherwise defer to en_pipeline.
+            if text_part == "Kokoro":
+                return "kˈOkəɹO"
+            if text_part == "Sol":
+                return "sˈOl"
+            return next(en_pipeline(text_part)).phonemes
+
+    pipeline = KPipeline(
+        lang_code=lang,
+        repo_id=repo_id,
+        model=model,
+        en_callable=en_callable,
+    )
+    # The zh pipeline gets a length-dependent speed function; others a float.
+    effective_speed = _speed_callable(speed) if lang == "z" else speed
+    generator = pipeline(text, voice=pipeline_voice, speed=effective_speed)
+
+    chunks: list[np.ndarray] = [
+        np.asarray(audio, dtype=np.float32)
+        for audio in map(_audio_from_result, generator)
+        if audio is not None
+    ]
+
+    if not chunks:
+        print("No audio generated", file=sys.stderr)
+        return 1
+
+    # Clamp to [-1, 1] before scaling so the int16 conversion cannot wrap.
+    pcm = np.concatenate(chunks)
+    pcm = np.clip(pcm, -1.0, 1.0)
+    pcm_int16 = (pcm * 32767.0).astype(np.int16)
+
+    with wave.open(out_path, "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(SAMPLE_RATE)
+        wf.writeframes(pcm_int16.tobytes())
+
+    print(
+        f"Wrote {out_path}: {len(pcm_int16)} samples @ {SAMPLE_RATE} Hz "
+        f"repo={repo_id} model_dir={model_dir or '<hf-cache>'} voice={voice} device={device}"
+    )
+    return 0
+
+
+def main() -> int:
+    """Parse the four positional CLI arguments and run the synthesis.
+
+    Returns the status code produced by ``synthesize``.
+    """
+    cli = argparse.ArgumentParser(description="Kokoro CLI wrapper for local_tts")
+    for positional in ("text_file", "out_file", "voice"):
+        cli.add_argument(positional)
+    cli.add_argument("speed", type=float)
+    opts = cli.parse_args()
+    return synthesize(opts.text_file, opts.out_file, opts.voice, opts.speed)
+
+
+# Script entry point: exit with the status code returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/local_server/local_tts_server/kokoro_models/.gitkeep b/local_server/local_tts_server/kokoro_models/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/local_server/local_tts_server/local_tts_profiles.py b/local_server/local_tts_server/local_tts_profiles.py
@@ -0,0 +1,88 @@
+"""Provider-bound helpers for the lightweight local TTS server."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+
+# Backend used when LOCAL_TTS_DEFAULT_MODEL is unset or blank.
+DEFAULT_MODEL = "kokoro"
+# Voice name substituted when a selector carries no voice part.
+DEFAULT_VOICE = "default"
+# Fallback Kokoro repo id and voice name.
+DEFAULT_KOKORO_REPO_ID = "hexgrad/Kokoro-82M-v1.1-zh"
+DEFAULT_KOKORO_VOICE = "zf_001"
+# Sub-directory of kokoro_models/ used as the default model directory.
+KOKORO_ZH_MODEL_DIR_NAME = "Kokoro-82M-v1.1-zh"
+# Weights file looked up first inside a model directory.
+KOKORO_MODEL_FILE_NAME = "kokoro-v1_1-zh.pth"
+
+
+@dataclass(frozen=True)
+class VoiceSpec:
+    """Immutable (model, voice) pair produced from a voice selector string.
+
+    Selector forms and what they mean:
+    - "kokoro:zf_001"   -> model "kokoro", voice "zf_001"
+    - "melotts:zh"      -> model "melotts", voice "zh"
+    - "chattts:default" -> model "chattts", voice "default"
+    - "zf_001"          -> model taken from LOCAL_TTS_DEFAULT_MODEL
+    """
+
+    # Backend name, e.g. "kokoro".
+    model: str
+    # Voice identifier passed on to that backend.
+    voice: str
+
+
+def local_tts_server_dir() -> Path:
+    """Return the absolute directory that contains this module."""
+    module_file = Path(__file__).resolve()
+    return module_file.parent
+
+
+def kokoro_models_root() -> Path:
+    """Return the ``kokoro_models`` directory located beside this module."""
+    server_dir = local_tts_server_dir()
+    return server_dir.joinpath("kokoro_models")
+
+
+def default_kokoro_model_dir() -> Path:
+    """Return the default Kokoro model directory under ``kokoro_models``."""
+    models_root = kokoro_models_root()
+    return models_root.joinpath(KOKORO_ZH_MODEL_DIR_NAME)
+
+
+def parse_local_tts_voice(raw_voice: str) -> VoiceSpec:
+    """Split a ``model:voice`` selector into a ``VoiceSpec``.
+
+    Without a colon the whole selector is the voice and the model comes from
+    ``LOCAL_TTS_DEFAULT_MODEL`` (falling back to ``DEFAULT_MODEL``). A blank
+    model prefix uses the same default model, and a blank voice part becomes
+    ``DEFAULT_VOICE``. Only the first colon separates model from voice.
+    """
+    fallback_model = os.getenv("LOCAL_TTS_DEFAULT_MODEL", DEFAULT_MODEL).strip().lower() or DEFAULT_MODEL
+    selector = (raw_voice or "").strip()
+    prefix, colon, remainder = selector.partition(":")
+    if not colon:
+        return VoiceSpec(fallback_model, selector or DEFAULT_VOICE)
+    return VoiceSpec(
+        prefix.strip().lower() or fallback_model,
+        remainder.strip() or DEFAULT_VOICE,
+    )
+
+
+def resolve_kokoro_model_dir() -> Path | None:
+    """Locate the Kokoro model directory, or return ``None`` if there is none.
+
+    A non-blank ``LOCAL_TTS_KOKORO_MODEL_DIR`` is the only candidate when it is
+    set; otherwise the default directory next to this module is the candidate.
+    The candidate is returned only if it is an existing directory.
+    """
+    configured = os.getenv("LOCAL_TTS_KOKORO_MODEL_DIR", "").strip()
+    candidate = Path(configured) if configured else default_kokoro_model_dir()
+    if candidate.is_dir():
+        return candidate
+    return None
+
+
+def find_kokoro_model_file(model_dir: Path | None) -> Path | None:
+    """Pick the model weights file inside ``model_dir``.
+
+    The file named ``KOKORO_MODEL_FILE_NAME`` wins when present; otherwise the
+    first ``*.pth`` entry in sorted order is used. Returns ``None`` when
+    ``model_dir`` is ``None`` or contains no ``.pth`` entry.
+    """
+    if model_dir is None:
+        return None
+    named_weights = model_dir / KOKORO_MODEL_FILE_NAME
+    if named_weights.is_file():
+        return named_weights
+    return min(model_dir.glob("*.pth"), default=None)
+
+
+def available_kokoro_voices(model_dir: Path | None = None) -> list[str]:
+    """List the voice names found under ``<model_dir>/voices``.
+
+    When ``model_dir`` is ``None`` the directory from
+    ``resolve_kokoro_model_dir()`` is used. Each ``*.pt`` file contributes its
+    stem. The result is sorted and is empty when no model directory or no
+    ``voices`` sub-directory exists.
+    """
+    root = resolve_kokoro_model_dir() if model_dir is None else model_dir
+    if root is None:
+        return []
+    voices_dir = root / "voices"
+    if not voices_dir.is_dir():
+        return []
+    names = [entry.stem for entry in voices_dir.glob("*.pt") if entry.is_file()]
+    names.sort()
+    return names
+
+
+def resolve_kokoro_voice_file(voice: str, model_dir: Path | None = None) -> str:
+    """Turn a voice name into a local ``.pt`` path when such a file exists.
+
+    ``voice`` is returned unchanged when no ``model_dir`` is given, when it
+    already ends in ``.pt``, or when ``<model_dir>/voices/<voice>.pt`` is not
+    a file. Otherwise the path of that file is returned as a string.
+    """
+    if model_dir is not None and not voice.endswith(".pt"):
+        candidate = model_dir / "voices" / f"{voice}.pt"
+        if candidate.is_file():
+            return str(candidate)
+    return voice