Skip to content

Commit 1a36e6f

Browse files
feat(elevenlabs-say): add ElevenLabs say-style TTS CLI (#328)
## What A new uv-packaged Python CLI at `packages/elevenlabs-say/` that mirrors macOS `say`, backed by the ElevenLabs text-to-speech API. Run it with `nix run .#elevenlabs-say`. Behavior: - Text source precedence: positional `TEXT`, then `--file/-f`, then stdin (when stdin is not a TTY). A clear error when no text is available from any source. - Default action plays audio through the speakers with `ffplay` (from `ffmpeg`, put on PATH by the Nix wrapper). `--output/-o PATH` saves the audio instead. - `--voice NAME|ID` resolves a name to an id via `voices.search`, otherwise uses the value as a literal id. Default voice is Rachel (`21m00Tcm4TlvDq8ikWAM`), a premade voice present on every account. - `--model` defaults to `eleven_flash_v2_5`; `--format` defaults to `mp3_44100_128`. - The API key comes from `ELEVENLABS_API_KEY`. If unset, the CLI prints an actionable error and exits non-zero. No embedded key, no silent fallback. ## Why Python ElevenLabs ships an official, typed Python SDK (`elevenlabs`) and has no official Rust SDK, so a thin CLI over the SDK is the lowest-maintenance owner. This repo already has first-class uv packaging through `ix.buildUvApplication` and a worked example (`examples/python-daily-scraper`), but no standalone TS CLI precedent. The only runtime dependency is the ElevenLabs SDK; everything else (`argparse`, `subprocess`, `tempfile`) is stdlib. ## Files - `package.nix`: discovery metadata (`packageSet`/`flake`). - `pyproject.toml`: `elevenlabs>=2.50.0,<3.0.0`, `requires-python = ">=3.13"`, `uv_build` backend, `elevenlabs-say` console script. - `uv.lock`: committed for a pure Nix build. - `src/elevenlabs_say/__init__.py`: the CLI, fully type-annotated for `ty` standard mode. - `default.nix`: `ix.buildUvApplication` then a `runCommand` + `makeWrapper` that puts `ffmpeg` on PATH (the `packages/run` pattern), with a `passthru.tests.printsHelp` smoke test that asserts `--help` exits 0 and prints usage, with no network and no key. 
- `README.md`: task-first setup and usage. ## Example usage ```sh export ELEVENLABS_API_KEY=sk_... nix run .#elevenlabs-say -- "the first move sets everything in motion" echo "hello from index" | nix run .#elevenlabs-say nix run .#elevenlabs-say -- "save me" --output /tmp/out.mp3 nix run .#elevenlabs-say -- "different voice" --voice Adam ``` ## Validation - `nix build .#elevenlabs-say` succeeds, including the default `ty` type check ("All checks passed!") with no type-check knobs needed. - `./result/bin/elevenlabs-say --help` prints usage and exits 0. - `nix build .#elevenlabs-say.tests.printsHelp` succeeds (offline smoke test). - No-key path: `ELEVENLABS_API_KEY` unset prints a clear error and exits 1. - `nix run .#lint` passes for the new files. - Live synth was skipped: no ElevenLabs API key exists in Vaultwarden (`ix-infra`), so there was no key to exercise a real conversion.
1 parent 94ff6f4 commit 1a36e6f

6 files changed

Lines changed: 708 additions & 0 deletions

File tree

packages/elevenlabs-say/README.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# elevenlabs-say
2+
3+
A `say`-style command-line tool that speaks text with the [ElevenLabs](https://elevenlabs.io)
4+
text-to-speech API. It reads text from an argument, a file, or stdin, then plays
5+
the audio through your speakers or writes it to a file.
6+
7+
## Setup
8+
9+
Set your ElevenLabs API key in the environment. The CLI reads it from
10+
`ELEVENLABS_API_KEY` and exits with an error if it is unset.
11+
12+
```sh
13+
export ELEVENLABS_API_KEY=sk_...
14+
```
15+
16+
## Usage
17+
18+
```sh
19+
# Speak a string through the speakers.
20+
nix run .#elevenlabs-say -- "the first move sets everything in motion"
21+
22+
# Speak the contents of a file.
23+
nix run .#elevenlabs-say -- --file notes.txt
24+
25+
# Speak text piped on stdin.
26+
echo "hello from index" | nix run .#elevenlabs-say
27+
28+
# Save audio instead of playing it.
29+
nix run .#elevenlabs-say -- "save me" --output /tmp/out.mp3
30+
31+
# Pick a voice by name or id, and override the model or format.
32+
nix run .#elevenlabs-say -- "different voice" --voice Adam
33+
nix run .#elevenlabs-say -- "slower model" --model eleven_multilingual_v2 --format mp3_44100_192
34+
```
35+
36+
Text source precedence is positional argument, then `--file`, then stdin.
37+
38+
## Defaults
39+
40+
- Voice: Rachel (`21m00Tcm4TlvDq8ikWAM`), a premade voice on every account. A
41+
`--voice` value that matches a voice name is resolved to its id; otherwise it
42+
is used as a literal id.
43+
- Model: `eleven_flash_v2_5`, chosen for low latency.
44+
- Format: `mp3_44100_128`.
45+
46+
## Playback
47+
48+
Playback shells out to `ffplay` from `ffmpeg`, which the Nix wrapper puts on
49+
PATH. `--output` skips playback and writes the audio bytes directly.
50+
51+
## Known limitations
52+
53+
- Playback needs a working audio device. On a headless host use `--output` to
54+
capture the audio instead.
55+
- A `--voice` value is looked up as a voice name before it is treated as an id,
56+
so a voice named exactly like another voice's 20-character id would shadow
57+
that id. ElevenLabs voice ids are opaque tokens, so this does not happen in practice.
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
{
  ix,
  lib,
  pkgs,
}:

let
  fs = lib.fileset;

  # Source tree handed to the uv build: only the project metadata, the lock
  # file, and the Python sources.
  src = fs.toSource {
    root = ./.;
    fileset = fs.unions [
      ./pyproject.toml
      ./src
      ./uv.lock
    ];
  };

  # Shared by the unwrapped application and the PATH wrapper so the two
  # derivations cannot drift apart.
  meta = {
    description = "A say-style ElevenLabs text-to-speech CLI";
    license = lib.licenses.mit;
    mainProgram = "elevenlabs-say";
  };

  # The Python application itself, built from the committed uv.lock.
  unwrapped = ix.buildUvApplication pkgs {
    pname = "elevenlabs-say";
    version = "0.1.0";
    inherit src meta;
    mainProgram = "elevenlabs-say";
    # pydantic-core and websockets ship binary wheels that dlopen libstdc++ at
    # import time on Linux, the same constraint the daily-scraper example handles.
    runtimeLibraryInputs = [ pkgs.stdenv.cc.cc.lib ];
  };

  # Thin wrapper that exposes the same executable with ffmpeg on PATH.
  package =
    pkgs.runCommand "elevenlabs-say"
      {
        nativeBuildInputs = [ pkgs.makeWrapper ];
        strictDeps = true;
        inherit meta;
      }
      ''
        mkdir -p $out/bin
        # ffmpeg supplies ffplay, which playback shells out to. afplay is
        # macOS-only and absent from nixpkgs, so ffplay is the portable choice.
        makeWrapper ${lib.getExe unwrapped} $out/bin/elevenlabs-say \
          --prefix PATH : ${lib.makeBinPath [ pkgs.ffmpeg ]}
      '';

  # Offline smoke test: runs the wrapped binary with --help only.
  printsHelp =
    pkgs.runCommand "elevenlabs-say-prints-help"
      {
        nativeBuildInputs = [ package ];
        strictDeps = true;
      }
      ''
        # No network and no API key: --help must exit 0 and print usage.
        help=$(elevenlabs-say --help)
        case "$help" in
          *"usage: elevenlabs-say"*) ;;
          *)
            echo "elevenlabs-say --help did not print usage" >&2
            printf '%s\n' "$help" >&2
            exit 1
            ;;
        esac
        mkdir -p "$out"
      '';
in
# Attach the smoke test as passthru.tests.printsHelp on the wrapped package.
package.overrideAttrs (old: {
  passthru = (old.passthru or { }) // {
    tests = {
      inherit printsHelp;
    };
  };
})
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Discovery metadata for the elevenlabs-say package.
# NOTE(review): `packageSet` and `flake` presumably opt this package into the
# repository's package set and flake outputs -- confirm against the loader
# that consumes package.nix files.
{
  id = "elevenlabs-say";
  flake = true;
  packageSet = true;
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Package metadata for the elevenlabs-say CLI.
[project]
name = "elevenlabs-say"
version = "0.1.0"
description = "A say-style ElevenLabs text-to-speech CLI"
requires-python = ">=3.13"
# The ElevenLabs SDK is the only runtime dependency.
dependencies = ["elevenlabs>=2.50.0,<3.0.0"]

# Console script: the `elevenlabs-say` command calls `main` in `elevenlabs_say`.
[project.scripts]
elevenlabs-say = "elevenlabs_say:main"

[build-system]
requires = ["uv_build>=0.11.0,<0.12.0"]
build-backend = "uv_build"
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
"""A say-style ElevenLabs text-to-speech CLI.
2+
3+
Reads text from a positional argument, a file, or stdin, synthesizes speech with
4+
the ElevenLabs API, and either plays it through the speakers with ``ffplay`` or
5+
writes the audio to a file. The API key comes from ``ELEVENLABS_API_KEY``; there
6+
is no embedded key and no silent fallback.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import argparse
12+
import os
13+
import subprocess
14+
import sys
15+
import tempfile
16+
from dataclasses import dataclass
17+
from pathlib import Path
18+
19+
from elevenlabs import ElevenLabs
20+
from elevenlabs.core import ApiError
21+
22+
# Default voice: "Rachel", an ElevenLabs premade voice. Premade voices are
# available on every account, which makes this id a safe default for a `say`
# replacement.
# https://elevenlabs.io/docs/api-reference/voices/get
DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"
# Defaults for the --model and --format options.
DEFAULT_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OUTPUT_FORMAT = "mp3_44100_128"

# Name of the environment variable that must hold the API key.
API_KEY_ENV = "ELEVENLABS_API_KEY"
30+
31+
32+
class SayError(Exception):
    """Failure meant for the operator; its message says what to do next.

    ``main`` catches this type, prints the message to stderr, and exits with
    status 1.
    """
34+
35+
36+
@dataclass(frozen=True)
class CliArgs:
    """Immutable bundle of the parsed command-line options."""

    # Text sources. Both may be None, in which case stdin is consulted.
    text: str | None
    file: Path | None
    # Destination file; None means play the audio instead of saving it.
    output: Path | None
    # Voice name or id, model id, and audio format for the synthesis request.
    voice: str
    model: str
    output_format: str
44+
45+
46+
def parse_args(argv: list[str] | None = None) -> CliArgs:
    """Parse the command line into an immutable ``CliArgs``.

    ``argv`` is handed to ``ArgumentParser.parse_args`` unchanged.
    """
    voice_help = (
        "Voice name or id. A value that matches a voice name is resolved to "
        f"its id; otherwise it is used verbatim. Defaults to Rachel ({DEFAULT_VOICE_ID})."
    )

    cli = argparse.ArgumentParser(
        prog="elevenlabs-say",
        description="Synthesize speech with ElevenLabs and play it or save it to a file.",
    )

    # Text sources: an optional positional argument or a file.
    _ = cli.add_argument(
        "text",
        nargs="?",
        default=None,
        help="Text to speak. Omit to read from --file or stdin.",
    )
    _ = cli.add_argument(
        "-f",
        "--file",
        type=Path,
        default=None,
        help="Read text from this file instead of the positional argument.",
    )

    # Destination: a file, or the speakers when omitted.
    _ = cli.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Write audio to this file instead of playing it.",
    )

    # Synthesis parameters.
    _ = cli.add_argument("--voice", default=DEFAULT_VOICE_ID, help=voice_help)
    _ = cli.add_argument(
        "--model",
        default=DEFAULT_MODEL_ID,
        help=f"Model id. Defaults to {DEFAULT_MODEL_ID}.",
    )
    _ = cli.add_argument(
        "--format",
        dest="output_format",
        default=DEFAULT_OUTPUT_FORMAT,
        help=f"Output audio format. Defaults to {DEFAULT_OUTPUT_FORMAT}.",
    )

    parsed = cli.parse_args(argv)
    return CliArgs(
        text=parsed.text,
        file=parsed.file,
        output=parsed.output,
        voice=parsed.voice,
        model=parsed.model,
        output_format=parsed.output_format,
    )
107+
108+
109+
def read_text(args: CliArgs) -> str:
    """Resolve the text to speak: positional arg, then --file, then stdin.

    Stdin is only consulted when it is not a TTY. The chosen text is returned
    with surrounding whitespace stripped.

    Raises:
        SayError: if no source is available, the source cannot be read or
            decoded, or the resolved text is empty after stripping.
    """
    if args.text is not None:
        source = args.text
    elif args.file is not None:
        try:
            source = args.file.read_text(encoding="utf-8")
        except OSError as exc:
            raise SayError(f"cannot read text file {args.file}: {exc}") from exc
        except UnicodeDecodeError as exc:
            # A decode failure is a ValueError, not an OSError, so it needs its
            # own handler to be reported as an operator-facing error.
            raise SayError(
                f"cannot read text file {args.file}: it is not valid UTF-8 ({exc})"
            ) from exc
    elif not sys.stdin.isatty():
        try:
            source = sys.stdin.read()
        except (OSError, UnicodeDecodeError) as exc:
            raise SayError(f"cannot read text from stdin: {exc}") from exc
    else:
        raise SayError(
            "no text to speak: pass TEXT, use --file PATH, or pipe text on stdin"
        )

    text = source.strip()
    if not text:
        raise SayError("no text to speak: the resolved text is empty")
    return text
129+
130+
131+
def make_client() -> ElevenLabs:
    """Return an ElevenLabs client, refusing to proceed without an API key.

    The client is constructed without arguments; presumably the SDK picks the
    key up from the environment itself -- confirm against the SDK.

    Raises:
        SayError: if the API key environment variable is unset or empty.
    """
    api_key = os.environ.get(API_KEY_ENV)
    if api_key:
        return ElevenLabs()
    raise SayError(
        f"{API_KEY_ENV} is not set; export your ElevenLabs API key, "
        f"for example: export {API_KEY_ENV}=sk_..."
    )
138+
139+
140+
def resolve_voice_id(client: ElevenLabs, voice: str) -> str:
    """Treat ``voice`` as a name first; fall back to using it as an id verbatim.

    ElevenLabs voice ids are opaque 20-character tokens, so a human-typed name
    almost never collides with an id. Searching by name keeps the CLI usable with
    friendly voice names while still accepting a raw id.

    The built-in default id is returned without a lookup, so the default
    invocation does not pay for an extra API round-trip before synthesis.

    Raises:
        SayError: if the voice search request fails with an ``ApiError``.
    """
    if voice == DEFAULT_VOICE_ID:
        # Already a known id; nothing to resolve.
        return voice

    try:
        response = client.voices.search(search=voice)
    except ApiError as exc:
        raise SayError(format_api_error("resolve voice", exc)) from exc

    # Name comparison is case-insensitive; fold the requested name once.
    wanted = voice.casefold()
    for candidate in response.voices:
        if candidate.name is not None and candidate.name.casefold() == wanted:
            return candidate.voice_id

    # No name match: use the supplied value as a literal voice id.
    return voice
158+
159+
160+
def synthesize(client: ElevenLabs, text: str, args: CliArgs, voice_id: str) -> bytes:
    """Convert ``text`` to speech and return the complete audio as bytes.

    The chunks yielded by the conversion call are consumed inside the ``try``
    so that an ``ApiError`` raised while iterating is reported the same way as
    one raised by the call itself.

    Raises:
        SayError: if the request or the chunk iteration raises ``ApiError``.
    """
    buffer = bytearray()
    try:
        for chunk in client.text_to_speech.convert(
            voice_id=voice_id,
            text=text,
            model_id=args.model,
            output_format=args.output_format,
        ):
            buffer += chunk
    except ApiError as failure:
        raise SayError(format_api_error("synthesize speech", failure)) from failure
    return bytes(buffer)
171+
172+
173+
def format_api_error(action: str, exc: ApiError) -> str:
    """Build the operator-facing message for a failed API call.

    ``action`` completes the phrase "failed to ...". The HTTP status is
    included only when the error carries one.
    """
    if exc.status_code is None:
        return f"failed to {action}: {exc.body}"
    return f"failed to {action}: ElevenLabs API returned status {exc.status_code}: {exc.body}"
177+
178+
179+
def write_output(audio: bytes, output: Path) -> None:
    """Write ``audio`` to ``output``, replacing any existing content.

    Raises:
        SayError: if the file cannot be opened or written.
    """
    try:
        with output.open("wb") as sink:
            _ = sink.write(audio)
    except OSError as failure:
        raise SayError(f"cannot write audio to {output}: {failure}") from failure
184+
185+
186+
def play(audio: bytes) -> None:
    """Play audio bytes through the speakers with ``ffplay``.

    The bytes are written to a temporary ``.mp3`` file, ``ffplay`` is run on
    that file, and the file is removed afterwards whether writing or playback
    succeeded or not.

    ``ffplay`` is provided by ``ffmpeg``, which the Nix wrapper puts on PATH. It
    is the cross-platform, Nix-pinnable counterpart to macOS ``afplay``.

    NOTE(review): the temporary file always gets an ``.mp3`` suffix regardless
    of the requested ``--format``; headerless formats presumably cannot be
    played this way -- confirm.

    Raises:
        SayError: if the temporary file cannot be written, ``ffplay`` is not on
            PATH, or ``ffplay`` exits with a non-zero status.
    """
    temp_path: Path | None = None
    try:
        try:
            # delete=False keeps the file on disk after the handle is closed,
            # so ffplay can open it by path; cleanup happens in ``finally``.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
                temp_path = Path(handle.name)
                _ = handle.write(audio)
        except OSError as exc:
            raise SayError(f"cannot write temporary audio file: {exc}") from exc

        try:
            completed = subprocess.run(
                [
                    "ffplay",
                    "-nodisp",
                    "-autoexit",
                    "-loglevel",
                    "error",
                    str(temp_path),
                ],
                check=False,
            )
        except FileNotFoundError as exc:
            raise SayError(
                "ffplay was not found on PATH; install ffmpeg to play audio, "
                "or use --output PATH to save the audio instead"
            ) from exc

        if completed.returncode != 0:
            raise SayError(f"ffplay exited with status {completed.returncode}")
    finally:
        # Covers every exit path, including a failed write, so the temporary
        # file never outlives the call.
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)
216+
217+
218+
def run(args: CliArgs) -> None:
    """Execute one invocation: read text, synthesize, then play or save.

    The text is resolved before the client is created, and the voice is
    resolved before synthesis. With ``--output`` the audio is written to that
    path and a confirmation goes to stderr; otherwise it is played.
    """
    text = read_text(args)
    client = make_client()
    audio = synthesize(client, text, args, resolve_voice_id(client, args.voice))

    if args.output is None:
        play(audio)
        return

    write_output(audio, args.output)
    print(f"wrote {args.output}", file=sys.stderr)
229+
230+
231+
def main() -> None:
    """Console-script entry point.

    Parses the command line and runs the request. A ``SayError`` is printed to
    stderr with the program name as prefix and turned into exit status 1.
    Ctrl-C, the usual way to stop playback early, ends the program quietly
    with the conventional status 130 instead of a traceback.
    """
    args = parse_args()
    try:
        run(args)
    except SayError as exc:
        print(f"elevenlabs-say: {exc}", file=sys.stderr)
        raise SystemExit(1) from exc
    except KeyboardInterrupt:
        # 128 + SIGINT(2): the shell convention for "terminated by Ctrl-C".
        raise SystemExit(130) from None

0 commit comments

Comments
 (0)