-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstt_check.py
More file actions
74 lines (56 loc) · 2.07 KB
/
stt_check.py
File metadata and controls
74 lines (56 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Transcribe wake word samples using pywhispercpp for quality checking.
Requires the stt venv (pywhispercpp). If ../stt/.venv exists, the script re-executes itself under that interpreter; otherwise it runs with the current Python.
Usage: python stt_check.py [files or directories...]
Examples:
python stt_check.py train_data/positive/pos1_16k.wav
python stt_check.py train_data/negative/
python stt_check.py train_data/positive/ train_data/negative/
"""
import os
import subprocess
import sys
import wave
from pathlib import Path
_DIR = Path(__file__).resolve().parent
_STT_VENV = _DIR.parent / "stt" / ".venv"
_STT_PYTHON = _STT_VENV / "bin" / "python"
# Re-run this script under the sibling stt venv's interpreter when that venv
# exists and we are not already running inside it.
#
# The check compares sys.prefix (the venv directory while running inside a
# venv) instead of resolved interpreter paths: a venv's bin/python is normally
# a symlink to the base interpreter, so resolving both paths makes the base
# interpreter look identical to the venv's and the re-exec would be skipped.
# After the exec, sys.prefix equals the venv directory, so this runs only once.
if _STT_PYTHON.exists() and Path(sys.prefix).resolve() != _STT_VENV.resolve():
    os.execv(str(_STT_PYTHON), [str(_STT_PYTHON)] + sys.argv)
import numpy as np
from pywhispercpp.model import Model
# Text handed to the transcriber as its initial prompt (see main()).
# Presumably meant to bias decoding toward the "Eliezer" spelling while also
# listing similar-sounding alternatives -- confirm with the author.
# Sentences are joined with single spaces and the result keeps a trailing space.
INITIAL_PROMPT = " ".join(
    (
        "Hey Eliezer.",
        "Hello Eliezer.",
        "Eliezer Yudkowsky is an AI researcher.",
        "Hey everyone.",
        "Hey Google.",
        "Hey Siri.",
        "Hey Alexa.",
        "Eliezer, hi.",
        "Good morning Eliezer.",
        "Yo Eliezer.",
        "Hey Elizabeth.",
        "Hey Oliver.",
        "Hey Alicia.",
        "The name is Eliezer, sometimes spelled Eliezer.",
    )
) + " "
def load_wav(path):
    """Read a 16-bit PCM WAV file as mono float32 samples in [-1.0, 1.0).

    Multi-channel files are downmixed by averaging the channels of each
    frame, so the returned array always has one sample per frame and
    ``len(audio) / sr`` is the duration in seconds.

    Args:
        path: Location of the WAV file (``str`` or ``Path``).

    Returns:
        A ``(audio, sr)`` tuple where ``audio`` is a 1-D ``float32`` NumPy
        array and ``sr`` is the sample rate in Hz as stored in the file.

    Raises:
        ValueError: If the file does not use 16-bit samples.
        wave.Error: If the ``wave`` module cannot parse the file.
    """
    with wave.open(str(path), "rb") as wf:
        sr = wf.getframerate()
        channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        frames = wf.readframes(wf.getnframes())

    if sample_width != 2:
        # Reinterpreting 8/24/32-bit data as int16 would yield noise.
        raise ValueError(
            f"{path}: expected 16-bit PCM, got {8 * sample_width}-bit samples"
        )

    # Drop a trailing partial frame (truncated file) so decoding and the
    # per-frame reshape below cannot fail on a ragged buffer.
    frame_bytes = sample_width * channels
    usable = len(frames) - len(frames) % frame_bytes

    # WAV PCM is little-endian regardless of the host byte order.
    samples = np.frombuffer(frames[:usable], dtype="<i2")
    audio = samples.astype(np.float32) / 32768.0

    if channels > 1:
        # Samples are interleaved per frame: average each frame's channels.
        audio = audio.reshape(-1, channels).mean(axis=1)
    return audio, sr
def main():
    """Transcribe WAV files and print one result line per file.

    Command-line arguments are files or directories; a directory contributes
    its ``*.wav`` entries in sorted order (non-recursive). With no arguments,
    ``train_data/positive/`` and ``train_data/negative/`` are used.

    Each transcribed file is printed to stdout as path, duration and text.
    Missing paths and unreadable files are reported on stderr and skipped so
    one bad sample does not abort the whole run.

    Exits with status 1 when no WAV file is found or when at least one file
    could not be read.
    """
    args = sys.argv[1:] or ["train_data/positive/", "train_data/negative/"]
    files = []
    for arg in args:
        p = Path(arg)
        if p.is_dir():
            files.extend(sorted(p.glob("*.wav")))
        elif p.is_file():
            files.append(p)
        else:
            print(f"Not found: {arg}", file=sys.stderr)
    if not files:
        print("No WAV files found.", file=sys.stderr)
        sys.exit(1)

    # NOTE(review): whisper.cpp expects 16 kHz input and this script does no
    # resampling, so other rates are flagged instead of silently transcribed.
    expected_sr = 16000

    model = Model("base.en")
    failed = 0
    for f in files:
        try:
            audio, sr = load_wav(f)
        except (wave.Error, EOFError, OSError, ValueError) as exc:
            # Keep going: this is a batch quality check over many samples.
            print(f"Skipping {f}: {exc}", file=sys.stderr)
            failed += 1
            continue
        if sr != expected_sr:
            print(
                f"Warning: {f} is {sr} Hz, expected {expected_sr} Hz; "
                "transcription may be unreliable",
                file=sys.stderr,
            )
        dur = len(audio) / sr
        segments = model.transcribe(audio, initial_prompt=INITIAL_PROMPT)
        text = " ".join(s.text.strip() for s in segments).strip()
        print(f" {str(f):50s} ({dur:.2f}s) {text}")

    if failed:
        # Preserve a non-zero exit status when any sample was unreadable.
        sys.exit(1)


if __name__ == "__main__":
    main()