-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaugment.py
More file actions
113 lines (86 loc) · 3.52 KB
/
augment.py
File metadata and controls
113 lines (86 loc) · 3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Audio augmentation pipeline for wake word training.
Fast numpy-based augmentations (no librosa phase vocoder).
"""
import numpy as np
from scipy.signal import resample_poly, butter, sosfilt
from scipy.interpolate import interp1d
def time_stretch(audio, rate):
    """Stretch or compress a 1-D signal in time by linear interpolation.

    The signal is re-sampled at positions ``0, rate, 2*rate, ...`` that fall
    inside the input, so ``rate > 1`` shortens it and ``rate < 1`` lengthens
    it. Because this is plain resampling (no phase vocoder), the pitch moves
    together with the duration.

    Args:
        audio: 1-D array-like of samples.
        rate: Positive step between read positions, in input samples.

    Returns:
        A new float32 array. ``rate == 1`` returns the input unchanged
        (including its final sample); empty input returns an empty array.

    Raises:
        ValueError: If ``rate`` is not strictly positive.
    """
    if not rate > 0:
        raise ValueError("rate must be positive, got {!r}".format(rate))

    samples = np.asarray(audio, dtype=np.float32)
    n = len(samples)
    if n == 0:
        # np.interp refuses an empty set of sample points.
        return samples.copy()

    positions = np.arange(0, n, rate)
    # Index n - 1 is a valid read position; keeping it makes rate == 1 an
    # exact identity instead of silently dropping the last sample.
    positions = positions[positions <= n - 1]
    return np.interp(positions, np.arange(n), samples).astype(np.float32)
def pitch_shift(audio, semitones):
    """Shift pitch by resampling, keeping the output length fixed.

    The clip is resampled so that its content plays ``2 ** (semitones / 12)``
    times faster, which scales every frequency by that ratio. The result is
    then centre-trimmed or centre-padded with zeros to the original number of
    samples.

    This is a fast approximation: tempo changes together with pitch (there is
    no phase vocoder). Resampling and then interpolating back to the original
    length would simply undo the resampling and leave the pitch untouched,
    which is why the length is restored by padding/trimming instead.

    Args:
        audio: 1-D array-like of samples.
        semitones: Shift in semitones; positive raises the pitch. The speed
            ratio is quantised to 1 % steps.

    Returns:
        A new float32 array with the same length as ``audio``.
    """
    samples = np.asarray(audio, dtype=np.float32)
    target_len = len(samples)

    ratio = 2.0 ** (semitones / 12.0)
    # Express the speed ratio as the rational speed_pct / 100 for resample_poly.
    speed_pct = max(int(round(ratio * 100)), 1)
    if target_len == 0 or speed_pct == 100:
        return samples.copy()

    # Fewer samples for the same content == higher pitch at a fixed sample
    # rate, so the sample count is scaled by 100 / speed_pct.
    shifted = resample_poly(samples, 100, speed_pct).astype(np.float32)

    excess = len(shifted) - target_len
    if excess >= 0:
        start = excess // 2
        return shifted[start:start + target_len]
    pad_total = -excess
    pad_left = pad_total // 2
    return np.pad(shifted, (pad_left, pad_total - pad_left))
def add_noise(audio, snr_db):
    """Add white Gaussian noise at a given signal-to-noise ratio.

    Args:
        audio: Array-like of samples (any shape, integer or float).
        snr_db: Desired ratio of signal power to noise power, in dB.

    Returns:
        A new float32 array of the same shape as ``audio``. Noise is drawn
        from the global ``np.random`` state.
    """
    # Work in float64 so that squaring integer PCM (e.g. int16) cannot wrap
    # around and produce a bogus, possibly negative, power estimate.
    samples = np.asarray(audio, dtype=np.float64)
    signal_power = np.mean(np.square(samples)) if samples.size else 0.0
    noise_power = signal_power / (10 ** (snr_db / 10))
    # The floor keeps a tiny amount of noise on (near-)silent input.
    noise_std = np.sqrt(max(noise_power, 1e-10))
    # Match the full input shape so multi-channel input works too.
    noise = np.random.standard_normal(samples.shape) * noise_std
    return (samples + noise).astype(np.float32)
def random_gain(audio, min_db=-6, max_db=6):
    """Scale the signal by a gain drawn uniformly, in dB, between the bounds."""
    gain_db = np.random.uniform(min_db, max_db)
    # Convert the dB figure to a linear amplitude factor.
    amplitude = 10 ** (gain_db / 20)
    return audio * amplitude
def time_shift(audio, max_frac=0.1):
    """Slide the signal by a random offset, filling the vacated part with zeros.

    The offset is drawn uniformly from ``[-max_frac, max_frac)`` of the signal
    length; positive values delay the signal, negative values advance it.
    """
    fraction = np.random.uniform(-max_frac, max_frac)
    offset = int(fraction * len(audio))

    if offset == 0:
        # Nothing to move: hand back an independent copy.
        return audio.copy()

    shifted = np.zeros_like(audio)
    if offset > 0:
        shifted[offset:] = audio[:-offset]
    else:
        shifted[:offset] = audio[-offset:]
    return shifted
def speed_perturb(audio, factor):
    """Change playback speed (pitch and tempo together) by linear resampling.

    The output has ``int(len(audio) / factor)`` samples spread evenly from the
    first to the last input sample, so ``factor > 1`` speeds the clip up and
    ``factor < 1`` slows it down.

    Args:
        audio: 1-D array-like of samples.
        factor: Positive speed factor.

    Returns:
        A new float32 array; empty when the input is empty or the computed
        output length is zero.

    Raises:
        ValueError: If ``factor`` is not strictly positive.
    """
    if not factor > 0:
        raise ValueError("factor must be positive, got {!r}".format(factor))

    samples = np.asarray(audio, dtype=np.float32)
    n = len(samples)
    out_len = int(n / factor)
    if n == 0 or out_len == 0:
        # np.interp refuses an empty set of sample points.
        return np.zeros(0, dtype=np.float32)

    # linspace already stays within [0, n - 1], so no clipping is required,
    # and np.interp copes with a single-sample input.
    positions = np.linspace(0, n - 1, out_len)
    return np.interp(positions, np.arange(n), samples).astype(np.float32)
def low_pass(audio, sr, cutoff_hz):
    """Apply a 4th-order Butterworth low-pass, e.g. to mimic muffled speech.

    ``sr`` is the sampling rate passed to the filter design and ``cutoff_hz``
    the corner frequency; the filtered signal is returned as float32.
    """
    sections = butter(N=4, Wn=cutoff_hz, btype='low', output='sos', fs=sr)
    filtered = sosfilt(sections, audio)
    return filtered.astype(np.float32)
def augment_one(audio, sr):
    """Run one sample through a randomly selected chain of augmentations.

    Each stage is applied independently with its own probability, in a fixed
    order: time stretch, pitch shift, speed perturbation, additive noise,
    gain, time shift. ``sr`` is accepted but not used by any stage here.
    The output length may differ from the input length.
    """
    result = audio.copy()

    # (probability, transform) pairs in application order. Parameters are
    # drawn inside each transform so they are only sampled when the stage
    # actually fires.
    stages = (
        (0.5, lambda x: time_stretch(x, np.random.uniform(0.9, 1.1))),
        (0.5, lambda x: pitch_shift(x, np.random.uniform(-2, 2))),
        (0.3, lambda x: speed_perturb(x, np.random.uniform(0.9, 1.1))),
        (0.7, lambda x: add_noise(x, np.random.uniform(10, 30))),
        (0.7, lambda x: random_gain(x, min_db=-6, max_db=6)),
        (0.3, lambda x: time_shift(x, max_frac=0.1)),
    )

    for probability, transform in stages:
        if np.random.random() < probability:
            result = transform(result)
    return result
def pad_or_trim(audio, target_len):
    """Force ``audio`` to ``target_len`` samples, keeping it centred.

    Longer input is cropped around its middle; shorter input is zero-padded
    on both sides, with any odd leftover sample going to the right.
    """
    excess = len(audio) - target_len
    if excess >= 0:
        offset = excess // 2
        return audio[offset:offset + target_len]

    left, odd = divmod(-excess, 2)
    right = left + odd
    return np.pad(audio, (left, right))