# ddsp_simplified/feature_extraction.py at master · raraz15/ddsp_simplified · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np

from dsp_utils.spectral_ops import compute_loudness, compute_f0, compute_mfcc, compute_logmel

from utilities import concat_dct, frame_generator

## -------------------------------------------------- Feature Extraction ---------------------------------------


def feature_extractor(audio, sample_rate=16000, model=None, frame_rate=250,
                    f0=True, loudness=True, mfcc=False, log_mel=False,
                    mfcc_nfft=1024, l_nfft=2048, logmel_nfft=2048,
                    conf_threshold=0.0):
    """Compute the selected features for a single piece of audio.

    The returned dict always holds the untouched input under ``'audio'``.
    Each enabled flag adds one more entry:

    * ``f0``       -> ``'f0_hz'``: output of ``compute_f0`` with every value
      whose confidence is below ``conf_threshold`` replaced by 0.0.
    * ``mfcc``     -> ``'mfcc'``: output of ``compute_mfcc``.
    * ``log_mel``  -> ``'log_mel'``: output of ``compute_logmel``.
    * ``loudness`` -> ``'loudness_db'``: output of ``compute_loudness``.

    Args:
        audio: the audio to analyse. The reverb path indexes it with
            ``audio[np.newaxis, :]``, so presumably a 1-D numpy array -
            TODO confirm against callers.
        sample_rate: forwarded to the f0, log-mel and loudness extractors.
        model: optional object; when given and ``model.add_reverb`` is truthy,
            ``model.reverb`` is applied to the audio before loudness extraction.
        frame_rate: forwarded to the f0 and loudness extractors.
        f0, loudness, mfcc, log_mel: switches selecting which features to compute.
        mfcc_nfft: FFT size for the MFCCs; per the original note it should be
            determined by the preprocessing timesteps.
        l_nfft: FFT size for the loudness computation.
        logmel_nfft: FFT size for the log-mel computation.
        conf_threshold: minimum f0 confidence for a value to be kept.

    Returns:
        dict mapping feature names to the computed values.
    """
    extracted = {'audio': audio}

    if f0:
        pitch_hz, pitch_conf = compute_f0(audio, sample_rate, frame_rate, viterbi=True)
        extracted['f0_hz'] = confidence_filter(pitch_hz, pitch_conf, conf_threshold)

    if mfcc:
        # Overlap and FFT size follow the reference code (per the original
        # note, the overlap is the same except for frame size 63).
        mfcc_settings = dict(fft_size=mfcc_nfft,
                             overlap=0.75,
                             mel_bins=128,
                             mfcc_bins=30)
        extracted['mfcc'] = compute_mfcc(audio, **mfcc_settings)

    if log_mel:
        logmel_settings = dict(bins=229,  # 64 was the alternative noted originally
                               fft_size=logmel_nfft,
                               overlap=0.75,
                               pad_end=True,
                               sample_rate=sample_rate)
        extracted['log_mel'] = compute_logmel(audio, **logmel_settings)

    if not loudness:
        return extracted

    # Reverb goes in before measuring loudness so that the loudness curve
    # matches the room acoustics used for timbre transfer.
    loudness_source = audio
    if model is not None and model.add_reverb:
        batched = {"audio_synth": audio[np.newaxis, :]}
        loudness_source = model.reverb(batched)[0]

    extracted['loudness_db'] = compute_loudness(loudness_source,
                                                sample_rate=sample_rate,
                                                frame_rate=frame_rate,
                                                n_fft=l_nfft,
                                                use_tf=False)
    return extracted

def extract_features_from_frames(frames, **kwargs):
    """Run ``feature_extractor`` on each frame and merge the results.

    Args:
        frames: iterable of audio frames, processed in iteration order.
        **kwargs: passed unchanged to every ``feature_extractor`` call.

    Returns:
        The result of ``concat_dct`` applied to the list of per-frame
        feature dicts.
    """
    per_frame_features = []
    for chunk in frames:
        per_frame_features.append(feature_extractor(chunk, **kwargs))
    return concat_dct(per_frame_features)

def process_track(track, sample_rate=16000, audio_length=60, frame_size=64000, **kwargs):
    """Generate frames from a track and extract features for each frame.

    The track is first cut down to at most ``audio_length`` seconds (the
    beginning is kept, the tail is dropped), then split with
    ``frame_generator`` and handed to ``extract_features_from_frames``.

    Args:
        track: audio samples; must support ``len()`` and slicing.
        sample_rate: samples per second. Used to compute the trim length and
            forwarded to the feature extraction so both agree on the rate.
        audio_length: maximum duration, in seconds, kept from the track.
        frame_size: passed to ``frame_generator`` (presumably the frame
            length in samples - confirm against ``utilities``).
        **kwargs: extra options forwarded to ``extract_features_from_frames``.

    Returns:
        Whatever ``extract_features_from_frames`` returns for the frames.
    """
    # int() keeps the slice bound valid when audio_length is fractional.
    max_samples = int(sample_rate * audio_length)
    if len(track) > max_samples:  # drop everything past the limit
        track = track[:max_samples]
    frames = frame_generator(track, frame_size)  # large chunks of audio
    # sample_rate is a named parameter here, so it can never arrive through
    # **kwargs; forward it explicitly or the extractor silently uses its
    # own default rate.
    return extract_features_from_frames(frames, sample_rate=sample_rate, **kwargs)

def confidence_filter(F0, confidence, threshold):
    """
    Zero out pitch values whose confidence falls below the threshold.

    Walks over ``F0`` and keeps each value whose matching entry in
    ``confidence`` (same index) is at least ``threshold``; every other
    position is replaced by 0.0.

    Returns a new list with one entry per element of ``F0``.
    """
    filtered = []
    for position, pitch in enumerate(F0):
        is_reliable = confidence[position] >= threshold
        filtered.append(pitch if is_reliable else 0.0)
    return filtered