diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..b5c46ee20 Binary files /dev/null and b/.DS_Store differ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..30e36e9e1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.formatting.provider": "yapf" +} diff --git a/asteroid/engine/system_kinect_wsj.py b/asteroid/engine/system_kinect_wsj.py new file mode 100644 index 000000000..abe7cf7a2 --- /dev/null +++ b/asteroid/engine/system_kinect_wsj.py @@ -0,0 +1,238 @@ +import torch +import pytorch_lightning as pl +from torch.optim.lr_scheduler import ReduceLROnPlateau +from asteroid_filterbanks.transforms import mag + +from ..utils import flatten_dict + + +class System(pl.LightningModule): + """Base class for deep learning systems. + Contains a model, an optimizer, a loss function, training and validation + dataloaders and learning rate scheduler. + + Note that by default, any PyTorch-Lightning hooks are *not* passed to the model. + If you want to use Lightning hooks, add the hooks to a subclass:: + + class MySystem(System): + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + return self.model.on_train_batch_start(batch, batch_idx, dataloader_idx) + + Args: + model (torch.nn.Module): Instance of model. + optimizer (torch.optim.Optimizer): Instance or list of optimizers. + loss_func (callable): Loss function with signature + (est_targets, targets). + train_loader (torch.utils.data.DataLoader): Training dataloader. + val_loader (torch.utils.data.DataLoader): Validation dataloader. + scheduler (torch.optim.lr_scheduler._LRScheduler): Instance, or list + of learning rate schedulers. Also supports dict or list of dict as + ``{"interval": "step", "scheduler": sched}`` where ``interval=="step"`` + for step-wise schedulers and ``interval=="epoch"`` for classical ones. + config: Anything to be saved with the checkpoints during training. + The config dictionary to re-instantiate the run for example. + + .. note:: By default, ``training_step`` (used by ``pytorch-lightning`` in the + training loop) and ``validation_step`` (used for the validation loop) + share ``common_step``. If you want different behavior for the training + loop and the validation loop, overwrite both ``training_step`` and + ``validation_step`` instead. + + For more info on its methods, properties and hooks, have a look at lightning's docs: + https://pytorch-lightning.readthedocs.io/en/stable/lightning_module.html#lightningmodule-api + """ + + default_monitor: str = "val_loss" + + def __init__( + self, + model, + optimizer, + loss_func, + train_loader, + val_loader=None, + scheduler=None, + config=None, + mask_mixture=True, + ): + super().__init__() + self.model = model + self.optimizer = optimizer + self.loss_func = loss_func + self.mask_mixture = mask_mixture + self.train_loader = train_loader + self.val_loader = val_loader + self.scheduler = scheduler + self.config = {} if config is None else config + # Save lightning's AttributeDict under self.hparams + self.save_hyperparameters(self.config_to_hparams(self.config)) + + def forward(self, *args, **kwargs): + """Applies forward pass of the model. + + Returns: + :class:`torch.Tensor` + """ + return self.model(*args, **kwargs) + + def common_step(self, batch, batch_nb, train=True): + """Common forward step between training and validation. + + The function of this method is to unpack the data given by the loader, + forward the batch through the model and compute the loss. 
+ Pytorch-lightning handles all the rest. + + Args: + batch: the object returned by the loader (a list of torch.Tensor + in most cases) but can be something else. + batch_nb (int): The number of the batch in the epoch. + train (bool): Whether in training mode. Needed only if the training + and validation steps are fundamentally different, otherwise, + pytorch-lightning handles the usual differences. + + Returns: + :class:`torch.Tensor` : The loss value on this batch. + + .. note:: + This is typically the method to overwrite when subclassing + ``System``. If the training and validation steps are somehow + different (except for ``loss.backward()`` and ``optimzer.step()``), + the argument ``train`` can be used to switch behavior. + Otherwise, ``training_step`` and ``validation_step`` can be overwriten. + """ + + inputs, targets, masks = batch + # Take the first channels + inputs = inputs[..., 0] + targets = targets[..., 0] + est_targets = self(inputs) + loss = self.loss_func(est_targets, targets) + return loss + ''' + inputs, targets, masks = self.unpack_data(batch) + embeddings, est_masks = self(inputs) + spec = mag(self.model.encoder(inputs.unsqueeze(1))) + if self.mask_mixture: + est_masks = est_masks * spec.unsqueeze(1) + masks = masks * spec.unsqueeze(1) + loss, loss_dic = self.loss_func( + embeddings, targets, est_src=est_masks, target_src=masks, mix_spec=spec + ) + return loss + ''' + + def training_step(self, batch, batch_nb): + """Pass data through the model and compute the loss. + + Backprop is **not** performed (meaning PL will do it for you). + + Args: + batch: the object returned by the loader (a list of torch.Tensor + in most cases) but can be something else. + batch_nb (int): The number of the batch in the epoch. + + Returns: + torch.Tensor, the value of the loss. + """ + + loss = self.common_step(batch, batch_nb, train=True) + self.log("loss", loss, logger=True) + return loss + + def validation_step(self, batch, batch_nb): + """Need to overwrite PL validation_step to do validation. + + Args: + batch: the object returned by the loader (a list of torch.Tensor + in most cases) but can be something else. + batch_nb (int): The number of the batch in the epoch. 
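+
+        .. note:: The ``val_loss`` logged here is what ``on_validation_epoch_end``
+            reports as ``hp_metric`` and what the ``ModelCheckpoint`` and
+            ``EarlyStopping`` callbacks of the recipes monitor.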
+ """ + loss = self.common_step(batch, batch_nb, train=False) + self.log("val_loss", loss, on_epoch=True, prog_bar=True) + + def on_validation_epoch_end(self): + """Log hp_metric to tensorboard for hparams selection.""" + hp_metric = self.trainer.callback_metrics.get("val_loss", None) + if hp_metric is not None: + self.trainer.logger.log_metrics({"hp_metric": hp_metric}, + step=self.trainer.global_step) + + def configure_optimizers(self): + """Initialize optimizers, batch-wise and epoch-wise schedulers.""" + if self.scheduler is None: + return self.optimizer + + if not isinstance(self.scheduler, (list, tuple)): + self.scheduler = [self.scheduler] # support multiple schedulers + + epoch_schedulers = [] + for sched in self.scheduler: + if not isinstance(sched, dict): + if isinstance(sched, ReduceLROnPlateau): + sched = { + "scheduler": sched, + "monitor": self.default_monitor + } + epoch_schedulers.append(sched) + else: + sched.setdefault("monitor", self.default_monitor) + sched.setdefault("frequency", 1) + # Backward compat + if sched["interval"] == "batch": + sched["interval"] = "step" + assert sched["interval"] in [ + "epoch", + "step", + ], "Scheduler interval should be either step or epoch" + epoch_schedulers.append(sched) + return [self.optimizer], epoch_schedulers + + def train_dataloader(self): + """Training dataloader""" + return self.train_loader + + def val_dataloader(self): + """Validation dataloader""" + return self.val_loader + + def on_save_checkpoint(self, checkpoint): + """Overwrite if you want to save more things in the checkpoint.""" + checkpoint["training_config"] = self.config + return checkpoint + + def unpack_data(self, batch, EPS=1e-8): + mix, sources, noise = batch + # Take only the first channel + mix = mix[..., 0] + sources = sources[..., 0] + noise = noise[..., 0] + noise = noise.unsqueeze(1) + # Compute magnitude spectrograms and IRM + src_mag_spec = mag(self.model.encoder(sources)) + noise_mag_spec = mag(self.model.encoder(noise)) + noise_mag_spec = noise_mag_spec.unsqueeze(1) + real_mask = src_mag_spec / (noise_mag_spec + + src_mag_spec.sum(1, keepdim=True) + EPS) + # Get the src idx having the maximum energy + binary_mask = real_mask.argmax(1) + return mix, binary_mask, real_mask + + @staticmethod + def config_to_hparams(dic): + """Sanitizes the config dict to be handled correctly by torch + SummaryWriter. It flatten the config dict, converts ``None`` to + ``"None"`` and any list and tuple into torch.Tensors. + + Args: + dic (dict): Dictionary to be transformed. + + Returns: + dict: Transformed dictionary. 
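+
+        Example (illustrative)::
+
+            conf = {"optim": {"lr": 1e-3, "betas": [0.9, 0.999]}, "sched": None}
+            hp = System.config_to_hparams(conf)
+            # Nested keys are flattened by ``flatten_dict`` (e.g. "optim_lr"),
+            # ``None`` becomes the string "None" and the list becomes a tensor.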
+ """ + dic = flatten_dict(dic) + for k, v in dic.items(): + if v is None: + dic[k] = str(v) + elif isinstance(v, (list, tuple)): + dic[k] = torch.tensor(v) + return dic diff --git a/egs/.DS_Store b/egs/.DS_Store new file mode 100644 index 000000000..6ae4c945b Binary files /dev/null and b/egs/.DS_Store differ diff --git a/egs/kinect-wsj/.DS_Store b/egs/kinect-wsj/.DS_Store new file mode 100644 index 000000000..d3824d52e Binary files /dev/null and b/egs/kinect-wsj/.DS_Store differ diff --git a/egs/kinect-wsj/ConvTasNet/.DS_Store b/egs/kinect-wsj/ConvTasNet/.DS_Store new file mode 100644 index 000000000..05500633d Binary files /dev/null and b/egs/kinect-wsj/ConvTasNet/.DS_Store differ diff --git a/egs/kinect-wsj/ConvTasNet/README.md b/egs/kinect-wsj/ConvTasNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/ConvTasNet/eval.py b/egs/kinect-wsj/ConvTasNet/eval.py new file mode 100644 index 000000000..65bf703f0 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/eval.py @@ -0,0 +1,143 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import ConvTasNet +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates +from asteroid.data import KinectWsjMixDataset + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument("--n_src", type=int, default=2) +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = ConvTasNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the indexes of sentences to save. + eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. 
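+        # Kinect-WSJ items are multi-channel (channels last); only the first
+        # channel is kept below. With ``return_est=True`` the PIT wrapper also
+        # returns the estimates reordered to best match the reference sources.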
+ mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. + all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/ConvTasNet/local/conf.yml b/egs/kinect-wsj/ConvTasNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/ConvTasNet/local/convert_sphere2wav.sh 
b/egs/kinect-wsj/ConvTasNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/ConvTasNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/ConvTasNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/ConvTasNet/run.sh b/egs/kinect-wsj/ConvTasNet/run.sh new file mode 100755 index 000000000..4afa3fec4 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/run.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. 
+storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. +python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config +n_blocks=8 +n_repeats=3 +mask_act=relu +# Training config +epochs=200 +batch_size=32 +num_workers=8 +half_lr=yes +early_stop=yes +# Optim config +optimizer=adam +lr=0.001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 +segment=3 + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=wav${sr_string}k/$mode +dumpdir=data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_convtasnet_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py \ + \ + \ + \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --n_src $n_src \ + --sample_rate $sample_rate \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay #--n_blocks $n_blocks \ + #--n_repeats $n_repeats \ + #--mask_act $mask_act \ + #--task $task \ + #--segment $segment + --exp_dir $expdir | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/ConvTasNet" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4 : Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/ConvTasNet/train.py b/egs/kinect-wsj/ConvTasNet/train.py new file mode 100644 index 000000000..79ef3c06e --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/train.py @@ -0,0 +1,127 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import ConvTasNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr + +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + exp_dir = conf["main_args"]["exp_dir"] + # Define Dataloader + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + + model = ConvTasNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. 
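+    # The conf.yml written here is re-read by eval.py (e.g. for the sample
+    # rate) and is also stored with the checkpoints through System's config.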
+ os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + #to_save.update(train_set.get_infos()) + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/ConvTasNet/utils/parse_options.sh b/egs/kinect-wsj/ConvTasNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. 
+# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. \ No newline at end of file diff --git a/egs/kinect-wsj/ConvTasNet/utils/prepare_python_env.sh b/egs/kinect-wsj/ConvTasNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/ConvTasNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. 
utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file diff --git a/egs/kinect-wsj/DCCRNet/.DS_Store b/egs/kinect-wsj/DCCRNet/.DS_Store new file mode 100644 index 000000000..da30f0f34 Binary files /dev/null and b/egs/kinect-wsj/DCCRNet/.DS_Store differ diff --git a/egs/kinect-wsj/DCCRNet/README.md b/egs/kinect-wsj/DCCRNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/DCCRNet/eval.py b/egs/kinect-wsj/DCCRNet/eval.py new file mode 100644 index 000000000..401b4b59f --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/eval.py @@ -0,0 +1,143 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import DCCRNet +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates +from asteroid.data import KinectWsjMixDataset + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument("--n_src", type=int, default=2) +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = DCCRNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the indexes of sentences to save. + eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. 
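+        # The item is moved to the model device, reduced to its first channel,
+        # and reshaped to (1, 1, time) via mix[None, None] before separation.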
+ mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. + all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/DCCRNet/local/conf.yml b/egs/kinect-wsj/DCCRNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/DCCRNet/local/convert_sphere2wav.sh 
b/egs/kinect-wsj/DCCRNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/DCCRNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/DCCRNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/DCCRNet/run.sh b/egs/kinect-wsj/DCCRNet/run.sh new file mode 100644 index 000000000..9a7e62b32 --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/run.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. 
+storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. +python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config + +# Training config +epochs=200 +batch_size=12 +num_workers=4 +half_lr=yes +early_stop=yes +# Optim config +optimizer=adam +lr=0.001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=${n_src}speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_dccrnet_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py --exp_dir $expdir \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --n_src $n_src | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/DCCRNet" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4 : Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/DCCRNet/train.py b/egs/kinect-wsj/DCCRNet/train.py new file mode 100644 index 000000000..7aeaa175d --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/train.py @@ -0,0 +1,124 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import DCCRNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + model = DCCRNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. + exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. 
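+    # pairwise_neg_sisdr builds the (n_src, n_src) matrix of negative SI-SDR
+    # values and PITLossWrapper(pit_from="pw_mtx") picks the best
+    # source/estimate permutation for each element of the batch.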
+ loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + #to_save.update(train_set.get_infos()) + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/DCCRNet/utils/parse_options.sh b/egs/kinect-wsj/DCCRNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". 
parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. \ No newline at end of file diff --git a/egs/kinect-wsj/DCCRNet/utils/prepare_python_env.sh b/egs/kinect-wsj/DCCRNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/DCCRNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. 
utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file diff --git a/egs/kinect-wsj/DCUNet/README.md b/egs/kinect-wsj/DCUNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/DCUNet/eval.py b/egs/kinect-wsj/DCUNet/eval.py new file mode 100644 index 000000000..e4bb0fbc0 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/eval.py @@ -0,0 +1,142 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.data import KinectWsjMixDataset +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import DCUNet +from asteroid.models import save_publishable +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = DCUNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the indexes of sentences to save. + eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. 
+ mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. + all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/DCUNet/local/conf.yml b/egs/kinect-wsj/DCUNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/DCUNet/local/convert_sphere2wav.sh 
b/egs/kinect-wsj/DCUNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/DCUNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/DCUNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/DCUNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/DCUNet/run.sh b/egs/kinect-wsj/DCUNet/run.sh new file mode 100644 index 000000000..0ad797b0d --- /dev/null +++ b/egs/kinect-wsj/DCUNet/run.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. 
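For reference, `local/preprocess_kinect_wsj.py` above writes one JSON list of `(wav_path, n_samples)` pairs per condition; a small sketch of reading one back (the directory and the example file name are hypothetical):

```python
import json

# Hypothetical location: depends on the --out_dir passed in stage 2 of run.sh.
with open("data/2speakers/wav16k/max/tr/mix.json") as f:
    file_infos = json.load(f)

# Each entry is [wav_path, num_samples], e.g.
# ["/.../tr/mix/011a0101_1.234_022c0212_-1.234.wav", 103680]
print(len(file_infos), file_infos[0])
```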
+storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. +python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config + +# Training config +epochs=200 +batch_size=4 +num_workers=4 +half_lr=yes +early_stop=yes +# Optim config +optimizer=adam +lr=0.001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=${n_src}speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_dcunet_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py --exp_dir $expdir \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --n_src $n_src | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/DCUNet" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4: Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/DCUNet/train.py b/egs/kinect-wsj/DCUNet/train.py new file mode 100644 index 000000000..30d559667 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/train.py @@ -0,0 +1,124 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import DCUNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + + model = DCUNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. + exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. 
+ loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/DCUNet/utils/parse_options.sh b/egs/kinect-wsj/DCUNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). 
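To make the `prepare_parser_from_dict` / `parse_args_as_dict` comments in train.py above concrete, a small sketch; the YAML content is made up and only a couple of keys are shown:

```python
import argparse
import yaml
from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict

def_conf = yaml.safe_load("optim:\n  lr: 0.001\ntraining:\n  batch_size: 4\n")
parser = argparse.ArgumentParser()
parser.add_argument("--exp_dir", default="exp/tmp")
parser = prepare_parser_from_dict(def_conf, parser=parser)
# Every leaf of the YAML is now a CLI flag (--lr, --batch_size) with its YAML
# value as default, grouped under the top-level key.
arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True)
# arg_dic is hierarchical again, e.g.
# {"optim": {"lr": 0.001}, "training": {"batch_size": 4}, "main_args": {"exp_dir": "exp/tmp"}}
# Running the script with `--lr 0.0005` would override arg_dic["optim"]["lr"].
```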
+# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. \ No newline at end of file diff --git a/egs/kinect-wsj/DCUNet/utils/prepare_python_env.sh b/egs/kinect-wsj/DCUNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/DCUNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. 
utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file diff --git a/egs/kinect-wsj/DPRNNTasNet/.DS_Store b/egs/kinect-wsj/DPRNNTasNet/.DS_Store new file mode 100644 index 000000000..b315e05a6 Binary files /dev/null and b/egs/kinect-wsj/DPRNNTasNet/.DS_Store differ diff --git a/egs/kinect-wsj/DPRNNTasNet/README.md b/egs/kinect-wsj/DPRNNTasNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/DPRNNTasNet/eval.py b/egs/kinect-wsj/DPRNNTasNet/eval.py new file mode 100644 index 000000000..b44c27eb8 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/eval.py @@ -0,0 +1,143 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import DPRNNTasNet +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates +from asteroid.data import KinectWsjMixDataset + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument("--n_src", type=int, default=2) +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = DPRNNTasNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the indexes of sentences to save. + eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. 
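For orientation, a sketch of what a KinectWsjMixDataset item contains, judging from how it is unpacked and indexed just below; the test directory is hypothetical:

```python
from asteroid.data import KinectWsjMixDataset

# Hypothetical path: the tt folder produced by stage 2 of run.sh.
test_set = KinectWsjMixDataset("data/2speakers/wav16k/max/tt", n_src=2, segment=None)
mix, sources, noise = test_set[0]
# mix is (time, channels) and sources is (n_src, time, channels); the loop below
# keeps only the first Kinect channel via mix[..., 0] and sources[..., 0].
print(mix.shape, sources.shape, noise.shape)
```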
+ mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. + all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/DPRNNTasNet/local/conf.yml b/egs/kinect-wsj/DPRNNTasNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/DPRNNTasNet/local/convert_sphere2wav.sh 
b/egs/kinect-wsj/DPRNNTasNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/DPRNNTasNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/DPRNNTasNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/DPRNNTasNet/run.sh b/egs/kinect-wsj/DPRNNTasNet/run.sh new file mode 100644 index 000000000..e79e590a3 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/run.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. 
+storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. +python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config + +# Training config +epochs=200 +batch_size=64 +num_workers=8 +half_lr=yes +early_stop=yes +# Optim config +optimizer=rmsprop +lr=0.0001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 +#segment=1 +#task=enh_single # one of 'enh_single', 'enh_both', 'sep_clean', 'sep_noisy' + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=${n_src}speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_dprnntasnet_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py --exp_dir $expdir \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --n_src $n_src | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/DPRNNTasNet" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4 : Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/DPRNNTasNet/train.py b/egs/kinect-wsj/DPRNNTasNet/train.py new file mode 100644 index 000000000..a4b6ebcdb --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/train.py @@ -0,0 +1,124 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import DPRNNTasNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + model = DPRNNTasNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. 
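A sketch of the reload path this enables later on: conf.yml restores the run configuration, and best_model.pth, written at the end of training, is what eval.py feeds to from_pretrained (the experiment path below is hypothetical):

```python
import yaml
from asteroid.models import DPRNNTasNet

exp_dir = "exp/train_dprnntasnet_xxx"  # hypothetical run directory
with open(f"{exp_dir}/conf.yml") as f:
    conf = yaml.safe_load(f)           # the dict dumped by train.py
model = DPRNNTasNet.from_pretrained(f"{exp_dir}/best_model.pth")
```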
+ exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + #to_save.update(train_set.get_infos()) + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/DPRNNTasNet/utils/parse_options.sh b/egs/kinect-wsj/DPRNNTasNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. 
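A minimal sketch of a script that sources this helper; the variable names and defaults are illustrative:

```bash
#!/bin/bash
# Defaults; any of them can be overridden as --stage N, --cmd "..." etc.
stage=0
cmd="run.pl"
help_message="Usage: $0 [--stage N] [--cmd CMD]"

. utils/parse_options.sh || exit 1

echo "stage=$stage cmd=$cmd"
# ./myscript.sh --stage 3 --cmd "queue.pl -sync y"   -> stage=3, cmd="queue.pl -sync y"
# ./myscript.sh --help                               -> prints $help_message and exits
```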
\ No newline at end of file diff --git a/egs/kinect-wsj/DPRNNTasNet/utils/prepare_python_env.sh b/egs/kinect-wsj/DPRNNTasNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/DPRNNTasNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file diff --git a/egs/kinect-wsj/DPTNet/.DS_Store b/egs/kinect-wsj/DPTNet/.DS_Store new file mode 100644 index 000000000..da30f0f34 Binary files /dev/null and b/egs/kinect-wsj/DPTNet/.DS_Store differ diff --git a/egs/kinect-wsj/DPTNet/README.md b/egs/kinect-wsj/DPTNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/DPTNet/eval.py b/egs/kinect-wsj/DPTNet/eval.py new file mode 100644 index 000000000..0a1a923f8 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/eval.py @@ -0,0 +1,143 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import DPTNet +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates +from asteroid.data import KinectWsjMixDataset + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument("--n_src", type=int, default=2) +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = DPTNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the indexes of sentences to save. 
+ eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. + mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. 
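The summary step just below relies on get_metrics also scoring the unprocessed mixture under an `input_` prefix, so the `_imp` values measure improvement over the mixture; a toy illustration with made-up numbers:

```python
import pandas as pd

# Two hypothetical utterances: separated vs. unprocessed-mixture SI-SDR.
all_metrics_df = pd.DataFrame(
    [{"si_sdr": 11.2, "input_si_sdr": 0.5},
     {"si_sdr": 9.8, "input_si_sdr": -1.1}]
)
final_results = {
    "si_sdr": all_metrics_df["si_sdr"].mean(),                                         # 10.5 dB
    "si_sdr_imp": (all_metrics_df["si_sdr"] - all_metrics_df["input_si_sdr"]).mean(),  # 10.8 dB gained
}
```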
+ all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/DPTNet/local/conf.yml b/egs/kinect-wsj/DPTNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/DPTNet/local/convert_sphere2wav.sh b/egs/kinect-wsj/DPTNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? 
if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/DPTNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/DPTNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/DPTNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/DPTNet/run.sh b/egs/kinect-wsj/DPTNet/run.sh new file mode 100644 index 000000000..b420b77d2 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/run.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. +storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. 
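For example, assuming the environment was prepared under $HOME/tools (the path is purely illustrative), python_path would point at the miniconda interpreter that prepare_python_env.sh installs there:

```bash
./utils/prepare_python_env.sh --install_dir $HOME/tools
python_path=$HOME/tools/miniconda3/bin/python
```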
+python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config + +# Training config +epochs=200 +batch_size=12 +num_workers=4 +half_lr=yes +early_stop=yes +# Optim config +optimizer=adam +lr=0.001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=${n_src}speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_dptnet_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py --exp_dir $expdir \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --n_src $n_src | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/DPTNet" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4: Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/DPTNet/train.py b/egs/kinect-wsj/DPTNet/train.py new file mode 100644 index 000000000..8d4753dff --- /dev/null +++ b/egs/kinect-wsj/DPTNet/train.py @@ -0,0 +1,124 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import DPTNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + model = DPTNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. + exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. 
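+    # PITLossWrapper with pit_from="pw_mtx" expects the wrapped loss to return a
+    # pairwise matrix of shape (batch, n_src, n_src); pairwise_neg_sisdr does, and
+    # the wrapper picks the source permutation with the lowest mean negative SI-SDR.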
+ loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + #to_save.update(train_set.get_infos()) + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/DPTNet/utils/parse_options.sh b/egs/kinect-wsj/DPTNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". 
parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. \ No newline at end of file diff --git a/egs/kinect-wsj/DPTNet/utils/prepare_python_env.sh b/egs/kinect-wsj/DPTNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/DPTNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. 
utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/.DS_Store b/egs/kinect-wsj/Multi-Decoder-DPRNN/.DS_Store new file mode 100644 index 000000000..bfe85bc65 Binary files /dev/null and b/egs/kinect-wsj/Multi-Decoder-DPRNN/.DS_Store differ diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/eval.py b/egs/kinect-wsj/Multi-Decoder-DPRNN/eval.py new file mode 100644 index 000000000..fd8d93f43 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/eval.py @@ -0,0 +1,138 @@ +""" +Author: Joseph(Junzhe) Zhu, 2021/5. Email: josefzhu@stanford.edu / junzhe.joseph.zhu@gmail.com +For the original code for the paper[1], please refer to https://github.com/JunzheJosephZhu/MultiDecoder-DPRNN +Demo Page: https://junzhejosephzhu.github.io/Multi-Decoder-DPRNN/ +Multi-Decoder DPRNN is a method for source separation when the number of speakers is unknown. +Our contribution is using multiple output heads, with each head modelling a distinct number of source outputs. +In addition, we design a selector network which determines which output head to use, i.e. estimates the number of sources. +The "DPRNN" part of the architecture is orthogonal to our contribution, and can be replaced with any other separator, e.g. Conv/LSTM-TasNet. +References: + [1] "Multi-Decoder DPRNN: High Accuracy Source Counting and Separation", + Junzhe Zhu, Raymond Yeh, Mark Hasegawa-Johnson. 
https://arxiv.org/abs/2011.12022 +""" +from metrics import Penalized_PIT_Wrapper, pairwise_neg_sisdr_loss +import os +import json +import yaml +import argparse +import random +import torch +from tqdm import tqdm +import pandas as pd +import soundfile as sf +from pprint import pprint + +from asteroid.utils import tensors_to_device +from asteroid.metrics import get_metrics + +from model import load_best_model, make_model_and_optimizer +from asteroid.data import KinectWsjMixDataset + + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--test_dir", type=str, required=True, help="Test directory including the json files" +) +parser.add_argument( + "--use_gpu", type=int, default=0, help="Whether to use the GPU for model execution" +) +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument( + "--n_save_ex", type=int, default=50, help="Number of audio examples to save, -1 means all" +) + + +def main(conf): + best_model_path = os.path.join(conf["exp_dir"], "best_model.pth") + if not os.path.exists(best_model_path): + # make pth from checkpoint + model = load_best_model( + conf["train_conf"], conf["exp_dir"], sample_rate=conf["sample_rate"] + ) + torch.save(model.state_dict(), best_model_path) + else: + model, _ = make_model_and_optimizer(conf["train_conf"], sample_rate=conf["sample_rate"]) + model.eval() + model.load_state_dict(torch.load(best_model_path)) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + + test_set = KinectWsjMixDataset(conf["test_dir"], n_src=conf["n_src"], segment=None) + + # Randomly choose the indexes of sentences to save. + ex_save_dir = os.path.join(conf["exp_dir"], "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. + mix, sources = [ + torch.Tensor(x) for x in tensors_to_device(test_set[idx], device=model_device) + ] + est_sources = model.separate(mix[None]) + p_si_snr = Penalized_PIT_Wrapper(pairwise_neg_sisdr_loss)(est_sources, sources) + utt_metrics = { + "P-Si-SNR": p_si_snr.item(), + "counting_accuracy": float(sources.size(0) == est_sources.size(0)), + } + utt_metrics["mix_path"] = test_set.data[idx][0] + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = est_sources.cpu().data.numpy() + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx + 1), src, conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx + 1), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. 
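+    # Each entry of series_list is a pandas Series holding one utterance's P-SI-SNR,
+    # source counting accuracy and mixture path; they are aggregated into a single CSV below.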
+ all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(conf["exp_dir"], "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in ["P-Si-SNR", "counting_accuracy"]: + final_results[metric_name] = all_metrics_df[metric_name].mean() + print("Overall metrics :") + pprint(final_results) + with open(os.path.join(conf["exp_dir"], "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + if args.task != arg_dic["train_conf"]["data"]["task"]: + print( + "Warning : the task used to test is different than " + "the one from training, be sure this is what you want." + ) + + main(arg_dic) diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/kinect_wsj_var.py b/egs/kinect-wsj/Multi-Decoder-DPRNN/kinect_wsj_var.py new file mode 100644 index 000000000..febb57bc6 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/kinect_wsj_var.py @@ -0,0 +1,171 @@ +import torch +from torch.utils import data +import os +import numpy as np +import soundfile as sf +from asteroid.data.wsj0_mix import Wsj0mixDataset + + +def make_dataloaders( + train_dir, + valid_dir, + n_src=2, + sample_rate=16000, + segment=4.0, + batch_size=4, + num_workers=None, + **kwargs, +): + num_workers = num_workers if num_workers else batch_size + train_set = KinectWsjMixDataset(train_dir, + n_src=n_src, + sample_rate=sample_rate, + segment=segment) + val_set = KinectWsjMixDataset(valid_dir, + n_src=n_src, + sample_rate=sample_rate, + segment=segment) + train_loader = data.DataLoader( + train_set, + shuffle=True, + batch_size=batch_size, + num_workers=num_workers, + drop_last=True, + collate_fn=_collate_fn, + ) + val_loader = data.DataLoader( + val_set, + shuffle=True, + batch_size=batch_size, + num_workers=num_workers, + drop_last=True, + collate_fn=_collate_fn, + ) + return train_loader, val_loader + + +class KinectWsjMixDataset(Wsj0mixDataset): + """Dataset class for the KinectWSJ-mix source separation dataset. + + Args: + json_dir (str): The path to the directory containing the json files. + sample_rate (int, optional): The sampling rate of the wav files. + segment (float, optional): Length of the segments used for training, + in seconds. If None, use full utterances (e.g. for test). + n_src (int, optional): Number of sources in the training targets. + + References + "Analyzing the impact of speaker localization errors on speech separation + for automatic speech recognition", Sunit Sivasankaran et al. 2020. + """ + + dataset_name = "Kinect-WSJ" + + def __init__(self, json_dir, n_src=2, sample_rate=16000, segment=4.0): + super().__init__(json_dir, + n_src=n_src, + sample_rate=sample_rate, + segment=segment) + noises = [] + for i in range(len(self.mix)): + path = self.mix[i][0] + # Warning: linux specific + path_splits = path.split("/") + path_splits[-2] = "noise" + noise_path = "/" + os.path.join(*path_splits) + noises.append([noise_path, self.mix[i][1]]) + self.noises = noises + + def __getitem__(self, idx): + """Gets a mixture/sources pair. 
+ Returns: + mixture, stack([source_arrays]), noise + mixture is of dimension [samples, channels] + sources are of dimension [n_src, samples, channels] + """ + # Random start + if self.mix[idx][1] == self.seg_len or self.like_test: + rand_start = 0 + else: + rand_start = np.random.randint(0, self.mix[idx][1] - self.seg_len) + if self.like_test: + stop = None + else: + stop = rand_start + self.seg_len + # Load mixture + x, _ = sf.read(self.mix[idx][0], + start=rand_start, + stop=stop, + dtype="float32", + always_2d=True) + noise, _ = sf.read(self.noises[idx][0], + start=rand_start, + stop=stop, + dtype="float32", + always_2d=True) + # Load sources + source_arrays = [] + for src in self.sources: + if src[idx] is None: + # Target is filled with zeros if n_src > default_nsrc + s = np.zeros_like(x) + else: + s, _ = sf.read(src[idx][0], + start=rand_start, + stop=stop, + dtype="float32", + always_2d=True) + source_arrays.append(s) + sources = torch.from_numpy(np.stack(source_arrays)) + mixture = torch.from_numpy(x)[..., 0] + sources = sources[..., 0] + + return mixture, sources + + def get_infos(self): + """Get dataset infos (for publishing models). + + Returns: + dict, dataset infos with keys `dataset`, `task` and `licences`. + """ + infos = super().get_infos() + infos["licenses"].append(chime5_license) + return infos + + +def _collate_fn(batch): + """ + Args: + batch: list, len(batch) = batch_size, each entry is a tuple of (mixture, sources) + Returns: + mixtures_tensor: B x T, torch.Tensor, padded mixtures + source_tensor: B x C x T, torch.Tensor, padded in both channel and time dimension + ilens : B, torch.Tensor, length of each mixture + num_sources : B, torch.Tensor, number of sources for each mixture + """ + ilens = [len(mixture) for mixture, _ in batch] + num_sources = [len(sources) for _, sources in batch] + mixture_tensor = torch.zeros(len(batch), max(ilens)) + source_tensor = torch.zeros(len(batch), max(num_sources), max(ilens)) + + for i, (mixture, sources) in enumerate(batch): # compute length to pad to + assert len(mixture) == len(sources[0]) + mixture_tensor[i, :ilens[i]] = torch.Tensor(mixture).float() + source_tensor[i, :num_sources[i], :ilens[i]] = torch.Tensor( + np.stack(sources, axis=0)).float() + ilens = torch.Tensor(np.stack(ilens)).int() + num_sources = torch.Tensor(np.stack(num_sources)).int() + + return mixture_tensor, source_tensor, ilens, num_sources + + +chime5_license = dict( + title="The CHiME-5 speech corpus", + title_link="http://spandh.dcs.shef.ac.uk/chime_challenge/CHiME5/index.html", + author="Jon Barker, Shinji Watanabe and Emmanuel Vincent", + author_link= + "http://spandh.dcs.shef.ac.uk/chime_challenge/chime2018/contact.html", + license="CHiME-5 data licence - non-commercial 1.00", + license_link="https://licensing.sheffield.ac.uk/i/data/chime5.html", + non_commercial=True, +) diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/local/conf.yml b/egs/kinect-wsj/Multi-Decoder-DPRNN/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 
0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/local/convert_sphere2wav.sh b/egs/kinect-wsj/Multi-Decoder-DPRNN/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/Multi-Decoder-DPRNN/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/metrics.py b/egs/kinect-wsj/Multi-Decoder-DPRNN/metrics.py new file mode 100644 index 000000000..9e90aa3af --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/metrics.py @@ 
-0,0 +1,90 @@ +import torch +import torch.nn as nn +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from torch.nn.modules.loss import _Loss +from scipy.optimize import linear_sum_assignment + + +class PairwiseNegSDR_Loss(_Loss): + """ + Same as asteroid.losses.PairwiseNegSDR, but supports speaker number mismatch + """ + + def __init__(self, sdr_type, zero_mean=True, take_log=True, EPS=1e-8): + super(PairwiseNegSDR_Loss, self).__init__() + assert sdr_type in ["snr", "sisdr", "sdsdr"] + self.sdr_type = sdr_type + self.zero_mean = zero_mean + self.take_log = take_log + self.EPS = EPS + + def forward(self, est_targets, targets): + # Step 1. Zero-mean norm + if self.zero_mean: + mean_source = torch.mean(targets, dim=2, keepdim=True) + mean_estimate = torch.mean(est_targets, dim=2, keepdim=True) + targets = targets - mean_source + est_targets = est_targets - mean_estimate + # Step 2. Pair-wise SI-SDR. (Reshape to use broadcast) + s_target = torch.unsqueeze(targets, dim=1) + s_estimate = torch.unsqueeze(est_targets, dim=2) + + if self.sdr_type in ["sisdr", "sdsdr"]: + # [batch, n_src, n_src, 1] + pair_wise_dot = torch.sum(s_estimate * s_target, dim=3, keepdim=True) + # [batch, 1, n_src, 1] + s_target_energy = torch.sum(s_target ** 2, dim=3, keepdim=True) + self.EPS + # [batch, n_src, n_src, time] + pair_wise_proj = pair_wise_dot * s_target / s_target_energy + else: + # [batch, n_src, n_src, time] + pair_wise_proj = s_target.repeat(1, s_target.shape[2], 1, 1) + if self.sdr_type in ["sdsdr", "snr"]: + e_noise = s_estimate - s_target + else: + e_noise = s_estimate - pair_wise_proj + # [batch, n_src, n_src] + pair_wise_sdr = torch.sum(pair_wise_proj ** 2, dim=3) / ( + torch.sum(e_noise ** 2, dim=3) + self.EPS + ) + if self.take_log: + pair_wise_sdr = 10 * torch.log10(pair_wise_sdr + self.EPS) + return -pair_wise_sdr + + +class Penalized_PIT_Wrapper(nn.Module): + """ + Implementation of P-Si-SNR, as purposed in [1] + References: + [1] "Multi-Decoder DPRNN: High Accuracy Source Counting and Separation", + Junzhe Zhu, Raymond Yeh, Mark Hasegawa-Johnson. https://arxiv.org/abs/2011.12022 + """ + + def __init__(self, loss_func, penalty=30, perm_reduce=None): + super().__init__() + assert penalty > 0, "penalty term should be positive" + self.neg_penalty = -penalty + self.perm_reduce = perm_reduce + self.loss_func = loss_func + + def forward(self, est_targets, targets, **kwargs): + """ + est_targets: torch.Tensor, $(est_nsrc, ...)$ + targets: torch.Tensor, $(gt_nsrc, ...)$ + """ + est_nsrc, T = est_targets.size() + gt_nsrc = est_targets.size(0) + pw_losses = self.loss_func(est_targets.unsqueeze(0), targets.unsqueeze(0)).squeeze(0) + # After transposition, dim 1 corresp. to sources and dim 2 to estimates + pwl = pw_losses.transpose(-1, -2) + # Loop over batch + row indices are always ordered for square matrices. + row, col = [torch.Tensor(x).long() for x in linear_sum_assignment(pwl.detach().cpu())] + avg_neg_sdr = pwl[row, col].mean() + p_si_snr = ( + -avg_neg_sdr * min(est_nsrc, gt_nsrc) + self.neg_penalty * abs(est_nsrc - gt_nsrc) + ) / max(est_nsrc, gt_nsrc) + return p_si_snr + + +# alias +pairwise_neg_sisdr_loss = PairwiseNegSDR_Loss("sisdr") diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/model.py b/egs/kinect-wsj/Multi-Decoder-DPRNN/model.py new file mode 100644 index 000000000..ae30e99b2 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/model.py @@ -0,0 +1,509 @@ +""" +Author: Joseph(Junzhe) Zhu, 2021/5. 
Email: josefzhu@stanford.edu / junzhe.joseph.zhu@gmail.com +For the original code for the paper[1], please refer to https://github.com/JunzheJosephZhu/MultiDecoder-DPRNN +Demo Page: https://junzhejosephzhu.github.io/Multi-Decoder-DPRNN/ +Multi-Decoder DPRNN is a method for source separation when the number of speakers is unknown. +Our contribution is using multiple output heads, with each head modelling a distinct number of source outputs. +In addition, we design a selector network which determines which output head to use, i.e. estimates the number of sources. +The "DPRNN" part of the architecture is orthogonal to our contribution, and can be replaced with any other separator, e.g. Conv/LSTM-TasNet. +References: + [1] "Multi-Decoder DPRNN: High Accuracy Source Counting and Separation", + Junzhe Zhu, Raymond Yeh, Mark Hasegawa-Johnson. https://arxiv.org/abs/2011.12022 +""" +import json +import os +import numpy as np +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn.functional import fold, unfold + +from asteroid import torch_utils +from asteroid.models import BaseModel +from asteroid_filterbanks import make_enc_dec +from asteroid.engine.optimizers import make_optimizer +from asteroid.masknn import activations, norms +from asteroid.masknn.recurrent import DPRNNBlock +from asteroid.models.base_models import _shape_reconstructed, _unsqueeze_to_3d +from asteroid.utils.generic_utils import has_arg +from asteroid.utils.torch_utils import pad_x_to_y, script_if_tracing, jitable_shape +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr + + +def make_model_and_optimizer(conf, sample_rate): + """Function to define the model and optimizer for a config dictionary. + Args: + conf: Dictionary containing the output of hierachical argparse. + Returns: + model, optimizer. + The main goal of this function is to make reloading for resuming + and evaluation very simple. + """ + model = MultiDecoderDPRNN(**conf["masknet"], **conf["filterbank"], sample_rate=sample_rate) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + return model, optimizer + + +class MultiDecoderDPRNN(BaseModel): + """Multi-Decoder Dual-Path RNN as proposed in [1]. + + Args: + n_srcs (list of int): range of possible number of sources + bn_chan (int): Number of channels after the bottleneck. + Defaults to 128. + hid_size (int): Number of neurons in the RNNs cell state. + Defaults to 128. + chunk_size (int): window size of overlap and add processing. + Defaults to 100. + hop_size (int or None): hop size (stride) of overlap and add processing. + Default to `chunk_size // 2` (50% overlap). + n_repeats (int): Number of repeats. Defaults to 6. + norm_type (str, optional): Type of normalization to use. To choose from + - ``'gLN'``: global Layernorm + - ``'cLN'``: channelwise Layernorm + mask_act (str, optional): Which non-linear function to generate mask. + bidirectional (bool, optional): True for bidirectional Inter-Chunk RNN + (Intra-Chunk is always bidirectional). + rnn_type (str, optional): Type of RNN used. Choose between ``'RNN'``, + ``'LSTM'`` and ``'GRU'``. + num_layers (int, optional): Number of layers in each RNN. + dropout (float, optional): Dropout ratio, must be in [0,1]. + kernel_size (int): Length of the filters. + n_filters (int): Number of filters / Input dimension of the masker net. + stride (int, optional): Stride of the convolution. + If None (default), set to ``kernel_size // 2``. 
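+        encoder_activation (str, optional): Activation applied to the encoder output;
+            ``None`` falls back to a linear (identity) activation.
+        use_mulcat (bool, optional): Whether to use the MulCat variant of the DPRNN blocks.
+        sample_rate (float, optional): Sampling rate of the model. Defaults to 8000.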
+ + References + [1] "Multi-Decoder DPRNN: High Accuracy Source Counting and Separation", + Junzhe Zhu, Raymond Yeh, Mark Hasegawa-Johnson. https://arxiv.org/abs/2011.12022 + """ + + def __init__( + self, + n_srcs, + bn_chan=128, + hid_size=128, + chunk_size=100, + hop_size=None, + n_repeats=6, + norm_type="gLN", + mask_act="sigmoid", + bidirectional=True, + rnn_type="LSTM", + num_layers=1, + dropout=0, + kernel_size=16, + n_filters=64, + stride=8, + encoder_activation=None, + use_mulcat=False, + sample_rate=8000, + ): + super().__init__(sample_rate=sample_rate) + self.encoder_activation = encoder_activation + self.enc_activation = activations.get(encoder_activation or "linear")() + hop_size = hop_size if hop_size is not None else chunk_size // 2 + self.encoder, _ = make_enc_dec( + "free", + kernel_size=kernel_size, + n_filters=n_filters, + stride=stride, + ) + # Update in_chan + self.masker = DPRNN_MultiStage( + in_chan=n_filters, + bn_chan=bn_chan, + hid_size=hid_size, + chunk_size=chunk_size, + hop_size=hop_size, + n_repeats=n_repeats, + norm_type=norm_type, + bidirectional=bidirectional, + rnn_type=rnn_type, + use_mulcat=use_mulcat, + num_layers=num_layers, + dropout=dropout, + ) + self.decoder_select = Decoder_Select( + kernel_size=kernel_size, + stride=stride, + in_chan=n_filters, + n_srcs=n_srcs, + bn_chan=bn_chan, + chunk_size=chunk_size, + hop_size=hop_size, + mask_act=mask_act, + ) + + """ + Args: + wav: 2D or 3D Tensor, Tensor of shape $(batch, T)$ + ground_truth: oracle number of speakers, None or list of $(batch)$ ints + Return: + reconstructed: torch.Tensor, $(batch, num_stages, max_spks, T)$ + where max_spks is the maximum possible number of speakers. + if training, num_stages=n_repeats; otherwise num_stages=0 + Speaker dimension is zero-padded for examples with num_spks < max_spks + """ + + def forward(self, wav, ground_truth=None): + shape = jitable_shape(wav) + # [batch, 1, T] + wav = _unsqueeze_to_3d(wav) + tf_rep = self.enc_activation(self.encoder(wav)) + est_masks_list = self.masker(tf_rep) + decoded, selector_output = self.decoder_select( + est_masks_list, tf_rep, ground_truth=ground_truth + ) + reconstructed = pad_x_to_y(decoded, wav) + return _shape_reconstructed(reconstructed, shape), _shape_reconstructed( + selector_output, shape + ) + + def forward_wav(self, wav, slice_size=32000, *args, **kwargs): + """Separation method for waveforms. + Unfolds a full audio into slices, estimate + Args: + wav (torch.Tensor): waveform array/tensor. + Shape: 1D, 2D or 3D tensor, time last. + Return: + output_cat (torch.Tensor): concatenated output tensor. 
+ [num_spks, T] + """ + assert not self.training, "forward_wav is only used for test mode" + T = wav.size(-1) + if wav.ndim == 1: + wav = wav.reshape(1, wav.size(0)) + assert wav.ndim == 2 # [1, T] + slice_stride = slice_size // 2 + # pad wav to integer multiple of slice_stride + T_padded = max(int(np.ceil(T / slice_stride)), 2) * slice_stride + wav = F.pad(wav, (0, T_padded - T)) + slices = wav.unfold( + dimension=-1, size=slice_size, step=slice_stride + ) # [1, slice_nb, slice_size] + slice_nb = slices.size(1) + slices = slices.squeeze(0).unsqueeze(1) + tf_rep = self.enc_activation(self.encoder(slices)) + est_masks_list = self.masker(tf_rep) + selector_input = est_masks_list[-1] # [slice_nb, bn_chan, chunk_size, n_chunks] + selector_output = self.decoder_select.selector(selector_input).reshape( + slice_nb, -1 + ) # [slice_nb, num_decs] + est_idx, _ = selector_output.argmax(-1).mode() + est_spks = self.decoder_select.n_srcs[est_idx] + output_wavs, _ = self.decoder_select( + est_masks_list, tf_rep, ground_truth=[est_spks] * slice_nb + ) # [slice_nb, 1, n_spks, slice_size] + output_wavs = output_wavs.squeeze(1)[:, :est_spks, :] + # TODO: overlap and add (with division) + output_cat = output_wavs.new_zeros(est_spks, slice_nb * slice_size) + output_cat[:, :slice_size] = output_wavs[0] + start = slice_stride + for i in range(1, slice_nb): + end = start + slice_size + overlap_prev = output_cat[:, start : start + slice_stride].unsqueeze(0) + overlap_next = output_wavs[i : i + 1, :, :slice_stride] + pw_losses = pairwise_neg_sisdr(overlap_next, overlap_prev) + _, best_indices = PITLossWrapper.find_best_perm(pw_losses) + reordered = PITLossWrapper.reorder_source(output_wavs[i : i + 1, :, :], best_indices) + output_cat[:, start : start + slice_size] += reordered.squeeze(0) + output_cat[:, start : start + slice_stride] /= 2 + start += slice_stride + return output_cat[:, :T] + + +class DPRNN_MultiStage(nn.Module): + """Implementation of the Dual-Path-RNN model, + with multi-stage output, without Conv2D projection + """ + + def __init__( + self, + in_chan, + bn_chan, + hid_size, + chunk_size, + hop_size, + n_repeats, + norm_type, + bidirectional, + rnn_type, + use_mulcat, + num_layers, + dropout, + ): + super(DPRNN_MultiStage, self).__init__() + self.in_chan = in_chan + self.bn_chan = bn_chan + self.hid_size = hid_size + self.chunk_size = chunk_size + self.hop_size = hop_size + self.n_repeats = n_repeats + self.norm_type = norm_type + self.bidirectional = bidirectional + self.rnn_type = rnn_type + self.num_layers = num_layers + self.dropout = dropout + self.use_mulcat = use_mulcat + + layer_norm = norms.get(norm_type)(in_chan) + bottleneck_conv = nn.Conv1d(in_chan, bn_chan, 1) + self.bottleneck = nn.Sequential(layer_norm, bottleneck_conv) + + # Succession of DPRNNBlocks. + self.net = nn.ModuleList([]) + for i in range(self.n_repeats): + self.net.append( + DPRNNBlock( + bn_chan, + hid_size, + norm_type=norm_type, + bidirectional=bidirectional, + rnn_type=rnn_type, + use_mulcat=use_mulcat, + num_layers=num_layers, + dropout=dropout, + ) + ) + + def forward(self, mixture_w): + """Forward. 
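+
+        Applies the bottleneck, unfolds the representation into overlapping chunks and
+        runs the stacked DPRNNBlocks, returning the output of every repeat (all stages
+        are kept so that a multi-stage loss can be applied during training).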
+ Args: + mixture_w (:class:`torch.Tensor`): Tensor of shape $(batch, nfilters, nframes)$ + Returns: + list of (:class:`torch.Tensor`): Tensor of shape $(batch, bn_chan, chunk_size, n_chunks) + """ + batch, n_filters, n_frames = mixture_w.size() + output = self.bottleneck(mixture_w) # [batch, bn_chan, n_frames] + output = unfold( + output.unsqueeze(-1), + kernel_size=(self.chunk_size, 1), + padding=(self.chunk_size, 0), + stride=(self.hop_size, 1), + ) + n_chunks = output.shape[-1] + output = output.reshape(batch, self.bn_chan, self.chunk_size, n_chunks) + # Apply stacked DPRNN Blocks sequentially + output_list = [] + for i in range(self.n_repeats): + output = self.net[i](output) + output_list.append(output) + return output_list + + +class SingleDecoder(nn.Module): + """ + Base decoder module, including the projection layer from (bn_chan) to (n_src * bn_chan). + Takes a single example mask and encoding, outputs waveform + """ + + def __init__( + self, kernel_size, stride, in_chan, n_src, bn_chan, chunk_size, hop_size, mask_act + ): + super(SingleDecoder, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.in_chan = in_chan + self.bn_chan = bn_chan + self.chunk_size = chunk_size + self.hop_size = hop_size + self.n_src = n_src + self.mask_act = mask_act + + # Masking in 3D space + net_out_conv = nn.Conv2d(bn_chan, n_src * bn_chan, 1) + self.first_out = nn.Sequential(nn.PReLU(), net_out_conv) + # Gating and masking in 2D space (after fold) + self.net_out = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Tanh()) + self.net_gate = nn.Sequential(nn.Conv1d(bn_chan, bn_chan, 1), nn.Sigmoid()) + self.mask_net = nn.Conv1d(bn_chan, in_chan, 1, bias=False) + + # Get activation function. + mask_nl_class = activations.get(mask_act) + # For softmax, feed the source dimension. 
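+        # has_arg checks whether the activation class accepts a ``dim`` keyword
+        # argument (e.g. nn.Softmax), in which case it is instantiated with dim=1.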
+ if has_arg(mask_nl_class, "dim"): + self.output_act = mask_nl_class(dim=1) + else: + self.output_act = mask_nl_class() + + _, self.trans_conv = make_enc_dec( + "free", kernel_size=kernel_size, stride=stride, n_filters=in_chan + ) + + def forward(self, output, mixture_w): + """ + Args: + output: LSTM output, Tensor of shape $(num_stages, bn_chan, chunk_size, n_chunks)$ + mixture_w: Encoder output, Tensor of shape $(num_stages, in_chan, nframes) + outputs: + est_wavs: Signal, Tensor of shape $(num_stages, n_src, T) + """ + batch, bn_chan, chunk_size, n_chunks = output.size() + _, in_chan, n_frames = mixture_w.size() + assert self.bn_chan == bn_chan + assert self.in_chan == in_chan + assert self.chunk_size == chunk_size + output = self.first_out(output) + output = output.reshape(batch * self.n_src, self.bn_chan, self.chunk_size, n_chunks) + # Overlap and add: + # [batch, out_chan, chunk_size, n_chunks] -> [batch, out_chan, n_frames] + to_unfold = self.bn_chan * self.chunk_size + output = fold( + output.reshape(batch * self.n_src, to_unfold, n_chunks), + (n_frames, 1), + kernel_size=(self.chunk_size, 1), + padding=(self.chunk_size, 0), + stride=(self.hop_size, 1), + ) + # Apply gating + output = output.reshape(batch * self.n_src, self.bn_chan, -1) + output = self.net_out(output) * self.net_gate(output) + # Compute mask + score = self.mask_net(output) + est_mask = self.output_act(score) + est_mask = est_mask.reshape(batch, self.n_src, self.in_chan, n_frames) + mixture_w = mixture_w.unsqueeze(1) + source_w = est_mask * mixture_w + source_w = source_w.reshape(batch * self.n_src, self.in_chan, n_frames) + est_wavs = self.trans_conv(source_w) + est_wavs = est_wavs.reshape(batch, self.n_src, -1) + return est_wavs + + +class Decoder_Select(nn.Module): + """Selects which SingleDecoder to use, as well as whether to use multiloss, as proposed in [1] + References + [1] "Multi-Decoder DPRNN: High Accuracy Source Counting and Separation", + Junzhe Zhu, Raymond Yeh, Mark Hasegawa-Johnson. https://arxiv.org/abs/2011.12022 + """ + + def __init__( + self, kernel_size, stride, in_chan, n_srcs, bn_chan, chunk_size, hop_size, mask_act + ): + super().__init__() + self.kernel_size = kernel_size + self.stride = stride + self.in_chan = in_chan + self.n_srcs = n_srcs + self.bn_chan = bn_chan + self.chunk_size = chunk_size + self.hop_size = hop_size + self.mask_act = mask_act + + self.n_src2idx = {n_src: i for i, n_src in enumerate(n_srcs)} + self.decoders = torch.nn.ModuleList() + for n_src in n_srcs: + self.decoders.append( + SingleDecoder( + kernel_size=kernel_size, + stride=stride, + in_chan=in_chan, + n_src=n_src, + bn_chan=bn_chan, + chunk_size=chunk_size, + hop_size=hop_size, + mask_act=mask_act, + ) + ) + self.selector = nn.Sequential( + nn.Conv2d(bn_chan, in_chan, 1), + nn.AdaptiveAvgPool2d(1), + nn.ReLU(), + nn.Conv2d(in_chan, len(n_srcs), 1), + ) + + def forward(self, output_list, mixture_w, ground_truth): + """Forward + Args: + output_list: list of $(batch, bn_chan, chunk_size, n_chunks)$ + mixture_w: torch.Tensor, $(batch, in_chan, n_frames)$ + ground_truth: None, or list of [B] ints, or Long Tensor of $(B) + if None, use inferred number of speakers to determine output shape + Output: + output_wavs: torch.Tensor, $(batch, num_stages, max_spks, T)$ + where the speaker dimension is padded for examples with num_spks < max_spks + if training, num_stages=n_repeats; otherwise, num_stages=1 + selector_output: output logits from selector module. 
torch.Tensor, $(batch, num_stages, num_decoders)$ + """ + batch, bn_chan, chunk_size, n_chunks = output_list[0].size() + _, in_chan, n_frames = mixture_w.size() + assert self.chunk_size == chunk_size + if not self.training: + output_list = output_list[-1:] + num_stages = len(output_list) + # [batch, num_stages, bn_chan, chunk_size, n_chunks] + output = torch.stack(output_list, 1).reshape( + batch * num_stages, bn_chan, chunk_size, n_chunks + ) + selector_output = self.selector(output).reshape(batch, num_stages, -1) + output = output.reshape(batch, num_stages, bn_chan, chunk_size, n_chunks) + # [batch, num_stages, in_chan, n_frames] + mixture_w = mixture_w.unsqueeze(1).repeat(1, num_stages, 1, 1) + if ground_truth is not None: # oracle + decoder_selected = torch.LongTensor([self.n_src2idx[truth] for truth in ground_truth]) + else: + assert num_stages == 1 # can't use select with multistage + decoder_selected = selector_output.reshape(batch, -1).argmax(1) + T = self.kernel_size + self.stride * (n_frames - 1) + output_wavs = torch.zeros(batch, num_stages, max(self.n_srcs), T).to(output.device) + for i in range(batch): + output_wavs[i, :, : self.n_srcs[decoder_selected[i]], :] = self.decoders[ + decoder_selected[i] + ](output[i], mixture_w[i]) + return output_wavs, selector_output + + +def load_best_model(train_conf, exp_dir, sample_rate): + """Load best model after training. + + Args: + train_conf (dict): dictionary as expected by `make_model_and_optimizer` + exp_dir(str): Experiment directory. Expects to find + `'best_k_models.json'` of `checkpoints` directory in it. + + Returns: + nn.Module the best (or last) pretrained model according to the val_loss. + """ + # Create the model from recipe-local function + model, _ = make_model_and_optimizer(train_conf, sample_rate=sample_rate) + try: + # Last best model summary + with open(os.path.join(exp_dir, "best_k_models.json"), "r") as f: + best_k = json.load(f) + best_model_path = min(best_k, key=best_k.get) + except FileNotFoundError: + # Get last checkpoint + all_ckpt = os.listdir(os.path.join(exp_dir, "checkpoints/")) + all_ckpt = [ + (ckpt, int("".join(filter(str.isdigit, os.path.basename(ckpt))))) + for ckpt in all_ckpt + if ckpt.find("ckpt") >= 0 + ] + all_ckpt.sort(key=lambda x: x[1]) + best_model_path = os.path.join(exp_dir, "checkpoints", all_ckpt[-1][0]) + # Load checkpoint + checkpoint = torch.load(best_model_path, map_location="cpu") + # Load state_dict into model. + model = torch_utils.load_state_dict_in(checkpoint["state_dict"], model) + model.eval() + return model + + +# Training notes: +# Weight different stages in accordance with facebook code +if __name__ == "__main__": + network = MultiDecoderDPRNN(n_srcs=[2, 3], bn_chan=32, hid_size=32, n_filters=16) + # training + input = torch.rand(2, 3200) + wavs, selector_output = network(input, [3, 2]) + print(wavs.shape) + assert (wavs[1, :, 2] == 0).all() + # validation + network.eval() + wavs, selector_output = network(input) + print(wavs.shape) + # test + input_wav = torch.rand(64351) + output_wavs = network.forward_wav(input_wav) + print(output_wavs.shape) diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/run.sh b/egs/kinect-wsj/Multi-Decoder-DPRNN/run.sh new file mode 100755 index 000000000..ea5f44ec5 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/run.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +echo "pretrained model can be found at: https://huggingface.co/JunzheJosephZhu/MultiDecoderDPRNN" + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. 
You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. +storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. +python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES + +# Data +#data_dir=data # Local data directory (No disk space needed) +sample_rate=16000 +mode=max +#n_srcs=(2 3 4 5) +n_srcs=(2) + +# Training +batch_size=32 +num_workers=8 +optimizer=rmsprop +lr=0.0001 +weight_decay=0.0 +epochs=200 +lambda=0.05 +resume_from= + +# Evaluation +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=2speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=$( + IFS=$'' + echo "${n_srcs[*]}" + )sep_${sr_string}k${mode}_${uuid} +fi +expdir=exp/train_mddprnn_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + echo "visible cuda devices are ${id[*]}" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --epochs $epochs \ + --batch_size $batch_size \ + --lambda $lambda \ + --num_workers $num_workers \ + --exp_dir ${expdir}/ | tee logs/train_${tag}.log + --resume_from $resume_from + cp logs/train_${tag}.log $expdir/train.log +fi + +if [[ $stage -le 4 ]]; then + expdir=exp/tmp + echo "Stage 4 : Evaluation" + echo "If you want to change n_srcs, please change the config file" + CUDA_VISIBLE_DEVICES=$id $python_path eval.py \ + --test_dir $test_dir \ + --use_gpu $eval_use_gpu \ + --exp_dir ${expdir} | tee logs/eval_${tag}.log + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/train.py b/egs/kinect-wsj/Multi-Decoder-DPRNN/train.py new file mode 100644 index 000000000..ef04f3486 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/train.py @@ -0,0 +1,257 @@ +import os +import argparse +import json +import torch +from torch import nn +from torch.optim.lr_scheduler import ReduceLROnPlateau, ExponentialLR +from torch.nn.parallel import DistributedDataParallel +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.engine.system import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.losses import deep_clustering_loss +from asteroid_filterbanks.transforms import mag +from asteroid.dsp.vad import ebased_vad +from kinect_wsj_var import make_dataloaders + +from model import make_model_and_optimizer +import numpy as np + +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") +parser.add_argument("--resume_from", default=None, help="Model to resume from") + + +def main(conf): + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + + model, optimizer = make_model_and_optimizer( + conf, sample_rate=conf["data"]["sample_rate"]) + scheduler = [] + if conf["training"]["half_lr"]: + scheduler.append( + ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)) + if conf["training"]["lr_decay"]: + scheduler.append(ExponentialLR(optimizer=optimizer, gamma=0.99)) + exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + loss_func = WeightedPITLoss(n_srcs=conf["masknet"]["n_srcs"], + lamb=conf["loss"]["lambda"]) + # Put together in 
System + system = VarSpkrSystem( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint( + dirpath=checkpoint_dir, + filename="{epoch}-{step}", + monitor="avg_sdr", + mode="max", + save_top_k=5, + verbose=True, + ) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="avg_sdr", + mode="max", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "dp" if torch.cuda.is_available() else None + + # Train model + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=200, + resume_from_checkpoint=conf["main_args"]["resume_from"], + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + # Save last model for convenience + torch.save(system.model.state_dict(), + os.path.join(exp_dir, "final_model.pth")) + + +class VarSpkrSystem(System): + def common_step(self, batch, batch_nb, train=True): + mixture_tensor, source_tensor, ilens, num_sources = batch + pred_tensor, selector_output = self(mixture_tensor, + num_sources.tolist()) + batch_size, num_stages, _, T = pred_tensor.size() + avg_loss = 0 + spks_sdr = [] + accuracy = 0 + for i in range(batch_size): + est_src = pred_tensor[i, :, :num_sources[i], :ilens[i]] + src = source_tensor[i, :num_sources[i], :ilens[i]] + logits = selector_output[i] + loss, pos_sdr, correctness = self.loss_func(est_src, logits, src) + avg_loss = avg_loss + loss / batch_size + spks_sdr.append((num_sources[i], pos_sdr)) + accuracy += correctness / batch_size + return avg_loss, spks_sdr, accuracy + + def training_step(self, batch, batch_nb): + avg_loss, spks_sdr, accuracy = self.common_step(batch, + batch_nb, + train=True) + self.log("loss", avg_loss) + for num_spks, sdr in spks_sdr: + self.log(f"{num_spks}spks_sdr_tr", sdr) + self.log("acc_tr", accuracy) + return avg_loss + + def validation_step(self, batch, batch_nb): + avg_loss, spks_sdr, accuracy = self.common_step(batch, + batch_nb, + train=False) + self.log("val_loss", avg_loss) + for num_spks, sdr in spks_sdr: + self.log(f"{num_spks}spks_sdr_val", sdr) + self.log("acc_val", accuracy) + # SDR averaged across number of sources + avg_sdr = torch.mean(torch.Tensor([sdr for _, sdr in spks_sdr])) + self.log("avg_sdr", avg_sdr) + + def unpack_data(self, batch): + """ + Args: + batch: list, len(batch) = batch_size, each entry is a tuple of (mixture, sources) + Returns: + mixtures_tensor: B x T, torch.Tensor, padded mixtures + source_tensor: B x C x T, torch.Tensor, padded in both channel and time dimension + ilens : B, torch.Tensor, length of each mixture + num_sources : B, torch.Tensor, number of sources for each mixture + """ + + #print("*" * 100) + #print(len(batch)) + #mixture, source, noise = batch + #print("*"*100) + #print(mixture.shape) + #print(batch.shape) + ilens = [len(mixture[..., 0]) for mixture, source, noise in batch] + num_sources = [len(source[..., 0]) for mixture, source, noise in batch] + 
mixture_tensor = torch.zeros(len(batch), max(ilens)) + source_tensor = torch.zeros(len(batch), max(num_sources), max(ilens)) + + for i, (mixture, + sources) in enumerate(batch[:2]): # compute length to pad to + assert len(mixture) == len(sources[..., 0][0]) + mixture_tensor[i, :ilens[i]] = torch.Tensor(mixture[..., + 0]).float() + source_tensor[i, :num_sources[i], :ilens[i]] = torch.Tensor( + np.stack(sources[..., 0], axis=0)).float() + ilens = torch.Tensor(np.stack(ilens)).int() + num_sources = torch.Tensor(np.stack(num_sources)).int() + + return mixture_tensor, source_tensor, ilens, num_sources + ''' + ilens = [len(mixture) for mixture, _ in batch] + num_sources = [len(sources) for _, sources in batch] + mixture_tensor = torch.zeros(len(batch), max(ilens)) + source_tensor = torch.zeros(len(batch), max(num_sources), max(ilens)) + + for i, (mixture, sources) in enumerate(batch): # compute length to pad to + assert len(mixture) == len(sources[0]) + mixture_tensor[i, : ilens[i]] = torch.Tensor(mixture).float() + source_tensor[i, : num_sources[i], : ilens[i]] = torch.Tensor( + np.stack(sources, axis=0) + ).float() + ilens = torch.Tensor(np.stack(ilens)).int() + num_sources = torch.Tensor(np.stack(num_sources)).int() + + return mixture_tensor, source_tensor, ilens, num_sources + ''' + + +class WeightedPITLoss(nn.Module): + """ + This loss has two components. One is the standard PIT loss, with Si-SDR summed(not mean, but sum) over each source + under the best matching permutation. The other component is the classification loss, which is cross entropy for the + speaker number classification head network. + """ + def __init__(self, n_srcs, lamb=0.05): + super().__init__() + self.n_src2idx = {n_src: i for i, n_src in enumerate(n_srcs)} + self.cce = nn.CrossEntropyLoss(reduction="none") + self.lamb = lamb + + def forward(self, est_src, logits, src): + """Forward + Args: + est_src: $(num_stages, n_src, T) + logits: $(num_stages, num_decoders) + src: $(n_src, T) + """ + assert est_src.size()[1:] == src.size() + num_stages, n_src, T = est_src.size() + target_src = src.unsqueeze(0).repeat(num_stages, 1, 1) + target_idx = self.n_src2idx[n_src] + + pw_losses = pairwise_neg_sisdr(est_src, target_src) + sdr_loss, _ = PITLossWrapper.find_best_perm(pw_losses) + pos_sdr = -sdr_loss[-1] + + cls_target = torch.LongTensor([target_idx] * num_stages).to( + logits.device) + cls_loss = self.cce(logits, cls_target) + correctness = logits[-1].argmax().item() == target_idx + + coeffs = torch.Tensor([ + (c_idx + 1) * (1 / num_stages) for c_idx in range(num_stages) + ]).to(logits.device) + assert coeffs.size() == sdr_loss.size() == cls_loss.size() + # use sum of SDR for each channel, not mean + loss = torch.sum(coeffs * (sdr_loss * n_src + cls_loss * self.lamb)) + + return loss, pos_sdr, correctness + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). 
+ # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/utils/parse_options.sh b/egs/kinect-wsj/Multi-Decoder-DPRNN/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. 
+ if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. \ No newline at end of file diff --git a/egs/kinect-wsj/Multi-Decoder-DPRNN/utils/prepare_python_env.sh b/egs/kinect-wsj/Multi-Decoder-DPRNN/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/Multi-Decoder-DPRNN/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" 
\ No newline at end of file diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/.DS_Store b/egs/kinect-wsj/SuDORMRFImprovedNet/.DS_Store new file mode 100644 index 000000000..da30f0f34 Binary files /dev/null and b/egs/kinect-wsj/SuDORMRFImprovedNet/.DS_Store differ diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/README.md b/egs/kinect-wsj/SuDORMRFImprovedNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/eval.py b/egs/kinect-wsj/SuDORMRFImprovedNet/eval.py new file mode 100644 index 000000000..782e4c00f --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/eval.py @@ -0,0 +1,143 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import SuDORMRFImprovedNet +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates +from asteroid.data import KinectWsjMixDataset + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument("--n_src", type=int, default=2) +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = SuDORMRFImprovedNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the indexes of sentences to save. + eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. 
+ mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. + all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/local/conf.yml b/egs/kinect-wsj/SuDORMRFImprovedNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git 
a/egs/kinect-wsj/SuDORMRFImprovedNet/local/convert_sphere2wav.sh b/egs/kinect-wsj/SuDORMRFImprovedNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/SuDORMRFImprovedNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/run.sh b/egs/kinect-wsj/SuDORMRFImprovedNet/run.sh new file mode 100644 index 000000000..66272c157 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/run.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. 
You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. +storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. +python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config + +# Training config +epochs=200 +batch_size=12 +num_workers=4 +half_lr=yes +early_stop=yes +# Optim config +optimizer=adam +lr=0.001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 +#segment=3 +#task=enh_single # one of 'enh_single', 'enh_both', 'sep_clean', 'sep_noisy' + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=${n_src}speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_sudormrfimproved_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py --exp_dir $expdir \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --n_src $n_src | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/SuDORMRF" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4: Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/train.py b/egs/kinect-wsj/SuDORMRFImprovedNet/train.py new file mode 100644 index 000000000..f65a10edf --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/train.py @@ -0,0 +1,124 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import SuDORMRFImprovedNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + model = SuDORMRFImprovedNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. 
+ exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + #to_save.update(train_set.get_infos()) + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/utils/parse_options.sh b/egs/kinect-wsj/SuDORMRFImprovedNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. 
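The eval.py of this recipe builds one pandas Series of metrics per utterance and then reports both the raw averages and the improvement over the unprocessed mixture (the `*_imp` entries). The toy rerun of that aggregation below uses made-up numbers for two utterances and a single metric, purely to make the bookkeeping explicit.

```python
# Toy version of the metric aggregation done at the end of eval.py (values made up).
import pandas as pd

series_list = [
    pd.Series({"si_sdr": 9.8, "input_si_sdr": 0.4}),
    pd.Series({"si_sdr": 11.2, "input_si_sdr": 1.1}),
]
all_metrics_df = pd.DataFrame(series_list)

final_results = {}
for metric_name in ["si_sdr"]:
    ldf = all_metrics_df[metric_name] - all_metrics_df["input_" + metric_name]
    final_results[metric_name] = all_metrics_df[metric_name].mean()
    final_results[metric_name + "_imp"] = ldf.mean()

print(final_results)  # ~{'si_sdr': 10.5, 'si_sdr_imp': 9.75}
```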
\ No newline at end of file diff --git a/egs/kinect-wsj/SuDORMRFImprovedNet/utils/prepare_python_env.sh b/egs/kinect-wsj/SuDORMRFImprovedNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFImprovedNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file diff --git a/egs/kinect-wsj/SuDORMRFNet/.DS_Store b/egs/kinect-wsj/SuDORMRFNet/.DS_Store new file mode 100644 index 000000000..da30f0f34 Binary files /dev/null and b/egs/kinect-wsj/SuDORMRFNet/.DS_Store differ diff --git a/egs/kinect-wsj/SuDORMRFNet/README.md b/egs/kinect-wsj/SuDORMRFNet/README.md new file mode 100644 index 000000000..e17edd007 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/README.md @@ -0,0 +1,2 @@ +# Results +Coming soon diff --git a/egs/kinect-wsj/SuDORMRFNet/eval.py b/egs/kinect-wsj/SuDORMRFNet/eval.py new file mode 100644 index 000000000..57721e445 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/eval.py @@ -0,0 +1,143 @@ +import os +import random +import soundfile as sf +import torch +import yaml +import json +import argparse +import pandas as pd +from tqdm import tqdm +from pprint import pprint + +from asteroid.metrics import get_metrics +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid import SuDORMRFNet +from asteroid.utils import tensors_to_device +from asteroid.dsp.normalization import normalize_estimates +from asteroid.data import KinectWsjMixDataset + +parser = argparse.ArgumentParser() +parser.add_argument("--test_dir", + type=str, + required=True, + help="Test directory including the csv files") +parser.add_argument("--n_src", type=int, default=2) +parser.add_argument( + "--out_dir", + type=str, + required=True, + help="Directory in exp_dir where the eval results" + " will be stored", +) +parser.add_argument("--use_gpu", + type=int, + default=0, + help="Whether to use the GPU for model execution") +parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root") +parser.add_argument("--n_save_ex", + type=int, + default=10, + help="Number of audio examples to save, -1 means all") + +compute_metrics = ["si_sdr", "sdr", "sir", "sar", "stoi"] + + +def main(conf): + model_path = os.path.join(conf["exp_dir"], "best_model.pth") + model = SuDORMRFNet.from_pretrained(model_path) + # Handle device placement + if conf["use_gpu"]: + model.cuda() + model_device = next(model.parameters()).device + test_set = KinectWsjMixDataset(conf["test_dir"], + n_src=conf["n_src"], + segment=None) # Uses all segment length + # Used to reorder sources only + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + + # Randomly choose the 
indexes of sentences to save. + eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"]) + ex_save_dir = os.path.join(eval_save_dir, "examples/") + if conf["n_save_ex"] == -1: + conf["n_save_ex"] = len(test_set) + save_idx = random.sample(range(len(test_set)), conf["n_save_ex"]) + series_list = [] + torch.no_grad().__enter__() + for idx in tqdm(range(len(test_set))): + # Forward the network on the mixture. + mix, sources, noises = tensors_to_device(test_set[idx], + device=model_device) + mix = mix[..., 0] + sources = sources[..., 0] + #est_sources = model(mix.unsqueeze(0)) + est_sources = model.separate(mix[None, None]) + loss, reordered_sources = loss_func(est_sources, + sources[None], + return_est=True) + mix_np = mix[None].cpu().data.numpy() + sources_np = sources.cpu().data.numpy() + est_sources_np = reordered_sources.squeeze(0).cpu().data.numpy() + # For each utterance, we get a dictionary with the mixture path, + # the input and output metrics + utt_metrics = get_metrics( + mix_np, + sources_np, + est_sources_np, + sample_rate=conf["sample_rate"], + metrics_list=compute_metrics, + ) + utt_metrics["mix_path"] = test_set.mix[idx][0] + est_sources_np_normalized = normalize_estimates(est_sources_np, mix_np) + + series_list.append(pd.Series(utt_metrics)) + + # Save some examples in a folder. Wav files and metrics as text. + if idx in save_idx: + local_save_dir = os.path.join(ex_save_dir, "ex_{}/".format(idx)) + os.makedirs(local_save_dir, exist_ok=True) + sf.write(local_save_dir + "mixture.wav", mix_np[0], + conf["sample_rate"]) + # Loop over the sources and estimates + for src_idx, src in enumerate(sources_np): + sf.write(local_save_dir + "s{}.wav".format(src_idx), src, + conf["sample_rate"]) + for src_idx, est_src in enumerate(est_sources_np_normalized): + sf.write( + local_save_dir + "s{}_estimate.wav".format(src_idx), + est_src, + conf["sample_rate"], + ) + # Write local metrics to the example folder. + with open(local_save_dir + "metrics.json", "w") as f: + json.dump(utt_metrics, f, indent=0) + + # Save all metrics to the experiment folder. 
+ all_metrics_df = pd.DataFrame(series_list) + all_metrics_df.to_csv(os.path.join(eval_save_dir, "all_metrics.csv")) + + # Print and save summary metrics + final_results = {} + for metric_name in compute_metrics: + input_metric_name = "input_" + metric_name + ldf = all_metrics_df[metric_name] - all_metrics_df[input_metric_name] + final_results[metric_name] = all_metrics_df[metric_name].mean() + final_results[metric_name + "_imp"] = ldf.mean() + + print("Overall metrics :") + pprint(final_results) + + with open(os.path.join(eval_save_dir, "final_metrics.json"), "w") as f: + json.dump(final_results, f, indent=0) + + +if __name__ == "__main__": + args = parser.parse_args() + arg_dic = dict(vars(args)) + # Load training config + conf_path = os.path.join(args.exp_dir, "conf.yml") + with open(conf_path) as f: + train_conf = yaml.safe_load(f) + arg_dic["sample_rate"] = train_conf["data"]["sample_rate"] + arg_dic["train_conf"] = train_conf + + main(arg_dic) diff --git a/egs/kinect-wsj/SuDORMRFNet/local/conf.yml b/egs/kinect-wsj/SuDORMRFNet/local/conf.yml new file mode 100644 index 000000000..fe981d697 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/local/conf.yml @@ -0,0 +1,33 @@ +# Filterbank config +filterbank: + n_filters: 512 + kernel_size: 512 + stride: 256 +# Network config +masknet: + rnn_type: lstm + n_layers: 4 + hidden_size: 600 + dropout: 0.3 + embedding_dim: 40 + take_log: y +# Training config +training: + epochs: 200 + batch_size: 32 + num_workers: 12 + half_lr: yes + early_stop: yes + loss_alpha: 1.0 # DC loss weight : 1.0 => DC, <1.0 => Chimera +# Optim config +optim: + optimizer: rmsprop + lr: 0.00001 + weight_decay: 0.00000 +# momentum: 0.9 +# Data config +data: + train_dir: data/wav16k/max/tr/ + valid_dir: data/wav16k/max/cv/ + n_src: 2 + sample_rate: 16000 diff --git a/egs/kinect-wsj/SuDORMRFNet/local/convert_sphere2wav.sh b/egs/kinect-wsj/SuDORMRFNet/local/convert_sphere2wav.sh new file mode 100644 index 000000000..8870bf096 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/local/convert_sphere2wav.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# MIT Copyright (c) 2018 Kaituo XU + + +sphere_dir=tmp +wav_dir=tmp + +. utils/parse_options.sh || exit 1; + + +echo "Download sph2pipe_v2.5 into egs/tools" +mkdir -p ../../tools +wget http://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz -P ../../tools +cd ../../tools && tar -xzvf sph2pipe_v2.5.tar.gz && gcc -o sph2pipe_v2.5/sph2pipe sph2pipe_v2.5/*.c -lm && cd - + +echo "Convert sphere format to wav format" +sph2pipe=../../tools/sph2pipe_v2.5/sph2pipe + +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +tmp=data/local/ +mkdir -p $tmp + +[ ! -f $tmp/sph.list ] && find $sphere_dir -iname '*.wv*' | grep -e 'si_tr_s' -e 'si_dt_05' -e 'si_et_05' > $tmp/sph.list + +if [ ! -d $wav_dir ]; then + while read line; do + wav=`echo "$line" | sed "s:wv1:wav:g" | awk -v dir=$wav_dir -F'/' '{printf("%s/%s/%s/%s", dir, $(NF-2), $(NF-1), $NF)}'` + echo $wav + mkdir -p `dirname $wav` + $sph2pipe -f wav $line > $wav + done < $tmp/sph.list > $tmp/wav.list +else + echo "Do you already get wav files? 
if not, please remove $wav_dir" +fi diff --git a/egs/kinect-wsj/SuDORMRFNet/local/preprocess_kinect_wsj.py b/egs/kinect-wsj/SuDORMRFNet/local/preprocess_kinect_wsj.py new file mode 100644 index 000000000..35ef4737e --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/local/preprocess_kinect_wsj.py @@ -0,0 +1,48 @@ +import argparse +import json +import os +import soundfile as sf + + +def preprocess_one_dir(in_dir, out_dir, out_filename): + """ Create .json file for one condition.""" + file_infos = [] + in_dir = os.path.abspath(in_dir) + wav_list = os.listdir(in_dir) + wav_list.sort() + for wav_file in wav_list: + if not wav_file.endswith(".wav"): + continue + wav_path = os.path.join(in_dir, wav_file) + samples = sf.SoundFile(wav_path) + file_infos.append((wav_path, len(samples))) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(os.path.join(out_dir, out_filename + ".json"), "w") as f: + json.dump(file_infos, f, indent=4) + + +def preprocess(inp_args): + """ Create .json files for all conditions.""" + speaker_list = ["mix"] + [f"s{n+1}" for n in range(inp_args.n_src)] + for data_type in ["tr", "cv", "tt"]: + for spk in speaker_list: + preprocess_one_dir( + os.path.join(inp_args.in_dir, data_type, spk), + os.path.join(inp_args.out_dir, data_type), + spk, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Kinect-WSJ data preprocessing") + parser.add_argument( + "--in_dir", type=str, default=None, help="Directory path of wham including tr, cv and tt" + ) + parser.add_argument("--n_src", type=int, default=2, help="Number of sources in wsj0-mix") + parser.add_argument( + "--out_dir", type=str, default=None, help="Directory path to put output files" + ) + args = parser.parse_args() + print(args) + preprocess(args) diff --git a/egs/kinect-wsj/SuDORMRFNet/run.sh b/egs/kinect-wsj/SuDORMRFNet/run.sh new file mode 100644 index 000000000..d4e6fefa4 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/run.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +# Exit on error +set -e +set -o pipefail + +# Main storage directory. You'll need disk space to dump the WHAM mixtures and the wsj0 wav +# files if you start from sphere files. +storage_dir= +# If you start from the sphere files, specify the path to the directory and start from stage 0 +sphere_dir= # Directory containing sphere files +# If you already have wsj0 wav files (converted from sphere format). +wsj0_wav_dir="/srv/storage/talc3@talc-data.nancy/multispeech/calcul/users/mpariente/DATA/wsj0_wav/" +# If you already have kinect_wsj specify the path in the kinect_wsj_path and and start from stage 2. +wsj0mix_wav_dir= +chime_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/speech_recognition/CHiME5/audio/" +dihard_path="/srv/storage/talc@talc-data.nancy/multispeech/corpus/DIHARD2/LDC2019E31_Second_DIHARD_Challenge_Development_Data/data/multichannel/sad/" +# Path to save the data +kinect_wsj_path="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/dataset/2speakers_reverb_kinect" +# After running the recipe a first time, you can run it from stage 3 directly to train new models. +# Path to final kinect-wsj data, run from stage 3 +data="/srv/storage/talc2@talc-data2.nancy.grid5000.fr/multispeech/calcul/users/ccui/asteroid/egs/kinect-wsj/DeepClustering/data" + +# Path to the python you'll use for the experiment. Defaults to the current python +# You can run ./utils/prepare_python_env.sh to create a suitable python environment, paste the output here. 
+python_path=python +# Example usage +# ./run.sh --stage 3 --tag my_tag --task sep_noisy --id 0,1 + +# General +stage=3 # Controls from which stage to start +tag="" # Controls the directory name associated to the experiment +# You can ask for several GPUs using id (passed to CUDA_VISIBLE_DEVICES) +id=$CUDA_VISIBLE_DEVICES +out_dir=kinect_wsj # Controls the directory name associated to the evaluation results inside the experiment directory + +# Network config + +# Training config +epochs=200 +batch_size=1 +num_workers=4 +half_lr=yes +early_stop=yes +# Optim config +optimizer=adam +lr=0.001 +weight_decay=0. +# Data config +sample_rate=16000 +mode=max +n_src=2 +#segment=3 +#task=enh_single # one of 'enh_single', 'enh_both', 'sep_clean', 'sep_noisy' + +eval_use_gpu=1 + +. utils/parse_options.sh + +sr_string=$(($sample_rate / 1000)) +suffix=${n_src}speakers/wav${sr_string}k/$mode +dumpdir=$data/$suffix # directory to put generated json file + +train_dir=$dumpdir/tr +valid_dir=$dumpdir/cv +test_dir=$dumpdir/tt + +if [[ $stage -le 0 ]]; then + # echo "Create wsj0-mix files and start again from stage 1"; exit 1 + mkdir -p $dumpdir/kinect_wsj/code + git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code +fi + +if [[ $stage -le 1 ]]; then + # mkdir -p $dumpdir/kinect_wsj/code + # git clone https://github.com/sunits/Reverberated_WSJ_2MIX.git $dumpdir/kinect_wsj/code + cd $dumpdir/kinect_wsj/code + echo "Stage 1: create_corrupted_speech" + ./create_corrupted_speech.sh --stage 0 --wsj_data_path $wsj0_wav_dir \ + --chime5_wav_base $chime_path \ + --dihard_sad_label_path $dihard_path --dest $kinect_wsj_path +fi + +if [[ $stage -le 2 ]]; then + # Make json directories with min/max modes and sampling rates + echo "Stage 2: Generating json files including wav path and duration" + for sr_string in 16; do + for mode_option in max; do + #for mode_option in min; do + for tmp_nsrc in 2; do + tmp_dumpdir=data/${tmp_nsrc}speakers/wav${sr_string}k/$mode_option + echo "Generating json files in $tmp_dumpdir" + [[ ! 
-d $tmp_dumpdir ]] && mkdir -p $tmp_dumpdir + #local_kinect_dir=$kinect_wsj_path/wav${sr_string}k/$mode_option/ + local_kinect_dir=$kinect_wsj_path/2speakers_reverb_kinect_chime_noise_corrected/wav${sr_string}k/$mode_option/ + $python_path local/preprocess_kinect_wsj.py --in_dir $local_kinect_dir --n_src $tmp_nsrc \ + --out_dir $tmp_dumpdir + done + done + done +fi + +# Generate a random ID for the run if no tag is specified +uuid=$($python_path -c 'import uuid, sys; print(str(uuid.uuid4())[:8])') +if [[ -z ${tag} ]]; then + tag=${uuid} +fi + +expdir=exp/train_sudormrf_${tag} +mkdir -p $expdir && echo $uuid >>$expdir/run_uuid.txt +echo "Results from the following experiment will be stored in $expdir" + +if [[ $stage -le 3 ]]; then + echo "Stage 3: Training" + mkdir -p logs + CUDA_VISIBLE_DEVICES=$id $python_path train.py --exp_dir $expdir \ + --epochs $epochs \ + --batch_size $batch_size \ + --num_workers $num_workers \ + --half_lr $half_lr \ + --early_stop $early_stop \ + --optimizer $optimizer \ + --lr $lr \ + --weight_decay $weight_decay \ + --train_dir $train_dir \ + --valid_dir $valid_dir \ + --sample_rate $sample_rate \ + --n_src $n_src | tee logs/train_${tag}.log + cp logs/train_${tag}.log $expdir/train.log + + # Get ready to publish + mkdir -p $expdir/publish_dir + echo "kinect_wsj/SuDORMRF" >$expdir/publish_dir/recipe_name.txt +fi + +if [[ $stage -le 4 ]]; then + echo "Stage 4 : Evaluation" + + $python_path eval.py \ + --exp_dir $expdir \ + --test_dir $test_dir \ + --out_dir $out_dir \ + --use_gpu $eval_use_gpu | tee logs/eval_${tag}.log + + cp logs/eval_${tag}.log $expdir/eval.log +fi diff --git a/egs/kinect-wsj/SuDORMRFNet/train.py b/egs/kinect-wsj/SuDORMRFNet/train.py new file mode 100644 index 000000000..93483d254 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/train.py @@ -0,0 +1,124 @@ +import os +import argparse +import json + +import torch +from torch.optim.lr_scheduler import ReduceLROnPlateau +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping + +from asteroid.models import SuDORMRFNet +from asteroid.engine.optimizers import make_optimizer +from asteroid.engine.system_kinect_wsj import System +from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr +from asteroid.data.kinect_wsj import make_dataloaders + +# Keys which are not in the conf.yml file can be added here. +# In the hierarchical dictionary created when parsing, the key `key` can be +# found at dic['main_args'][key] + +# By default train.py will use all available GPUs. The `id` option in run.sh +# will limit the number of available GPUs for train.py . +parser = argparse.ArgumentParser() +parser.add_argument("--exp_dir", + default="exp/tmp", + help="Full path to save best validation model") + + +def main(conf): + conf["masknet"].update({"n_src": conf["data"]["n_src"]}) + train_loader, val_loader = make_dataloaders(**conf["data"], + **conf["training"]) + model = SuDORMRFNet(**conf["filterbank"], + **conf["masknet"], + sample_rate=conf["data"]["sample_rate"]) + optimizer = make_optimizer(model.parameters(), **conf["optim"]) + # Define scheduler + scheduler = None + if conf["training"]["half_lr"]: + scheduler = ReduceLROnPlateau(optimizer=optimizer, + factor=0.5, + patience=5) + # Just after instantiating, save the args. Easy loading in the future. 
+ exp_dir = conf["main_args"]["exp_dir"] + os.makedirs(exp_dir, exist_ok=True) + conf_path = os.path.join(exp_dir, "conf.yml") + with open(conf_path, "w") as outfile: + yaml.safe_dump(conf, outfile) + + # Define Loss function. + loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") + system = System( + model=model, + loss_func=loss_func, + optimizer=optimizer, + train_loader=train_loader, + val_loader=val_loader, + scheduler=scheduler, + config=conf, + ) + + # Define callbacks + callbacks = [] + checkpoint_dir = os.path.join(exp_dir, "checkpoints/") + checkpoint = ModelCheckpoint(checkpoint_dir, + monitor="val_loss", + mode="min", + save_top_k=5, + verbose=True) + callbacks.append(checkpoint) + if conf["training"]["early_stop"]: + callbacks.append( + EarlyStopping(monitor="val_loss", + mode="min", + patience=30, + verbose=True)) + + # Don't ask GPU if they are not available. + gpus = -1 if torch.cuda.is_available() else None + distributed_backend = "ddp" if torch.cuda.is_available() else None + + trainer = pl.Trainer( + max_epochs=conf["training"]["epochs"], + callbacks=callbacks, + default_root_dir=exp_dir, + gpus=gpus, + distributed_backend=distributed_backend, + limit_train_batches=1.0, # Useful for fast experiment + gradient_clip_val=5.0, + ) + trainer.fit(system) + + best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: + json.dump(best_k, f, indent=0) + + state_dict = torch.load(checkpoint.best_model_path) + system.load_state_dict(state_dict=state_dict["state_dict"]) + system.cpu() + + to_save = system.model.serialize() + #to_save.update(train_set.get_infos()) + torch.save(to_save, os.path.join(exp_dir, "best_model.pth")) + + +if __name__ == "__main__": + import yaml + from pprint import pprint + from asteroid.utils import prepare_parser_from_dict, parse_args_as_dict + + # We start with opening the config file conf.yml as a dictionary from + # which we can create parsers. Each top level key in the dictionary defined + # by the YAML file creates a group in the parser. + with open("local/conf.yml") as f: + def_conf = yaml.safe_load(f) + parser = prepare_parser_from_dict(def_conf, parser=parser) + # Arguments are then parsed into a hierarchical dictionary (instead of + # flat, as returned by argparse) to facilitate calls to the different + # asteroid methods (see in main). + # plain_args is the direct output of parser.parse_args() and contains all + # the attributes in an non-hierarchical structure. It can be useful to also + # have it so we included it here but it is not used. + arg_dic, plain_args = parse_args_as_dict(parser, return_plain_args=True) + pprint(arg_dic) + main(arg_dic) diff --git a/egs/kinect-wsj/SuDORMRFNet/utils/parse_options.sh b/egs/kinect-wsj/SuDORMRFNet/utils/parse_options.sh new file mode 100755 index 000000000..c2c3b31f2 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/utils/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### Now we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. + if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. 
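As in the other recipes, eval.py here instantiates PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") only to reorder the estimates: with return_est=True the wrapper also returns the estimates permuted into the order that best matches the references, so that s0_estimate.wav is written alongside the matching s0.wav. Below is a minimal sketch of that behaviour on synthetic tensors (the shapes and noise level are arbitrary choices for illustration).

```python
# Minimal sketch: using the PIT wrapper to reorder estimated sources (synthetic data).
import torch
from asteroid.losses import PITLossWrapper, pairwise_neg_sisdr

loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx")

references = torch.randn(1, 2, 16000)           # (batch, n_src, time)
estimates = references[:, [1, 0], :].clone()    # near-perfect estimates, wrong order
estimates = estimates + 0.01 * torch.randn_like(estimates)

loss, reordered = loss_func(estimates, references, return_est=True)
# `reordered` is `estimates` permuted to best match `references`,
# so estimate i lines up with reference i again.
print(loss.item(), torch.allclose(reordered, references, atol=0.1))
```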
\ No newline at end of file diff --git a/egs/kinect-wsj/SuDORMRFNet/utils/prepare_python_env.sh b/egs/kinect-wsj/SuDORMRFNet/utils/prepare_python_env.sh new file mode 100755 index 000000000..3dc223334 --- /dev/null +++ b/egs/kinect-wsj/SuDORMRFNet/utils/prepare_python_env.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Usage ./utils/install_env.sh --install_dir A --asteroid_root B --pip_requires C +install_dir=~ +asteroid_root=../../../../ +pip_requires=../../../requirements.txt # Expects a requirement.txt + +. utils/parse_options.sh || exit 1 + +mkdir -p $install_dir +cd $install_dir +echo "Download and install latest version of miniconda3 into ${install_dir}" +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +bash Miniconda3-latest-Linux-x86_64.sh -b -p miniconda3 +pip_path=$PWD/miniconda3/bin/pip + +rm Miniconda3-latest-Linux-x86_64.sh +cd - + +if [[ ! -z ${pip_requires} ]]; then + $pip_path install -r $pip_requires +fi +$pip_path install soundfile +$pip_path install -e $asteroid_root +#$pip_path install ${asteroid_root}/\[""evaluate""\] +echo -e "\nAsteroid has been installed in editable mode. Feel free to apply your changes !" \ No newline at end of file
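For reference, local/preprocess_kinect_wsj.py (shipped identically in the SuDORMRF recipes above) writes one JSON manifest per subset and condition (mix.json, s1.json, s2.json under tr/, cv/ and tt/), each holding [wav_path, num_samples] pairs taken from soundfile. The snippet below illustrates the assumed layout with made-up paths and shows how such a manifest can be read back; it is only an illustration of the format, not part of the recipe.

```python
# Illustration of the manifest format written by local/preprocess_kinect_wsj.py
# (paths and lengths below are made up).
import json

example = [
    ["/path/to/data/2speakers/wav16k/max/tr/mix/utt0001.wav", 103680],
    ["/path/to/data/2speakers/wav16k/max/tr/mix/utt0002.wav", 98304],
]
with open("mix.json", "w") as f:
    json.dump(example, f, indent=4)

with open("mix.json") as f:
    file_infos = json.load(f)
for wav_path, num_samples in file_infos:
    # At the 16 kHz sample rate used by these recipes, num_samples / 16000 is seconds.
    print(wav_path, round(num_samples / 16000, 2), "seconds")
```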