
Commit 6e9d34f

Support Multi-speaker VITS (#131)
Support Multi-speaker VITS & Hi-Fi TTS dataset preprocessing
1 parent d37d8f1 commit 6e9d34f

14 files changed: +191, -48 lines

Diff for: bins/tts/preprocess.py (+2, -2)

@@ -88,11 +88,11 @@ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         with open(dataset_file, "r") as f:
             metadata.extend(json.load(f))
-    phone_extractor.extract_utt_phone_sequence(cfg, metadata)
+    phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata)


 def preprocess(cfg, args):
-    """Proprocess raw data of single or multiple datasets (in cfg.dataset)
+    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

     Args:
         cfg (dict): dictionary that stores configurations

Diff for: config/tts.json (+1)

@@ -16,6 +16,7 @@
         // Directory names of processed data or extracted features
         "phone_dir": "phones",
         "use_phone": true,
+        "add_blank": true
     },
     "model": {
         "text_token_num": 512,

Diff for: egs/datasets/README.md (+31)

@@ -6,6 +6,7 @@ Amphion support the following academic datasets (sort alphabetically):
 - [AudioCaps](#audiocaps)
 - [CSD](#csd)
 - [CustomSVCDataset](#customsvcdataset)
+- [Hi-Fi TTS](#hifitts)
 - [KiSing](#kising)
 - [LibriLight](#librilight)
 - [LibriTTS](#libritts)
@@ -75,6 +76,36 @@ We support custom dataset for Singing Voice Conversion. Organize your data in th
 ┣ ...
 ```
 
+
+## Hi-Fi TTS
+
+Download the official Hi-Fi TTS dataset [here](https://www.openslr.org/109/). The file structure looks like below:
+
+```plaintext
+[Hi-Fi TTS dataset path]
+┣ audio
+┃ ┣ 11614_other {Speaker_ID}_{SNR_subset}
+┃ ┃ ┣ 10547 {Book_ID}
+┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0001.flac
+┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0003.flac
+┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0004.flac
+┃ ┃ ┃ ┣ ...
+┃ ┃ ┣ ...
+┃ ┣ ...
+┣ 92_manifest_clean_dev.json
+┣ 92_manifest_clean_test.json
+┣ 92_manifest_clean_train.json
+┣ ...
+┣ {Speaker_ID}_manifest_{SNR_subset}_{dataset_split}.json
+┣ ...
+┣ books_bandwidth.tsv
+┣ LICENSE.txt
+┣ readers_books_clean.txt
+┣ readers_books_other.txt
+┣ README.txt
+
+```
+
 ## KiSing
 
 Download the official KiSing dataset [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure looks like below:
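The manifest naming convention documented above, `{Speaker_ID}_manifest_{SNR_subset}_{dataset_split}.json`, is enough to enumerate speakers and splits before preprocessing. A minimal sketch (not part of this commit), assuming only the `clean`/`other` subsets and `train`/`dev`/`test` splits shown in the structure above:

```python
import os
import re

# Matches e.g. "92_manifest_clean_train.json" -> speaker 92, subset clean, split train.
MANIFEST_RE = re.compile(
    r"^(?P<speaker>\d+)_manifest_(?P<snr>clean|other)_(?P<split>train|dev|test)\.json$"
)


def list_manifests(dataset_root):
    """Yield (speaker_id, snr_subset, split, path) for each manifest in the dataset root."""
    for name in sorted(os.listdir(dataset_root)):
        match = MANIFEST_RE.match(name)
        if match:
            yield match["speaker"], match["snr"], match["split"], os.path.join(dataset_root, name)


# Example: speakers = sorted({spk for spk, _, _, _ in list_manifests("[Hi-Fi TTS dataset path]")})
```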

Diff for: egs/tts/VITS/README.md (+58, -21)

@@ -3,7 +3,7 @@
 [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech)
 [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech)
 
-In this recipe, we will show how to train [VITS](https://arxiv.org/abs/2106.06103) using Amphion's infrastructure. VITS is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning.
+In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning.
 
 There are four stages in total:
 
@@ -20,7 +20,7 @@ There are four stages in total:
 ## 1. Data Preparation
 
 ### Dataset Download
-You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md).
+You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, Hi-Fi TTS, LibriTTS, etc. We strongly recommend using LJSpeech to train single-speaker TTS model for the first time. While for training multi-speaker TTS model for the first time, we would recommend using Hi-Fi TTS. The process of downloading dataset has been detailed [here](../../datasets/README.md).
 
 ### Configuration
 
@@ -29,32 +29,41 @@ After downloading the dataset, you can set the dataset paths in `exp_config.jso
 ```json
 "dataset": [
     "LJSpeech",
+    //"hifitts"
 ],
 "dataset_path": {
     // TODO: Fill in your dataset path
     "LJSpeech": "[LJSpeech dataset path]",
+    //"hifitts": "[Hi-Fi TTS dataset path]
 },
 ```
 
 ## 2. Features Extraction
 
 ### Configuration
 
-Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`:
+In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, and specify the `processed_dir` for saving processed data. For preprocessing the multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`:
 
 ```json
 // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
 "log_dir": "ckpts/tts",
 "preprocess": {
+    //"extract_audio": true,
+    "use_phone": true,
+    // linguistic features
+    "extract_phone": true,
+    "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
     // TODO: Fill in the output data path. The default value is "Amphion/data"
     "processed_dir": "data",
-    ...
+    "sample_rate": 22050, //target sampling rate
+    "valid_file": "valid.json", //validation set
+    //"use_spkid": true, //use speaker ID to train multi-speaker TTS model
 },
 ```
 
 ### Run
 
-Run the `run.sh` as the preproces stage (set `--stage 1`):
+Run the `run.sh` as the preprocess stage (set `--stage 1`):
 
 ```bash
 sh egs/tts/VITS/run.sh --stage 1
@@ -66,17 +75,22 @@ sh egs/tts/VITS/run.sh --stage 1
 
 ### Configuration
 
-We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
+We provide the default hyparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
+For training the multi-speaker TTS model, specify the `n_speakers` value to be greater (used for new speaker fine-tuning) than or equal to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`.
 
-```
-"train": {
-    "batch_size": 16,
-}
+```json
+"model": {
+    //"n_speakers": 10 //Number of speakers in the dataset(s) used. The default value is 0 if not specified.
+},
+"train": {
+    "batch_size": 16,
+    //"multi_speaker_training": true,
+}
 ```
 
 ### Train From Scratch
 
-Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
+Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
 
 ```bash
 sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName]
@@ -139,12 +153,35 @@ For inference, you need to specify the following configurations when running `ru
 | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
 | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
 | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
-| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. |
-| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. |
+| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.<br> For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. |
+| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from the test set as template testing set.<br>For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. |
 | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
+| `--infer_speaker_name` | The target speaker's voice is to be synthesized.<br> (***Note: only applicable to multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`". <br> You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. |
 
 ### Run
-For example, if you want to generate speech of all testing set split from LJSpeech, just run:
+#### Single text inference:
+For the single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run:
+
+```bash
+sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
+    --infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model."
+```
+
+For the multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run:
+```bash
+sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
+    --infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model." \
+    --infer_speaker_name "hifitts_92"
+```
+
+#### Batch inference:
+For the single-speaker TTS model, if you want to generate speech of all testing sets split from LJSpeech, just run:
 
 ```bash
 sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
@@ -154,18 +191,18 @@ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
     --infer_dataset "LJSpeech" \
     --infer_testing_set "test"
 ```
-
-Or, if you want to generate a single clip of speech from a given text, just run:
-
+For the multi-speaker TTS model, if you want to generate speech of all testing sets split from Hi-Fi TTS, the same procedure follows from above, with ```LJSpeech``` replaced by ```hifitts```.
 ```bash
 sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
-   --infer_mode "single" \
-   --infer_text "This is a clip of generated speech with the given text from a TTS model."
+   --infer_mode "batch" \
+   --infer_dataset "hifitts" \
+   --infer_testing_set "test"
 ```
 
-We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction.
+
+We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. Meanwhile, the pre-trained multi-speaker VITS model trained on Hi-Fi TTS will be released soon. Stay tuned.
 
 
 ```bibtex
@@ -176,4 +213,4 @@ We released a pre-trained Amphion VITS model trained on LJSpeech. So you can dow
   pages={5530--5540},
   year={2021},
 }
-```
+```
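The `--infer_speaker_name` values listed in the table above are resolved through the `spk2id.json` file written under `log_dir/[YourExptName]`. A hedged sketch of that lookup, assuming `spk2id.json` is a flat JSON object mapping speaker names to integer IDs (the exact schema is not shown in this commit):

```python
import json
import os


def resolve_speaker_id(log_dir, expt_name, speaker_name):
    """Map a speaker name such as 'hifitts_92' to the integer ID used by the model."""
    spk2id_path = os.path.join(log_dir, expt_name, "spk2id.json")
    with open(spk2id_path, "r") as f:
        spk2id = json.load(f)
    if speaker_name not in spk2id:
        raise KeyError(f"Unknown speaker {speaker_name!r}; available: {sorted(spk2id)}")
    return spk2id[speaker_name]


# Example: resolve_speaker_id("ckpts/tts", "[YourExptName]", "hifitts_92")
```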

Diff for: egs/tts/VITS/exp_config.json (+14, -7)

@@ -2,26 +2,33 @@
     "base_config": "config/vits.json",
     "model_type": "VITS",
     "dataset": [
-        "LJSpeech"
+        "LJSpeech",
+        //"hifitts"
     ],
     "dataset_path": {
         // TODO: Fill in your dataset path
-        "LJSpeech": "[LJSpeech dataset path]"
+        "LJSpeech": "[LJSpeech dataset path]",
+        //"hifitts": "[Hi-Fi TTS dataset path]
     },
     // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
     "log_dir": "ckpts/tts",
     "preprocess": {
+        //"extract_audio":true,
         "use_phone": true,
         // linguistic features
         "extract_phone": true,
-        "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
         // TODO: Fill in the output data path. The default value is "Amphion/data"
         "processed_dir": "data",
-
-        "sample_rate": 22050,
-        "valid_file": "test.json", // validattion set
+        "sample_rate": 22050, // target sampling rate
+        "valid_file": "valid.json", // validation set
+        //"use_spkid": true // use speaker ID to train multi-speaker TTS model
+    },
+    "model":{
+        //"n_speakers": 10 // number of speakers, greater than or equal to the number of speakers in the dataset(s) used. The default value is 0 if not specified.
    },
     "train": {
         "batch_size": 16,
+        //"multi_speaker_training": true
     }
-}
+}
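The guidance that `n_speakers` be greater than or equal to the number of speakers in the dataset(s) follows from how multi-speaker VITS models are usually conditioned: each speaker ID indexes one row of an embedding table, so IDs outside that range cannot be looked up. A minimal PyTorch sketch of the pattern (illustrative only, not Amphion's actual module):

```python
import torch
import torch.nn as nn


class SpeakerConditioning(nn.Module):
    """Illustrative speaker-embedding lookup; not Amphion's actual implementation."""

    def __init__(self, n_speakers: int, gin_channels: int = 256):
        super().__init__()
        # One row per speaker ID: any ID >= n_speakers would fail this lookup,
        # which is why n_speakers must cover every speaker in the dataset(s).
        self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, sid: torch.Tensor) -> torch.Tensor:
        # (batch,) integer speaker IDs -> (batch, gin_channels, 1) global condition
        return self.emb_g(sid).unsqueeze(-1)


if __name__ == "__main__":
    cond = SpeakerConditioning(n_speakers=10)
    print(cond(torch.tensor([0, 9])).shape)  # torch.Size([2, 256, 1])
```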

Diff for: egs/tts/VITS/run.sh (+16, -7)

@@ -18,7 +18,7 @@ cd $work_dir
 
 ######## Parse the Given Parameters from the Commond ###########
 # options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@")
-options=$(getopt -o c:n:s --long gpu:,config:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@")
+options=$(getopt -o c:n:s --long gpu:,config:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,infer_speaker_name:,name:,stage: -- "$@")
 eval set -- "$options"
 
 while true; do
@@ -43,14 +43,16 @@ while true; do
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
-    # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech.
+    # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech.
    --infer_mode) shift; infer_mode=$1 ; shift ;;
-    # [Only for Inference] The inference dataset. It is only used when the inference model is "batch".
+    # [Only for Inference] The inference dataset. It is only used when the inference mode is "batch".
    --infer_dataset) shift; infer_dataset=$1 ; shift ;;
-    # [Only for Inference] The inference testing set. It is only used when the inference model is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set.
+    # [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set.
    --infer_testing_set) shift; infer_testing_set=$1 ; shift ;;
-    # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single".
+    # [Only for Inference] The text to be synthesized from. It is only used when the inference mode is "single".
    --infer_text) shift; infer_text=$1 ; shift ;;
+    # [Only for Inference] The chosen speaker's voice to be synthesized. It is only used when the inference mode is "single" for multi-speaker VITS.
+    --infer_speaker_name) shift; infer_speaker_name=$1 ; shift ;;
 
    --) shift ; break ;;
    *) echo "Invalid option: $1" exit 1 ;;
@@ -67,7 +69,7 @@ fi
 if [ -z "$exp_config" ]; then
     exp_config="${exp_dir}"/exp_config.json
 fi
-echo "Exprimental Configuration File: $exp_config"
+echo "Experimental Configuration File: $exp_config"
 
 if [ -z "$gpu" ]; then
     gpu="0"
@@ -86,7 +88,7 @@ if [ $running_stage -eq 2 ]; then
        echo "[Error] Please specify the experiments name"
        exit 1
    fi
-    echo "Exprimental Name: $exp_name"
+    echo "Experimental Name: $exp_name"
 
    # add default value
    if [ -z "$resume_from_ckpt_path" ]; then
@@ -153,6 +155,12 @@ if [ $running_stage -eq 3 ]; then
    elif [ "$infer_mode" = "batch" ]; then
        infer_text=''
    fi
+
+    if [ -z "$infer_speaker_name" ]; then
+        infer_speaker_name=None
+    fi
+
+
 
 
    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
@@ -163,6 +171,7 @@ if [ $running_stage -eq 3 ]; then
        --dataset $infer_dataset \
        --testing_set $infer_testing_set \
        --text "$infer_text" \
+        --speaker_name $infer_speaker_name \
        --log_level debug
 
 
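When `--infer_speaker_name` is omitted, the script above passes the literal string `None` to `--speaker_name`. A hypothetical sketch of how the Python entry point could normalize that value; `bins/tts/inference.py` is not part of this diff, so the flag handling shown here is an assumption:

```python
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--speaker_name",
        type=str,
        default=None,
        help="Target speaker for multi-speaker VITS; single-speaker models ignore it.",
    )
    args = parser.parse_args()
    # run.sh substitutes the literal string "None" when the flag is not given,
    # so treat it the same as an absent value.
    if args.speaker_name in (None, "None"):
        args.speaker_name = None
    return args
```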

Diff for: models/tts/base/tts_dataset.py (+3)

@@ -209,6 +209,9 @@ def __init__(self, cfg, dataset, is_valid=False):
             phon_id_collator = phoneIDCollation(cfg, dataset=dataset)
             sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq)
 
+            if cfg.preprocess.add_blank:
+                sequence = intersperse(sequence, 0)
+
             self.utt2seq[utt] = sequence
 
     def __getitem__(self, index):
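The `add_blank` branch above calls an `intersperse` helper. A sketch of the conventional VITS-style implementation, which inserts a blank token (ID 0) between and around the phone IDs; Amphion's own utility may differ in detail:

```python
def intersperse(sequence, item=0):
    """Return a new list with `item` inserted between and around every element."""
    result = [item] * (len(sequence) * 2 + 1)
    result[1::2] = sequence
    return result


print(intersperse([5, 12, 7]))  # [0, 5, 0, 12, 0, 7, 0]
```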

Diff for: models/tts/base/tts_inferece.py (+11, -1)

@@ -12,6 +12,7 @@
 from tqdm import tqdm
 from accelerate.logging import get_logger
 from torch.utils.data import DataLoader
+from safetensors.torch import load_file
 
 
 from abc import abstractmethod
@@ -162,7 +163,16 @@ def _load_model(
         ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
         checkpoint_path = ls[0]
 
-        self.accelerator.load_state(str(checkpoint_path))
+        if (
+            Path(os.path.join(checkpoint_path, "model.safetensors")).exists()
+            and accelerate.__version__ < "0.25"
+        ):
+            self.model.load_state_dict(
+                load_file(os.path.join(checkpoint_path, "model.safetensors")),
+                strict=False,
+            )
+        else:
+            self.accelerator.load_state(str(checkpoint_path))
         return str(checkpoint_path)
 
     def inference(self):
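The fallback above compares `accelerate.__version__` against `"0.25"` as plain strings, which lexicographically happens to work for nearby two-digit minor versions but not in general. A hedged sketch of the same logic using a real version parse; `packaging` is an assumed extra import, not used by the original code:

```python
import os
from pathlib import Path

import accelerate
from packaging import version
from safetensors.torch import load_file


def load_checkpoint(model, accelerator, checkpoint_path):
    """Load weights, preferring model.safetensors directly on older accelerate versions."""
    safetensors_file = os.path.join(checkpoint_path, "model.safetensors")
    old_accelerate = version.parse(accelerate.__version__) < version.parse("0.25")
    if Path(safetensors_file).exists() and old_accelerate:
        # Mirrors the diff above: for accelerate < 0.25, read model.safetensors
        # directly instead of going through accelerator.load_state().
        model.load_state_dict(load_file(safetensors_file), strict=False)
    else:
        accelerator.load_state(str(checkpoint_path))
    return str(checkpoint_path)
```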
