Skip to content

Commit 1ebed0c

Browse files
authored
Release Vevo1.5 and Training Code of Vevo (#426)
1 parent f25ba32 commit 1ebed0c

File tree

69 files changed

+9598
-344
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

69 files changed

+9598
-344
lines changed

Diff for: README.md

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
In addition to the specific generation tasks, Amphion includes several **vocoders** and **evaluation metrics**. A vocoder is an important module for producing high-quality audio signals, while evaluation metrics are critical for ensuring consistent metrics in generation tasks. Moreover, Amphion is dedicated to advancing audio generation in real-world applications, such as building **large-scale datasets** for speech synthesis.
3535

3636
## 🚀 News
37+
- **2025/04/12**: We release [***Vevo1.5***](models/svc/vevosing/README.md), which extends Vevo and focuses on unified and controllable generation for both speech and singing voice. Vevo1.5 can be applied into a series of speech and singing voice generation tasks, including VC, TTS, AC, SVS, SVC, Speech/Singing Voice Editing, and more. [![blog](https://img.shields.io/badge/README-Blog-blue.svg)](https://veiled-army-9c5.notion.site/Vevo1-5-1d2ce17b49a280b5b444d3fa2300c93a)
3738
- **2025/02/26**: We release [***Metis***](https://github.com/open-mmlab/Amphion/tree/main/models/tts/metis), a foundation model for unified speech generation. The system supports zero-shot text-to-speech, voice conversion, target speaker extraction, speech enhancement, and lip-to-speech. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/pdf/2502.03128) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/metis)
3839
- **2025/02/26**: *The Emilia-Large dataset, featuring over 200,000 hours of data, is now available!!!* Emilia-Large combines the original 101k-hour Emilia dataset (licensed under `CC BY-NC 4.0`) with the brand-new 114k-hour **Emilia-YODAS dataset** (licensed under `CC BY 4.0`). Download at [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Dataset-yellow)](https://huggingface.co/datasets/amphion/Emilia-Dataset). Check details at [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2501.15907).
3940
- **2025/01/30**: We release [Amphion v0.2 Technical Report](https://arxiv.org/abs/2501.15442), which provides a comprehensive overview of the Amphion updates in 2024. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2501.15442)

Diff for: bins/codec/train.py

+9
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,19 @@
88
import torch
99

1010
from models.codec.facodec.facodec_trainer import FAcodecTrainer
11+
from models.codec.vevo.vqvae_trainer import (
12+
VQVAETrainer,
13+
)
14+
from models.codec.coco.rep_coco_trainer import RepCocoTrainer
1115

1216
from utils.util import load_config
1317

1418

1519
def build_trainer(args, cfg):
1620
supported_trainer = {
1721
"FAcodec": FAcodecTrainer,
22+
"RepCoco": RepCocoTrainer,
23+
"VQVAE": VQVAETrainer,
1824
}
1925

2026
trainer_class = supported_trainer[cfg.model_type]
@@ -50,6 +56,9 @@ def main():
5056
help="A specific name to note the experiment",
5157
required=True,
5258
)
59+
parser.add_argument(
60+
"--resume", action="store_true", help="The model name to restore"
61+
)
5362
parser.add_argument(
5463
"--resume_type",
5564
type=str,

Diff for: bins/svc/train.py

+36-18
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
1212
from models.svc.transformer.transformer_trainer import TransformerTrainer
1313
from models.svc.vits.vits_trainer import VitsSVCTrainer
14+
from models.svc.flow_matching_transformer.fmt_trainer import (
15+
FlowMatchingTransformerTrainer,
16+
)
17+
from models.svc.autoregressive_transformer.ar_trainer import (
18+
AutoregressiveTransformerTrainer,
19+
)
20+
1421
from utils.util import load_config
1522

1623

@@ -20,6 +27,8 @@ def build_trainer(args, cfg):
2027
"DiffComoSVC": ComoSVCTrainer,
2128
"TransformerSVC": TransformerTrainer,
2229
"VitsSVC": VitsSVCTrainer,
30+
"AutoregressiveTransformer": AutoregressiveTransformerTrainer,
31+
"FlowMatchingTransformer": FlowMatchingTransformerTrainer,
2332
}
2433

2534
trainer_class = supported_trainer[cfg.model_type]
@@ -79,24 +88,33 @@ def main():
7988
cfg = load_config(args.config)
8089

8190
# Data Augmentation
82-
if (
83-
type(cfg.preprocess.data_augment) == list
84-
and len(cfg.preprocess.data_augment) > 0
85-
):
86-
new_datasets_list = []
87-
for dataset in cfg.preprocess.data_augment:
88-
new_datasets = [
89-
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
90-
(
91-
f"{dataset}_formant_shift"
92-
if cfg.preprocess.use_formant_shift
93-
else None
94-
),
95-
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
96-
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
97-
]
98-
new_datasets_list.extend(filter(None, new_datasets))
99-
cfg.dataset.extend(new_datasets_list)
91+
if "data_augment" in cfg.preprocess:
92+
if (
93+
type(cfg.preprocess.data_augment) == list
94+
and len(cfg.preprocess.data_augment) > 0
95+
):
96+
new_datasets_list = []
97+
for dataset in cfg.preprocess.data_augment:
98+
new_datasets = [
99+
(
100+
f"{dataset}_pitch_shift"
101+
if cfg.preprocess.use_pitch_shift
102+
else None
103+
),
104+
(
105+
f"{dataset}_formant_shift"
106+
if cfg.preprocess.use_formant_shift
107+
else None
108+
),
109+
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
110+
(
111+
f"{dataset}_time_stretch"
112+
if cfg.preprocess.use_time_stretch
113+
else None
114+
),
115+
]
116+
new_datasets_list.extend(filter(None, new_datasets))
117+
cfg.dataset.extend(new_datasets_list)
100118

101119
# CUDA settings
102120
cuda_relevant()

Diff for: bins/vc/train.py

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright (c) 2023 Amphion.
2+
#
3+
# This source code is licensed under the MIT license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import argparse
7+
import torch
8+
9+
from models.vc.flow_matching_transformer.fmt_trainer import (
10+
FlowMatchingTransformerTrainer,
11+
)
12+
from models.vc.autoregressive_transformer.ar_trainer import (
13+
AutoregressiveTransformerTrainer,
14+
)
15+
16+
from utils.util import load_config
17+
18+
19+
def build_trainer(args, cfg):
    """Instantiate the VC trainer selected by ``cfg.model_type``.

    Args:
        args: Parsed command-line arguments, forwarded to the trainer.
        cfg: Experiment configuration; ``cfg.model_type`` picks the trainer.

    Returns:
        A constructed trainer instance ready for ``train_loop()``.

    Raises:
        KeyError: If ``cfg.model_type`` is not a supported trainer name.
    """
    trainer_registry = {
        "FlowMatchingTransformer": FlowMatchingTransformerTrainer,
        "AutoregressiveTransformer": AutoregressiveTransformerTrainer,
    }
    trainer_cls = trainer_registry[cfg.model_type]
    return trainer_cls(args, cfg)
28+
29+
30+
def cuda_relevant(deterministic=False):
    """Configure process-wide CUDA/cuDNN runtime flags.

    Releases cached GPU memory, enables TF32 kernels (Ampere and newer
    GPUs), and toggles deterministic execution.

    Args:
        deterministic (bool): When True, force deterministic algorithms and
            disable cuDNN benchmark autotuning; when False, favor speed.
    """
    # Return cached, unused GPU memory to the driver.
    torch.cuda.empty_cache()
    # Enable cuDNN and allow TF32 tensor-core math for matmul/convolutions.
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True
    # Reproducibility knobs: benchmark autotuning is the inverse of the
    # deterministic setting (autotuning picks kernels non-deterministically).
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
40+
41+
42+
def main():
    """CLI entry point: parse arguments, load the config, and run training.

    Expects ``--config`` (experiment JSON) and ``--exp_name`` (run label);
    optional flags control resuming, logging level, and dataloader seeding.
    """
    parser = argparse.ArgumentParser()
    # NOTE: required arguments carry no default — argparse never consults a
    # default when required=True, so the dead defaults were removed.
    parser.add_argument(
        "--config",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        help="A specific name to note the experiment",
        required=True,
    )
    # Fixed help text: --resume is a boolean flag, not a model name
    # (matches the wording used by bins/vocoder/train.py).
    parser.add_argument(
        "--resume",
        action="store_true",
        help="If specified, to resume from the existing checkpoint.",
    )
    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        default="resume",
        help="Resume training or finetuning.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Checkpoint for resume training or finetuning.",
    )
    parser.add_argument(
        "--dataloader_seed",
        type=int,
        default=1,
        help="Seed for dataloader",
    )

    args = parser.parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)
    # Keep CPU thread usage minimal; data loading parallelism is handled by
    # the dataloader workers, not intra-op threading.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    trainer.train_loop()


if __name__ == "__main__":
    main()

Diff for: bins/vocoder/train.py

+19-11
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
1111
from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer
12+
from models.vocoders.vocos.vocos_trainer import VocosTrainer
1213

1314
from utils.util import load_config
1415

@@ -17,6 +18,7 @@ def build_trainer(args, cfg):
1718
supported_trainer = {
1819
"GANVocoder": GANVocoderTrainer,
1920
"DiffusionVocoder": DiffusionVocoderTrainer,
21+
"Vocos": VocosTrainer,
2022
}
2123

2224
trainer_class = supported_trainer[cfg.model_type]
@@ -51,6 +53,11 @@ def main():
5153
help="A specific name to note the experiment",
5254
required=True,
5355
)
56+
parser.add_argument(
57+
"--resume",
58+
action="store_true",
59+
help="If specified, to resume from the existing checkpoint.",
60+
)
5461
parser.add_argument(
5562
"--resume_type",
5663
type=str,
@@ -68,17 +75,18 @@ def main():
6875
cfg = load_config(args.config)
6976

7077
# Data Augmentation
71-
if cfg.preprocess.data_augment:
72-
new_datasets_list = []
73-
for dataset in cfg.preprocess.data_augment:
74-
new_datasets = [
75-
# f"{dataset}_pitch_shift",
76-
# f"{dataset}_formant_shift",
77-
f"{dataset}_equalizer",
78-
f"{dataset}_time_stretch",
79-
]
80-
new_datasets_list.extend(new_datasets)
81-
cfg.dataset.extend(new_datasets_list)
78+
if "data_augment" in cfg.preprocess:
79+
if cfg.preprocess.data_augment:
80+
new_datasets_list = []
81+
for dataset in cfg.preprocess.data_augment:
82+
new_datasets = [
83+
# f"{dataset}_pitch_shift",
84+
# f"{dataset}_formant_shift",
85+
f"{dataset}_equalizer",
86+
f"{dataset}_time_stretch",
87+
]
88+
new_datasets_list.extend(new_datasets)
89+
cfg.dataset.extend(new_datasets_list)
8290

8391
# CUDA settings
8492
cuda_relevant()

Diff for: egs/codec/coco/contentstyle_fvq16384_12.5hz.json

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
{
2+
"model_type": "RepCoco",
3+
"dataset": {
4+
"emilia": 1, // 101k hours, 34m samples
5+
"singnet": 20 // 400 hours, 0.34m samples * 20 = 6.8m samples
6+
},
7+
"singnet_path": "[Please fill out your singing data path]/sing400.json",
8+
"preprocess": {
9+
"hop_size": 480,
10+
"sample_rate": 24000,
11+
"n_fft": 1920,
12+
"num_mels": 128,
13+
"win_size": 1920,
14+
"fmin": 0,
15+
"fmax": 12000,
16+
"mel_var": 8.14,
17+
"mel_mean": -4.92,
18+
"f0_fmin": 50.0,
19+
"f0_fmax": 1100.0,
20+
"load_chromagram": true
21+
},
22+
"model": {
23+
"coco": {
24+
"coco_type": "content_style", // content, style, or content_style
25+
"downsample_rate": 4, // The original frame rate is 50 Hz, downsample to 12.5 Hz
26+
"codebook_size": 16384,
27+
"hidden_size": 1024, // Representations Dim
28+
"codebook_dim": 8,
29+
"encoder": {
30+
"vocos_dim": 384,
31+
"vocos_intermediate_dim": 2048,
32+
"vocos_num_layers": 12,
33+
},
34+
"decoder": {
35+
"vocos_dim": 384,
36+
"vocos_intermediate_dim": 2048,
37+
"vocos_num_layers": 12,
38+
},
39+
"use_normed_whisper": true,
40+
"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
41+
"whisper_dim": 1024,
42+
"chromagram_dim": 24
43+
},
44+
"cond_sample_rate": 16000
45+
},
46+
"log_dir": "ckpts/coco",
47+
"train": {
48+
"max_epoch": 0,
49+
"use_dynamic_batchsize": true,
50+
"max_tokens": 18000,
51+
"max_sentences": 90,
52+
"lr_warmup_steps": 10000,
53+
"lr_scheduler": "constant",
54+
"num_train_steps": 1000000,
55+
"adam": {
56+
"lr": 1e-4,
57+
"betas": [
58+
0.5,
59+
0.9
60+
]
61+
},
62+
"ddp": false,
63+
"random_seed": 114,
64+
"batch_size": 32, // use batch_size if not use dynamic batchsize
65+
"epochs": 5000,
66+
"max_steps": 1000000,
67+
"total_training_steps": 800000,
68+
"save_summary_steps": 500,
69+
"save_checkpoints_steps": 1000,
70+
"save_checkpoints_backup_steps": 100000,
71+
"valid_interval": 2000,
72+
"keep_checkpoint_max": 100,
73+
"gradient_accumulation_step": 1,
74+
"tracker": [
75+
"tensorboard"
76+
],
77+
"save_checkpoint_stride": [
78+
1
79+
],
80+
"keep_last": [
81+
5
82+
],
83+
"run_eval": [
84+
true
85+
],
86+
"dataloader": {
87+
"num_worker": 8,
88+
"pin_memory": true
89+
},
90+
"use_emilia_dataset": true
91+
}
92+
}

Diff for: egs/codec/coco/contentstyle_fvq16384_12.5hz.sh

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
######## Build Experiment Environment ###########
# Directory containing this script, and the repository root
# (three levels up: egs/codec/coco -> repo root).
exp_dir=$(cd "$(dirname "$0")"; pwd)
work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")

export WORK_DIR="$work_dir"
export PYTHONPATH="$work_dir"
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/contentstyle_fvq16384_12.5hz.json"
exp_name="contentstyle_fvq16384_12.5hz"

####### Train Model ###########
# Single-GPU bf16 training launched through HuggingFace Accelerate.
CUDA_VISIBLE_DEVICES="0" accelerate launch --main_process_port 14557 --mixed_precision="bf16" \
    "${work_dir}"/bins/codec/train.py \
    --config=$exp_config \
    --exp_name=$exp_name \
    --log_level debug

0 commit comments

Comments
 (0)