add duration prediction

ryota-komatsu · ryota-komatsu · commit a3b727199a89 · 2025-06-01T19:13:42.000+09:00
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Python](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org)
+[![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ryota-komatsu/speech_resynth/blob/main/demo.ipynb)
 [![model](https://img.shields.io/badge/%F0%9F%A4%97-Models-blue)](https://huggingface.co/ryota-komatsu/flow_matching_with_bigvgan)
 [![dataset](https://img.shields.io/badge/%F0%9F%A4%97-Datasets-blue)](https://huggingface.co/datasets/ryota-komatsu/LibriTTS-R-whisper-large-v3-4096units)
 
@@ -48,7 +49,7 @@ input_features = feature_extractor(
 ).input_features.to("cuda")
 
 # encode a waveform into pseudo-phonetic units
-units = encoder(input_features, out_layer=15)
+units = encoder.encode(input_features)
 units = units.unsqueeze(0) + 1  # 0: pad
 
 # resynthesis
@@ -90,7 +91,7 @@ input_features = feature_extractor(
 ).input_features.to("cuda")
 
 # encode a waveform into pseudo-phonetic units
-units = encoder(input_features, out_layer=15).tolist()
+units = encoder.encode(input_features).tolist()
 unicodes = convert_units_to_unicode(units)
 
 # BPE
@@ -105,7 +106,7 @@ logits = model(input_ids=input_ids).logits
 
 Visit [demo page](https://ryota-komatsu.github.io/speech_resynth) for speech samples.
 
-Jupyter notebook demo is found [here](demo.ipynb).
+A Google Colab demo is available [here](https://colab.research.google.com/github/ryota-komatsu/speech_resynth/blob/main/demo.ipynb).
 
 ## Data Preparation
 
@@ -136,6 +137,10 @@ To run only a specific stage, pass it as an argument.
 
 Supported processing stages
 1. resample
+1. extract_features  # can be skipped when using a pretrained BigVGAN
+1. train_bigvgan  # can be skipped when using a pretrained BigVGAN
+1. train_tokenizer  # can be skipped when using a pretrained tokenizer
+1. tokenize_dataset  # can be skipped when using a Hugging Face dataset
 1. train_flow_matching
 1. synthesize
 
diff --git a/configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml b/configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml
@@ -5,6 +5,7 @@ dataset:
   name: "ryota-komatsu/LibriTTS-R-whisper-large-v3-4096units"  # https://huggingface.co/datasets
   wav_dir: "data/LibriTTS_R_16k" # ${root}/train-clean-100, train-clean-360, ...
   wav_dir_orig: "data/LibriTTS_R"  # if wav_dir == wav_dir_orig, original wav files are overwritten with 16 kHz waveforms
+  spectrogram_dir: "data/LibriTTS_R_16k/spectrogram"  # mel spectrograms for BigVGAN training (about 34GB)
   vad: false
 
   ext_audio: ".wav"
@@ -16,9 +17,6 @@ synthesis:
   split: "test-*"
   ext_audio: ${dataset.ext_audio}
 
-eval:
-  result_path: "results/resynth/score.csv"
-
 flow_matching:
   path: "models/flow_matching"
   batch_size: 2700 # work with single 24GB VRAM GPU
@@ -36,9 +34,8 @@ flow_matching:
   dt: 0.1
   truncation_value: null  # truncation trick (https://arxiv.org/abs/1809.11096)
 
-  vocab_size: ${tokenizer.vocab_size}
-
   # src.flow_matching.configs.ConditionalFlowMatchingConfig
+  vocab_size: ${tokenizer.vocab_size}
   dim_in: 80
   dim_cond_emb: 1280
   hidden_size: 256
@@ -105,9 +102,10 @@ vocoder:
   validation_interval: 10000
 
 tokenizer:
+  base: "openai/whisper-large-v3"
   name: "ryota-komatsu/whisper-large-v3-tokenizer"
   vocab_size: 4096
-  out_layer: 15
+  encoder_layers: 16
 
 flow_matching_with_vocoder:
   name: "ryota-komatsu/flow_matching_with_bigvgan"
diff --git a/demo.ipynb b/demo.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Textless Speech Resynthesis using Conditional Flow Matching and HuBERT units"
+    "# Speech Resynthesis Using Conditional Flow Matching and Whisper Units"
    ]
   },
   {
@@ -13,7 +13,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -r requirements.txt"
+    "!pip install datasets==3.6.0 \\\n",
+    "    gcsfs==2025.3.0 \\\n",
+    "    nvidia-cublas-cu12==12.4.5.8 \\\n",
+    "    nvidia-cuda-cupti-cu12==12.4.127 \\\n",
+    "    nvidia-cuda-nvrtc-cu12==12.4.127 \\\n",
+    "    nvidia-cuda-runtime-cu12==12.4.127 \\\n",
+    "    nvidia-cudnn-cu12==9.1.0.70 \\\n",
+    "    nvidia-cufft-cu12==11.2.1.3 \\\n",
+    "    nvidia-curand-cu12==10.3.5.147 \\\n",
+    "    nvidia-cusolver-cu12==11.6.1.9 \\\n",
+    "    nvidia-cusparse-cu12==12.3.1.170 \\\n",
+    "    nvidia-nvjitlink-cu12==12.4.127 \\\n",
+    "    einx"
    ]
   },
   {
@@ -111,7 +123,7 @@
     "    padding=\"do_not_pad\",\n",
     ").input_features.to(\"cuda\")\n",
     "\n",
-    "units = encoder(input_features, out_layer=15)\n",
+    "units = encoder.encode(input_features)\n",
     "units = units.unsqueeze(0) + 1  # 0: pad"
    ]
   },
diff --git a/main_resynth.py b/main_resynth.py
@@ -2,16 +2,29 @@
 from omegaconf import OmegaConf
 
 from src.bigvgan.train import train_bigvgan
-from src.flow_matching.preprocess import resample
+from src.flow_matching.preprocess import extract_features, resample
 from src.flow_matching.synthesize import synthesize
 from src.flow_matching.train import train_flow_matching
+from src.flow_matching.utils.whisper import tokenize_dataset, train_tokenizer
 
 
 class TaskRunner:
     def resample(self, config: str = "configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml"):
         config = OmegaConf.load(config)
         resample(config)
 
+    def extract_features(self, config: str = "configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml"):
+        config = OmegaConf.load(config)
+        extract_features(config)
+
+    def train_tokenizer(self, config: str = "configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml"):
+        config = OmegaConf.load(config)
+        train_tokenizer(config)
+
+    def tokenize_dataset(self, config: str = "configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml"):
+        config = OmegaConf.load(config)
+        tokenize_dataset(config)
+
     def train_bigvgan(self, config: str = "configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml"):
         config = OmegaConf.load(config)
         train_bigvgan(config)
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
 accelerate==1.6.0
-backoff
 datasets
 einops==0.8.1
 einx==0.3.0
@@ -11,11 +10,9 @@ jiwer @ git+https://github.com/jitsi/jiwer.git@c1b0d5e005431f5ce4fa6797f48639a8c
 lightning==2.5.1
 matplotlib==3.8.4
 numpy==1.22.0
-peft
 scikit-learn==1.4.2
 tensorboard==2.17.0
 torch==2.5.1+cu121
 torchaudio==2.5.1+cu121
-torchvision==0.20.1+cu121
 transformers==4.51.2
 zerospeech-benchmarks==0.9.4
diff --git a/src/flow_matching/configs.py b/src/flow_matching/configs.py
@@ -8,13 +8,13 @@
 class ConditionalFlowMatchingConfig(PretrainedConfig):
     def __init__(
         self,
-        vocab_size: int = 2000,
+        vocab_size: int = 4096,
         dim_in: int = 80,
         dim_cond_emb: int = 768,
         hidden_size: int = 256,
         depth: int = 4,
         heads: int = 2,
-        intermediate_size: int = 896,
+        intermediate_size: int = 768,
         attn_dropout: float = 0.0,
         ff_dropout: float = 0.0,
         use_unet_skip_connection: bool = False,
diff --git a/src/flow_matching/data.py b/src/flow_matching/data.py
@@ -109,7 +109,11 @@ def get_collate_fn(
     wav_dir: Optional[str] = None,
     frames_per_seg: Optional[int] = None,
     ext_audio: str = ".wav",
+    predict_duration: bool = False,
 ):
+    if predict_duration:
+        assert frames_per_seg is None
+
     def parse_item(item: Dict[str, Any]):
         input_ids = item["units"] + 1  # 0: pad
         spectrogram_labels = item["spectrogram"]
@@ -122,6 +126,11 @@ def parse_item(item: Dict[str, Any]):
             wav, sr = torchaudio.load(wav_path)
             wav = wav.squeeze(0)
 
+        if predict_duration:
+            input_ids, durations = torch.unique_consecutive(input_ids, return_counts=True)
+        else:
+            durations = torch.ones_like(input_ids)
+
         if frames_per_seg is not None:
             diff = len(input_ids) - frames_per_seg
 
@@ -130,30 +139,34 @@ def parse_item(item: Dict[str, Any]):
                 input_ids = input_ids[start : start + frames_per_seg]
                 spectrogram_labels = spectrogram_labels[start : start + frames_per_seg]
 
-        return input_ids, spectrogram_labels, transcript, id, wav
+        return input_ids, spectrogram_labels, durations, transcript, id, wav
 
     def collate_fn(batch):
         input_ids = []
         spectrogram_labels = []
+        duration_labels = []
         transcripts = []
         names = []
         input_values = []
 
         for item in batch:
-            units, spectrogram, transcript, id, wav = parse_item(item)
+            units, spectrogram, durations, transcript, id, wav = parse_item(item)
             input_ids.append(units)
             spectrogram_labels.append(spectrogram)
+            duration_labels.append(durations)
             transcripts.append(transcript)
             names.append(id)
             input_values.append(wav)
 
         input_ids = pad_sequence(input_ids, batch_first=True)
         spectrogram_labels = pad_sequence(spectrogram_labels, batch_first=True, padding_value=-100)
+        duration_labels = pad_sequence(duration_labels, batch_first=True)
         input_values = pad_sequence(input_values, batch_first=True)
 
         return {
             "input_ids": input_ids,
             "spectrogram_labels": spectrogram_labels,
+            "duration_labels": duration_labels,
             "transcripts": transcripts,
             "names": names,
             "input_values": input_values,
diff --git a/src/flow_matching/preprocess.py b/src/flow_matching/preprocess.py
@@ -5,6 +5,8 @@
 import torchaudio
 from tqdm import tqdm
 
+from ..bigvgan.data import mel_spectrogram
+
 
 def resample(config):
     wav_dir_orig = Path(config.dataset.wav_dir_orig)
@@ -27,3 +29,27 @@ def resample(config):
         wav_path.parent.mkdir(parents=True, exist_ok=True)
         wav_path = str(wav_path)  # for sox backend
         torchaudio.save(wav_path, wav, 16000)
+
+
+def extract_features(config):
+    wav_dir = Path(config.dataset.wav_dir)
+    spectrogram_dir = Path(config.dataset.spectrogram_dir)
+    wav_paths = list(wav_dir.glob("**/*" + config.dataset.ext_audio))
+
+    for wav_path in tqdm(wav_paths):
+        wav_name = wav_path.relative_to(wav_dir).with_suffix("")
+        spectrogram_path = spectrogram_dir / wav_name.with_suffix(".pt")
+        if spectrogram_path.is_file():
+            continue
+        spectrogram_path.parent.mkdir(parents=True, exist_ok=True)
+
+        wav_path = str(wav_path)
+        wav, sr = torchaudio.load(wav_path)
+        wav = wav.cuda()
+        wav = wav / wav.abs().max() * 0.95
+
+        spectrogram_labels = mel_spectrogram(wav)  # (1, 80, len)
+        spectrogram_labels = spectrogram_labels.transpose(1, 2)  # (1, len, 80)
+        spectrogram_labels = spectrogram_labels.cpu()
+
+        torch.save(spectrogram_labels, spectrogram_path)
diff --git a/src/flow_matching/synthesize.py b/src/flow_matching/synthesize.py
@@ -39,7 +39,7 @@ def synthesize(config):
                 device="cuda",
                 padding="do_not_pad",
             ).input_features.to("cuda")
-            units = encoder(input_features, out_layer=config.tokenizer.out_layer)
+            units = encoder.encode(input_features)
             units = units + 1  # 0: pad
             input_ids.append(units)
 
diff --git a/src/flow_matching/train.py b/src/flow_matching/train.py
@@ -112,8 +112,9 @@ def validate(config, dataloader, model: ConditionalFlowMatchingModel, step: int,
 def train_flow_matching(config):
     fix_random_seed(config.common.seed)
 
-    train_set = load_dataset(config.dataset.name, split="train").with_format("torch")
-    dev_set = load_dataset(config.dataset.name, split="dev").with_format("torch")
+    train_set = load_dataset(config.dataset.name, split="train", keep_in_memory=True).with_format("torch")
+    dev_set = load_dataset(config.dataset.name, split="dev", keep_in_memory=True).with_format("torch")
+
     train_loader = torch.utils.data.DataLoader(
         train_set,
         batch_size=config.flow_matching.batch_size,
@@ -122,6 +123,7 @@ def train_flow_matching(config):
         collate_fn=get_collate_fn(
             frames_per_seg=config.flow_matching.frames_per_seg,
             ext_audio=config.dataset.ext_audio,
+            predict_duration=config.flow_matching.predict_duration,
         ),
     )
     dev_loader = torch.utils.data.DataLoader(
@@ -131,6 +133,7 @@ def train_flow_matching(config):
             wav_dir=config.dataset.wav_dir,
             frames_per_seg=config.flow_matching.frames_per_seg,
             ext_audio=config.dataset.ext_audio,
+            predict_duration=config.flow_matching.predict_duration,
         ),
     )
 
@@ -189,6 +192,7 @@ def train_flow_matching(config):
                 loss = model(
                     input_ids=batch["input_ids"].cuda(),
                     spectrogram_labels=batch["spectrogram_labels"].cuda(),
+                    duration_labels=batch["duration_labels"].cuda(),
                 )
             scaler.scale(loss).backward()
 
diff --git a/src/flow_matching/utils/whisper.py b/src/flow_matching/utils/whisper.py
diff --git a/src/speechlm/tokenize.py b/src/speechlm/tokenize.py