ryota-komatsu
diff --git a/‎README.md‎
Lines changed: 22 additions & 26 deletions b/‎README.md‎
Lines changed: 22 additions & 26 deletions
diff --git a/‎configs/unit2speech/mhubert-expresso-2000-duration-prediction.yaml‎
Lines changed: 0 additions & 102 deletions b/‎configs/unit2speech/mhubert-expresso-2000-duration-prediction.yaml‎
Lines changed: 0 additions & 102 deletions
diff --git a/‎configs/unit2speech/mhubert-expresso-2000.yaml‎
Lines changed: 0 additions & 102 deletions b/‎configs/unit2speech/mhubert-expresso-2000.yaml‎
Lines changed: 0 additions & 102 deletions
diff --git a/‎…peech/mhubert-expresso-2000-bigvgan.yaml‎ ‎…peech/whisper-large-v3-4096-bigvgan.yaml‎configs/unit2speech/mhubert-expresso-2000-bigvgan.yaml renamed to configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml
Lines changed: 11 additions & 15 deletions b/‎…peech/mhubert-expresso-2000-bigvgan.yaml‎ ‎…peech/whisper-large-v3-4096-bigvgan.yaml‎configs/unit2speech/mhubert-expresso-2000-bigvgan.yaml renamed to configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml
Lines changed: 11 additions & 15 deletions
@@ -1,44 +1,36 @@
 # Speech Resynthesis and Language Modeling with Flow Matching and Llama
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Python](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org)
+[![Python](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org)
 [![model](https://img.shields.io/badge/%F0%9F%A4%97-Models-blue)](https://huggingface.co/ryota-komatsu/flow_matching_with_bigvgan)
-[![dataset](https://img.shields.io/badge/%F0%9F%A4%97-Datasets-blue)](https://huggingface.co/datasets/ryota-komatsu/libritts-r-mhubert-2000units)
+[![dataset](https://img.shields.io/badge/%F0%9F%A4%97-Datasets-blue)](https://huggingface.co/datasets/ryota-komatsu/LibriTTS-R-whisper-large-v3-4096units)
 
 ## Setup
 
 ```shell
 sudo apt install git-lfs  # for UTMOS
 
-conda create -y -n py39 python=3.9.21 pip=24.0
-conda activate py39
-pip install -r requirements/requirements.txt
+conda create -y -n py310 -c pytorch -c nvidia -c conda-forge python=3.10.17 pip=24.0 faiss-gpu=1.10.0
+conda activate py310
+pip install -r requirements.txt
 pip install flash-attn --no-build-isolation  # optional
 
-sh scripts/setup.sh  # download textlesslib and UTMOS
-
-cd src/textlesslib
-pip install -e .
-cd -
+sh scripts/setup.sh  # download UTMOS
 ```
 
-## Usage: sampling multi-speaker speech from self-supervised discrete units
+## Usage: sampling multi-speaker speech from supervised discrete units
 
 ```python
 import torchaudio
-from textless.data.speech_encoder import SpeechEncoder
 
 from src.flow_matching.models import ConditionalFlowMatchingWithBigVGan
+from src.flow_matching.utils.whisper import WhisperFeatureExtractor, WhisperEncoder
 
 wav_path = "/path/to/wav"
 
-encoder = SpeechEncoder.by_name(
-    dense_model_name="mhubert-base-vp_mls_cv_8lang",
-    quantizer_model_name="kmeans-expresso",
-    vocab_size=2000,
-    deduplicate=False,
-    need_f0=False,
-).cuda()
+# load model and processor
+feature_extractor = WhisperFeatureExtractor.from_pretrained("ryota-komatsu/whisper-large-v3-tokenizer")
+encoder = WhisperEncoder.from_pretrained("ryota-komatsu/whisper-large-v3-tokenizer").cuda()
 
 # download a pretrained model from hugging face hub
 decoder = ConditionalFlowMatchingWithBigVGan.from_pretrained("ryota-komatsu/flow_matching_with_bigvgan").cuda()
@@ -47,8 +39,16 @@ decoder = ConditionalFlowMatchingWithBigVGan.from_pretrained("ryota-komatsu/flow
 waveform, sr = torchaudio.load(wav_path)
 waveform = torchaudio.functional.resample(waveform, sr, 16000)
 
+input_features = feature_extractor(
+    waveform.squeeze(0).numpy(),
+    return_tensors="pt",
+    sampling_rate=16000,
+    device="cuda",
+    padding="do_not_pad",
+).input_features.to("cuda")
+
 # encode a waveform into pseudo-phonetic units
-units = encoder(waveform.cuda())["units"]
+units = encoder(input_features, out_layer=15)
 units = units.unsqueeze(0) + 1  # 0: pad
 
 # resynthesis
@@ -105,7 +105,7 @@ Jupyter notebook demo is found [here](demo.ipynb).
 
 ## Data Preparation
 
-If you already have LibriTTS-R, you can use it by editing [a config file](configs/unit2speech/mhubert-expresso-2000.yaml#L6);
+If you already have LibriTTS-R, you can use it by editing [a config file](configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml#L7):
 ```yaml
 dataset:
   wav_dir_orig: "/path/to/LibriTTS-R" # ${dataset.wav_dir_orig}/train-clean-100, train-clean-360, ...
@@ -129,18 +129,14 @@ sh scripts/download_slm21.sh  # download sWUGGY and sBLIMP
 ## Training a unit-to-speech synthesizer
 
 ```shell
-python main_resynth.py --config=configs/unit2speech/mhubert-expresso-2000.yaml
+python main_resynth.py --config=configs/unit2speech/whisper-large-v3-4096-bigvgan.yaml
 ```
 
 To run only a specific stage, pass it as an argument.
 
 Supported processing stages:
 1. resample
-1. tokenize
-1. extract_features
-1. train_bigvgan  # can be skipped when using a pretrained model
 1. train_flow_matching
-1. evaluate
 1. synthesize
 
 ```shell
 
@@ -2,20 +2,14 @@ common:
   seed: 0
 
 dataset:
+  name: "ryota-komatsu/LibriTTS-R-whisper-large-v3-4096units"  # https://huggingface.co/datasets
   wav_dir: "data/LibriTTS_R_16k" # ${dataset.wav_dir}/train-clean-100, train-clean-360, ...
   wav_dir_orig: "data/LibriTTS_R"  # if wav_dir == wav_dir_orig, original wav files are overwritten with 16 kHz waveforms
-  spectrogram_dir: "data/LibriTTS_R_16k/spectrogram"  # 34GB
   vad: false
 
   ext_audio: ".wav"
   ext_txt: ".normalized.txt"
 
-  # json file format
-  # "name": {"units": List[int], "durations": List[int], "transcript": str}
-  train_file: "data/resynth/train.json"  # 354,729 samples
-  dev_file: "data/resynth/dev.json"  # 5,736 samples
-  test_file: "data/resynth/test.json"  # 4,837 samples
-
 synthesis:
   src_dir: ${dataset.wav_dir}
   tgt_dir: ${dataset.wav_dir}_resynth
@@ -39,17 +33,14 @@ flow_matching:
   save_interval_epoch: 20
 
   # inference
-  dt: 0.0625
-  truncation_value: 1.0  # truncation trick (https://arxiv.org/abs/1809.11096)
+  dt: 0.1
+  truncation_value: null  # truncation trick (https://arxiv.org/abs/1809.11096)
 
-  # textless.data.speech_encoder.SpeechEncoder
-  dense_model_name: "mhubert-base-vp_mls_cv_8lang"
-  quantizer_model_name: "kmeans-expresso"
-  vocab_size: 2000
+  vocab_size: ${tokenizer.vocab_size}
 
   # src.flow_matching.configs.ConditionalFlowMatchingConfig
   dim_in: 80
-  dim_cond_emb: 768
+  dim_cond_emb: 1280
   hidden_size: 256
   depth: 4
   heads: 2
@@ -113,9 +104,14 @@ vocoder:
   checkpoint_interval: 10000
   validation_interval: 10000
 
+tokenizer:
+  name: "ryota-komatsu/whisper-large-v3-tokenizer"
+  vocab_size: 4096
+  out_layer: 15
+
 flow_matching_with_vocoder:
   name: "ryota-komatsu/flow_matching_with_bigvgan"
   batch_size: 8
 
 asr:
-  name: "microsoft/Phi-4-multimodal-instruct"
+  name: "openai/whisper-large-v3"