1- import glob
2- import json
31from pathlib import Path
42
53import torch
4+ from datasets import Dataset , DatasetDict , Features , Sequence , Value
65from textless .data .speech_encoder import SpeechEncoder
7- from tokenizers import Tokenizer
8- from tokenizers .models import BPE
9- from tokenizers .trainers import BpeTrainer
106from tqdm import tqdm
117
128from .data import SpeechDataset
13- from .utils import convert_units_to_unicode , shift_unit
14-
15-
16- def tokenize (config ):
17- Path (config .s2u .tokenizer_path ).parent .mkdir (parents = True , exist_ok = True )
18-
19- files = glob .glob (config .dataset .unicode_train + "*" )
20- initial_alphabet = [chr (shift_unit (unit )) for unit in range (config .s2u .vocab_size )]
21- trainer = BpeTrainer (vocab_size = config .model .vocab_size , initial_alphabet = initial_alphabet )
22- tokenizer = Tokenizer (BPE ())
23- tokenizer .train (files = files , trainer = trainer )
24- tokenizer .save (config .s2u .tokenizer_path )
25-
26- Path (config .dataset .train_file ).parent .mkdir (parents = True , exist_ok = True )
27- with open (config .dataset .train_file , "w" ) as f :
28- for file in files :
29- with open (file ) as g :
30- for unicodes in g :
31- unicodes = unicodes .rstrip ()
32- units = tokenizer .encode (unicodes ).ids
33- units = " " .join (str (u ) for u in units )
34-
35- f .write (f"{ units } \n " )
369
3710
3811def tokenize_slm21 (config ):
@@ -63,36 +36,41 @@ def tokenize_slm21(config):
6336 deduplicate = True ,
6437 need_f0 = False ,
6538 ).cuda ()
66- tokenizer = Tokenizer .from_file (config .s2u .tokenizer_path )
6739
68- _tokenize_slm21 ( encoder , tokenizer , config . dataset . swuggy_dev_file , swuggy_dev_loader )
69- _tokenize_slm21 ( encoder , tokenizer , config . dataset . sblimp_dev_file , sblimp_dev_loader )
70- _tokenize_slm21 ( encoder , tokenizer , config . dataset . swuggy_test_file , swuggy_test_loader )
71- _tokenize_slm21 ( encoder , tokenizer , config . dataset . sblimp_test_file , sblimp_test_loader )
40+ swuggy_dev = _tokenize ( encoder , swuggy_dev_loader )
41+ sblimp_dev = _tokenize ( encoder , sblimp_dev_loader )
42+ swuggy_test = _tokenize ( encoder , swuggy_test_loader )
43+ sblimp_test = _tokenize ( encoder , sblimp_test_loader )
7244
45+ swuggy = DatasetDict ({"dev" : swuggy_dev , "test" : swuggy_test })
46+ sblimp = DatasetDict ({"dev" : sblimp_dev , "test" : sblimp_test })
7347
74- def _tokenize_slm21 (
48+ swuggy .push_to_hub (config .dataset .swuggy )
49+ sblimp .push_to_hub (config .dataset .sblimp )
50+
51+
52+ def _tokenize (
7553 encoder : SpeechEncoder ,
76- tokenizer : Tokenizer ,
77- file ,
7854 data_loader : torch .utils .data .DataLoader ,
7955):
80- Path (file ).parent .mkdir (parents = True , exist_ok = True )
81-
82- dataset = dict ()
83-
84- for item in tqdm (data_loader ):
85- outputs = encoder (item ["input_values" ].cuda ())
86- unicodes = convert_units_to_unicode (outputs ["units" ].tolist ())
87- input_ids = tokenizer .encode (unicodes ).ids
56+ features = Features (
57+ {
58+ "id" : Value ("string" ),
59+ "units" : Sequence (Value ("int32" )),
60+ }
61+ )
62+
63+ def generate_dataset ():
64+ for item in tqdm (data_loader ):
65+ outputs = encoder (item ["input_values" ].cuda ())
66+ units = outputs ["units" ].tolist ()
8867
89- dataset [ item ["name" ][0 ]] = input_ids
68+ yield { "id" : item ["name" ][0 ], "units" : units }
9069
91- with open (file , "w" ) as f :
92- json .dump (dataset , f )
70+ return Dataset .from_generator (generate_dataset , features = features )
9371
9472
95- def encode (config , spk_ids : str = "1-9" ):
73+ def tokenize_trainset (config , spk_ids : str = "1-9" ):
9674 wav_dir_train = Path (config .dataset .wav_dir_train )
9775 train_paths = wav_dir_train .glob (f"*/[{ spk_ids } ]*/**/*" + config .dataset .ext_audio )
9876 train_set = SpeechDataset (train_paths )
@@ -106,15 +84,5 @@ def encode(config, spk_ids: str = "1-9"):
10684 need_f0 = False ,
10785 ).cuda ()
10886
109- _encode (encoder , config .dataset .unicode_train + f"{ spk_ids } " , train_loader )
110-
111-
112- def _encode (encoder : SpeechEncoder , file , data_loader : torch .utils .data .DataLoader ):
113- Path (file ).parent .mkdir (parents = True , exist_ok = True )
114- with open (file , "w" ) as f :
115- for item in tqdm (data_loader ):
116- outputs = encoder (item ["input_values" ].cuda ())
117-
118- unicodes = convert_units_to_unicode (outputs ["units" ].tolist ())
119-
120- f .write (f"{ unicodes } \n " )
87+ trainset = _tokenize (encoder , train_loader )
88+ trainset .push_to_hub (config .dataset .train , split = f"train{ spk_ids } " )
0 commit comments