-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_data.py
More file actions
147 lines (120 loc) · 5.43 KB
/
prepare_data.py
File metadata and controls
147 lines (120 loc) · 5.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import itertools
import multiprocessing
import subprocess
import pandas as pd
from g2pM import G2pM
from tqdm import tqdm
from collections import Counter
from hparams import create_hparams
home_dir = os.getcwd()
# Grapheme-to-pinyin conversion: how raw text is turned into the pinyin sequence.
def g2p_transfer(line):
    """Return the conversion of ``line`` as one space-separated string.

    The module-level ``g2pmodel`` is called with ``tone=True`` and
    ``char_split=False``; the items it returns are joined with single spaces.
    """
    pinyin_tokens = g2pmodel(line, tone=True, char_split=False)
    return ' '.join(pinyin_tokens)
def _run_quiet(argv):
    """Run ``argv`` without a shell, discarding stdout and stderr.

    Returns the process exit status. When the executable cannot be started
    at all, 127 is returned (the status a shell reports for "command not
    found") instead of raising, so a missing tool counts as a failed
    conversion rather than aborting the caller.
    """
    try:
        return subprocess.call(argv, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        return 127


# Audio format conversion.
def do_single_file_trans(input_file, save_file, sample_rate=16000, target_extension="wav"):
    """Convert ``input_file`` into ``save_file`` using ffmpeg (and sox for wav).

    Args:
        input_file: Path of the audio file to read.
        save_file: Path of the file to write.
        sample_rate: Output sample rate in Hz, formatted with ``%d``. Used by
            the "pcm"/"raw" and "wav" branches only.
        target_extension: Selects the conversion:
            * "pcm" / "raw": ffmpeg writes mono signed 16-bit little-endian
              raw samples straight to ``save_file``.
            * "wav": ffmpeg writes the same raw stream to a temporary file
              next to ``save_file`` (its last three characters replaced by
              "raw"), sox wraps it into ``save_file``, and the temporary file
              is removed.
            * anything else: ffmpeg converts directly and picks the format
              from ``save_file``'s name; ``sample_rate`` is not applied.

    Returns:
        None. Tool failures are not raised; a failed conversion simply leaves
        no (or an incomplete) output file.
    """
    rate = "%d" % sample_rate
    # Arguments are passed as a list (no shell), so paths containing spaces
    # or shell metacharacters are handled verbatim.
    to_raw = ["ffmpeg", "-y", "-i", input_file, "-ac", "1", "-ar", rate, "-f", "s16le"]

    if target_extension in ("pcm", "raw"):
        _run_quiet(to_raw + [save_file])
    elif target_extension == "wav":
        tmp_file = save_file[:-3] + "raw"
        try:
            # Only hand the raw stream to sox when ffmpeg actually produced it.
            if _run_quiet(to_raw + [tmp_file]) == 0:
                _run_quiet(["sox", "-r", rate, "-c", "1", "-e", "signed-integer",
                            "-b", "16", tmp_file, save_file])
        finally:
            # The temporary file may be missing when ffmpeg failed early.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
    else:
        _run_quiet(["ffmpeg", "-y", "-i", input_file, save_file])
def get_split_corpus(type, wavids, texts):
    """Write the filelist for one split of the corpus.

    Args:
        type: 'train' (first 85% of the entries), 'test' (85%-95%) or
            'dev' (last 5%). Any other value prints 'type error' and exits.
        wavids: Sequence of wav identifiers containing a '/'; the part after
            the first '/' is used, with its last four characters replaced
            by '.wav'.
        texts: Sequence of transcripts aligned with ``wavids``. Spaces are
            removed before the text is passed to ``g2p_transfer``.

    Each output line is ``<wav name>\t<g2p_transfer result>\n``. The file is
    written to ``hparams.training_files``, ``hparams.testing_files`` or
    ``hparams.validation_files`` depending on the split. Split boundaries
    are always computed from ``len(wavids)``.
    """
    # (split name, length message, start fraction, end fraction, hparams attribute)
    split_table = (
        ('train', 'train length : ', None, 0.85, 'training_files'),
        ('test', 'test length : ', 0.85, 0.95, 'testing_files'),
        ('dev', 'dev length : ', 0.95, None, 'validation_files'),
    )

    for split_name, length_label, start_frac, stop_frac, target_attr in split_table:
        if type == split_name:
            break
    else:
        print('type error')
        exit()
        return

    total = len(wavids)
    start = None if start_frac is None else int(start_frac * total)
    stop = None if stop_frac is None else int(stop_frac * total)
    bounds = slice(start, stop)

    chosen_ids = wavids[bounds]
    chosen_texts = texts[bounds]
    print(length_label, len(chosen_ids))

    rows = []
    for raw_id, raw_text in zip(chosen_ids, chosen_texts):
        wav_name = raw_id.split('/')[1][:-4] + '.wav'
        compact_text = ''.join(raw_text.split(' '))
        rows.append(wav_name + '\t' + g2p_transfer(compact_text) + '\n')
    payload = ''.join(rows)

    # The output path is resolved only after every line has been built.
    with open(getattr(hparams, target_attr), 'w', encoding='utf-8') as out:
        out.write(payload)
def clean_data():
    """Read the transcript table and write the train/test/dev filelists.

    The tab-separated, header-less file at ``origin_text_dir`` is loaded;
    its first column is taken as the wav ids and its second as the texts.
    ``get_split_corpus`` is then called once per split. The number of entries
    is printed before and after the splits are written.
    """
    # Disabled mp3 -> wav conversion pass, kept for reference:
    # pool = multiprocessing.Pool(16)
    # for wavfile in tqdm(os.listdir(origin_data_dir)):
    #     if wavfile[-4:] == '.mp3':
    #         file_path = os.path.join(origin_data_dir, wavfile)
    #         save_file = os.path.join(data_dir, wavfile.replace('mp3', 'wav'))
    #         pool.apply_async(do_single_file_trans, (file_path, save_file, sample_rate, "wav"))
    #         # do_single_file_trans(file_path, save_file, sample_rate=sample_rate, target_extension="wav")
    # pool.close()
    # pool.join()
    table = pd.read_csv(origin_text_dir, header=None, sep='\t')
    wavids = table.iloc[:, 0].values
    texts = table.iloc[:, 1].values

    print('data length: ', len(wavids))
    for split in ('train', 'test', 'dev'):
        get_split_corpus(split, wavids, texts)
    print('data length: ', len(wavids))
def build_pinyin_vocab():
    """Build ``py_vocab.txt`` from every filelist under ``home_dir/filelists``.

    The second tab-separated column of each filelist is split on spaces and
    every character of every resulting token is counted. Characters seen more
    than 5 times are kept, ordered by descending count, and written one per
    line after the fixed entries 'PAD', 'UNK' and ' '. The file is written to
    ``home_dir/py_vocab.txt`` with no trailing newline.
    """
    filelist_dir = os.path.join(home_dir, 'filelists')

    char_counts = Counter()
    for name in os.listdir(filelist_dir):
        table = pd.read_csv(os.path.join(filelist_dir, name), header=None, sep='\t')
        for sentence in table.iloc[:, 1].values:
            for token in sentence.split(' '):
                # Updating with a string counts its individual characters.
                char_counts.update(token)

    # most_common() orders by descending count; ties stay in first-seen order.
    frequent = [symbol for symbol, seen in char_counts.most_common() if seen > 5]
    vocab_list = ['PAD', 'UNK', ' '] + frequent

    with open(os.path.join(home_dir, 'py_vocab.txt'), 'w', encoding='utf-8') as out:
        out.write('\n'.join(vocab_list))
if __name__ == '__main__':
    # Script entry point. The names bound here are module globals; the
    # functions above read hparams, g2pmodel and origin_text_dir directly.
    hparams = create_hparams()
    sample_rate = 22050
    g2pmodel = G2pM()

    # Input and output locations taken from the hyper-parameters.
    origin_data_dir = hparams.origin_data_dir
    origin_text_dir = hparams.origin_text_dir
    data_dir = hparams.data_dir
    text_dir = hparams.text_dir

    # Make sure the output audio directory exists before processing.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir, exist_ok=True)

    clean_data()
    # build_pinyin_vocab()