-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathunbox_openSAT_output.py
More file actions
163 lines (136 loc) · 5.45 KB
/
unbox_openSAT_output.py
File metadata and controls
163 lines (136 loc) · 5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
#
# author = julien karadayi
"""
read OpenSAT output.
OpenSAT classifies the wav in 18 classes:
0 background 6 music 12 noise_ongoing
1 speech 7 human 13 noise_pulse
2 speech_ne 8 cheer 14 noise_tone
3 mumble 9 crowd 15 noise_nature
4 singing 10 animal 16 white_noise
5 music_sing 11 engine 17 radio
and writes the output in a pickle.
"""
import pickle
import numpy
import ipdb
import subprocess
import shlex
import argparse
import os
def open_pickle(folder):
"""Open pickle file found in folder
and output its content.
"""
# if file ends in .gz, unzip it, else read it directly
if os.path.isfile(os.path.join(folder, 'confidence.pkl.gz')):
cmd = "gunzip {}".format(os.path.join(folder, 'confidence.pkl.gz'))
subprocess.call(shlex.split(cmd))
with open(os.path.join(folder, 'confidence.pkl'), 'rb') as fin:
u = pickle._Unpickler(fin)
u.encoding = 'latin1'
p = u.load()
return p
def speech_activity_detection(classif):
"""Take OpenSat output matrix and
retreive Speech Activity Detection from the
column 1 and 2 (which correspond to english and non english)
"""
# get the name of the speaker by getting the name
# of the folder.
# TODO : check if there's a prettier way of doing this ?
# instead of checking if last char is "/"
# if folder.endswith('/'):
# basename = os.path.basename(folder[:-1])
# else:
# basename = os.path.basename(folder)
# assert basename != '', 'ERROR, basename is empty'
for key in classif.keys():
# there should be only 1 key per file
post = classif[key]
# find out best classes
best_classes = numpy.zeros((post.shape[0],2))
# first column is for timestamps
best_classes[:,0] = numpy.linspace(-2, 59.9, 620)
# at each timestamp, take class with highest probability.
# TODO : should refine this with a threshold
# to see for cases where there's not much sound
best_classes[:,1] = numpy.argmax(post, axis=1)
# ipdb.set_trace()
# keep only 5 first classes and class 7 human
# reminder :
# 0 background
# 1 speech
# 2 speech_ne
# 3 mumble
# 4 singing
# 7 human
best_classes[
numpy.logical_and(best_classes[:,1] > 4,
best_classes[:,1] != 7),1] = 0
return best_classes
def classes2rttm(best_classes, basename, output_folder):
"""
Input :
array containing, for each timestamp, the best class
Output:
a text file, in rttm format, indicating speech sections
"""
out_file = os.path.join(output_folder, basename + '.rttm')
with open(out_file, 'w') as fout:
prev_class = -10
for time, classes in best_classes:
classes = max(0, min(1, classes))
# 3 cases: first window, window in the middle, last window
if numpy.isclose(time, 0):
# keep track of classes only as 1 for speech or
# 0 for non speech
prev_class = classes
if prev_class == 1:
prev_onset = time
elif time > 0:
if (prev_class == 1 and classes == 0):
fout.write(u'SPEAKER\t{}\t1\t{}\t{}\t'
'<NA>\t<NA>\tspeech\t<NA>\n'.format(
basename, prev_onset, time - prev_onset))
elif (prev_class == 0 and classes == 1):
prev_onset = time
prev_class = classes
else:
prev_class = classes
elif time == best_classes[-1,0]:
# check if treating last window, and if
# it's the case, write output if needed
if (prev_class == 1 and classes == 0):
fout.write(u'SPEAKER\t{}\t1\t{}\t{}\t'
'<NA>\t<NA>\tspeech\t<NA>\n'.format(
basename, prev_onset, time - prev_onset))
elif (prev_class == 0 and classes == 1):
# weird case, should not happen but just in case :
# if speech is detected ONLY in the last frame, write it
fout.write(u'SPEAKER\t{}\t1\t{}\t{}\t'
'<NA>\t<NA>\tspeech\t<NA>\n'.format(
basename, time, 0.1))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('input', metavar='input',
help='input folder')
parser.add_argument('output', metavar='input',
help='output folder in which we write the RTTM')
args = parser.parse_args()
if not os.path.isdir(args.output):
os.makedirs(args.output)
# tree is input_folder/
# spkr_names/
# wav_names/
# confidence.pkl.gz
for speakers in os.listdir(args.input):
for wav_name in os.listdir(os.path.join(args.input, speakers)):
# open pickles
matrix_fold = os.path.join(args.input, speakers, wav_name)
post = open_pickle(matrix_fold)
# go from post to classes
classes = speech_activity_detection(post)
# write RTTM indicating speech/non speech
classes2rttm(classes, wav_name, args.output)