@@ -31,6 +31,7 @@ dir10=/export/corpora/LDC/LDC2018S05/
31
31
text10=/export/corpora/LDC/LDC2018T14/
32
32
33
33
mgb2_dir=" "
34
+ process_xml=" "
34
35
mer=80
35
36
36
37
. ./utils/parse_options.sh
@@ -108,13 +109,6 @@ cd $top_pwd
108
109
# prepare MGB2 data
109
110
if [ ! -z $mgb2_dir ]; then
110
111
echo " preparing MGB2 data"
111
- # check xml
112
- if [ -z $( which xml) ]; then
113
- echo " $0 : Could not find tool xml"
114
- echo " $0 : To use MGB2 you must have xml installed"
115
- echo " $0 : Download and install it from xmlstar.sourceforge.net"
116
- exit 1
117
- fi
118
112
119
113
xmldir=$mgb2_dir /train/xml/bw
120
114
output_dir=$gale_data /mgb2
@@ -126,12 +120,35 @@ if [ ! -z $mgb2_dir ]; then
126
120
mv $output_dir /mgb2 ${output_dir} /.backup
127
121
fi
128
122
129
- ls $mgb2_dir /train/wav/ | while read name; do
130
- basename=` basename -s .wav $name `
131
- [ ! -e $xmldir /$basename .xml ] && echo " Missing $xmldir /$basename .xml" && exit 1
132
- xml sel -t -m ' //segments[@annotation_id="transcript_align"]' -m " segment" -n -v " concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m " element" -v " concat(text(),' ')" $xmldir /$basename .xml | local/add_to_datadir.py $basename $output_dir $mer
133
- echo $basename $db_dir /train/wav/$basename .wav >> $output_dir /wav.scp
134
- done
123
# Convert every MGB2 XML transcript into data-dir entries and register the
# matching wav file in $output_dir/wav.scp.
#
# --process-xml selects the transcript extractor:
#   python - local/process_xml.py (local/check_tools.sh verifies bs4 and lxml)
#   xml    - the 'xml' command-line binary (XMLStarlet)
# Any other value (including an empty one) is rejected.
#
# Reads:  process_xml, mgb2_dir, xmldir, output_dir, db_dir, mer
# Exits:  1 on an invalid option, a missing 'xml' binary, or a missing
#         transcript file.
case "$process_xml" in
  python)
    echo " using python to process xml file"
    # check if bs4 and lxml are installed in python
    local/check_tools.sh
    ;;
  xml)
    # check if xml binary exists
    if ! command -v xml > /dev/null 2>&1; then
      echo " xml not found, you may use python by '--process-xml python'"
      exit 1
    fi
    echo " using xml"
    ;;
  *)
    # invalid option
    echo " $0 : invalid option for --process-xml, choose from 'xml' or 'python'"
    exit 1
    ;;
esac

# Iterate with a glob in the current shell instead of 'ls | while read':
# a pipeline runs the loop in a subshell, so the 'exit 1' for a missing
# transcript would only end the loop and the script would carry on.
for mgb2_wav in "$mgb2_dir"/train/wav/*; do
  # An unmatched glob stays literal; skip it so an empty directory is a no-op.
  [ -e "$mgb2_wav" ] || continue

  # Recording id = file name without directory and without the .wav suffix.
  mgb2_utt=${mgb2_wav##*/}
  mgb2_utt=${mgb2_utt%.wav}
  mgb2_xml=$xmldir/$mgb2_utt.xml

  if [ ! -e "$mgb2_xml" ]; then
    echo " Missing $mgb2_xml"
    exit 1
  fi

  # Both extractors feed local/add_to_datadir.py, which receives the
  # recording id, the target directory and the $mer threshold.
  # The python path previously wrote to $train_dir while everything else in
  # this section uses $output_dir; both now use $output_dir.
  if [ "$process_xml" = python ]; then
    local/process_xml.py "$mgb2_xml" - \
      | local/add_to_datadir.py "$mgb2_utt" "$output_dir" "$mer"
  else
    xml sel -t -m ' //segments[@annotation_id="transcript_align"]' \
      -m " segment" -n \
      -v " concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" \
      -m " element" -v " concat(text(),' ')" "$mgb2_xml" \
      | local/add_to_datadir.py "$mgb2_utt" "$output_dir" "$mer"
  fi

  # NOTE(review): wav paths are built from $db_dir although the files were
  # listed under $mgb2_dir - kept as in the original; confirm they match.
  echo "$mgb2_utt $db_dir/train/wav/$mgb2_utt.wav" >> "$output_dir/wav.scp"
done
135
152
136
153
# add mgb2 data to training data (GALE/all and wav.scp)
137
154
mv $gale_data /all $gale_data /all.gale
0 commit comments