Skip to content

Commit 5882dc5

Browse files
authored
[egs] gale_arabic: add python script to process xml file (#3886)
1 parent 1121c31 commit 5882dc5

File tree

8 files changed

+166
-27
lines changed

8 files changed

+166
-27
lines changed
+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash

# Check that the Python packages bs4 (BeautifulSoup4) and lxml can be imported
# by python3.
#
# Exit status: 0 when both imports succeed; 1 as soon as one of them fails
# (this also happens when python3 itself cannot be run). On failure an
# installation hint is printed to stderr.

# stderr of the import probes is discarded on purpose: only the exit status of
# python3 matters, the traceback would just be noise.
if ! python3 -c "import bs4" 2>/dev/null; then
  echo "$0: BeautifulSoup4 not installed, you can install it by 'pip install beautifulsoup4' if you prefer to use python to process xml file" >&2
  exit 1
fi

if ! python3 -c "import lxml" 2>/dev/null; then
  echo "$0: lxml not installed, you can install it by 'pip install lxml' if you prefer to use python to process xml file" >&2
  exit 1
fi

echo "both BeautifulSoup4 and lxml are installed in python"
exit 0

egs/gale_arabic/s5d/local/prepare_data.sh

+30-13
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dir10=/export/corpora/LDC/LDC2018S05/
3131
text10=/export/corpora/LDC/LDC2018T14/
3232

3333
mgb2_dir=""
34+
process_xml=""
3435
mer=80
3536

3637
. ./utils/parse_options.sh
@@ -108,13 +109,6 @@ cd $top_pwd
108109
# prepare MGB2 data
109110
if [ ! -z $mgb2_dir ]; then
110111
echo "preparing MGB2 data"
111-
# check xml
112-
if [ -z $(which xml) ]; then
113-
echo "$0: Could not find tool xml"
114-
echo "$0: To use MGB2 you must have xml installed"
115-
echo "$0: Download and install it from xmlstar.sourceforge.net"
116-
exit 1
117-
fi
118112

119113
xmldir=$mgb2_dir/train/xml/bw
120114
output_dir=$gale_data/mgb2
@@ -126,12 +120,35 @@ if [ ! -z $mgb2_dir ]; then
126120
mv $output_dir/mgb2 ${output_dir}/.backup
127121
fi
128122

129-
ls $mgb2_dir/train/wav/ | while read name; do
130-
basename=`basename -s .wav $name`
131-
[ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
132-
xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $output_dir $mer
133-
echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
134-
done
123+
# Convert every MGB2 xml transcript that has a matching wav file, using the
# tool selected with --process-xml:
#   python -> local/process_xml.py (needs bs4 and lxml)
#   xml    -> the "xml" command line binary
# Any other value (including the empty default) is rejected.
# "$process_xml" is quoted so that an empty value is compared as an empty
# string instead of producing a "unary operator expected" error.
if [ "$process_xml" == 'python' ]; then
  echo "using python to process xml file"
  # check if bs4 and lxml are installed in python; stop here if they are not
  local/check_tools.sh || exit 1
  ls $mgb2_dir/train/wav/ | while read name; do
    basename=`basename -s .wav $name`
    [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
    # write into $output_dir, like the 'xml' branch and the wav.scp line below
    local/process_xml.py $xmldir/$basename.xml - | local/add_to_datadir.py $basename $output_dir $mer
    echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
  done || exit 1  # the loop runs in a pipeline subshell; propagate its "exit 1"
elif [ "$process_xml" == 'xml' ]; then
  # check if xml binary exists
  if command -v xml >/dev/null 2>&1; then
    echo "using xml"
    ls $mgb2_dir/train/wav/ | while read name; do
      basename=`basename -s .wav $name`
      [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
      xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $output_dir $mer
      echo $basename $db_dir/train/wav/$basename.wav >> $output_dir/wav.scp
    done || exit 1  # the loop runs in a pipeline subshell; propagate its "exit 1"
  else
    echo "xml not found, you may use python by '--process-xml python'"
    exit 1;
  fi
else
  # invalid option
  echo "$0: invalid option for --process-xml, choose from 'xml' or 'python'"
  exit 1;
fi
135152

136153
# add mgb2 data to training data (GALE/all and wav.scp)
137154
mv $gale_data/all $gale_data/all.gale
+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python3

from bs4 import BeautifulSoup
import sys
import argparse


def get_args():
    """Parse the command line.

    Positional arguments:
      xml     path of the input xml file
      output  path of the output text file, or '-' to write to stdout
    """
    parser = argparse.ArgumentParser(description="""This script process xml file.""")
    parser.add_argument("xml", type=str, help="""Input xml file""")
    parser.add_argument("output", type=str, help="""output text file""")
    args = parser.parse_args()
    return args


def process_xml(xml_handle, output_handle):
    """Write one line per <segment> element found in the xml document.

    Each output line has the form
        "<who> <starttime> <endtime> <WMER> <text>"
    where the first four fields are attributes of the segment and <text> is
    the space-joined string content of its <element> descendants (elements
    without a single string child are skipped).

    Both handles are closed before returning, also when parsing or writing
    raises. A segment lacking one of the four attributes raises KeyError.
    """
    try:
        soup = BeautifulSoup(xml_handle, "xml")
        for segment in soup.find_all("segment"):
            who = segment["who"]
            starttime = segment["starttime"]
            endtime = segment["endtime"]
            wmer = segment["WMER"]
            words = [element.string for element in segment.find_all("element")
                     if element.string is not None]
            output_handle.write("{} {} {} {} {}\n".format(
                who, starttime, endtime, wmer, " ".join(words)))
    finally:
        xml_handle.close()
        output_handle.close()


def main():
    args = get_args()

    # Read raw bytes so the parser decodes the document itself instead of
    # relying on the locale's default encoding (which cannot decode Arabic
    # text under a C/POSIX locale).
    xml_handle = open(args.xml, 'rb')
    # Always emit UTF-8, independent of the locale. For stdout a fresh text
    # wrapper around the same file descriptor is used; closefd=False keeps
    # the descriptor itself open when the wrapper is closed.
    if args.output == '-':
        output_handle = open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False)
    else:
        output_handle = open(args.output, 'w', encoding='utf-8')

    process_xml(xml_handle, output_handle)


if __name__ == "__main__":
    main()

egs/gale_arabic/s5d/run.sh

+4-1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ galeData=GALE
4646
mgb2_dir=""
4747
giga_dir=""
4848

49+
# preference on how to process xml file (use xml binary or python)
50+
process_xml=""
51+
4952
run_rnnlm=false
5053
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
5154
## This relates to the queue.
@@ -64,7 +67,7 @@ if [ $stage -le 0 ]; then
6467
echo "$0: Preparing data..."
6568

6669
options=""
67-
[ ! -z $mgb2_dir ] && options="--mgb2-dir $mgb2_dir"
70+
# Pass the process_xml preference on to local/prepare_data.sh; when it is left
# empty, 'python' is used, which is the value that was hard-coded here before.
[ -n "$mgb2_dir" ] && options="--process-xml ${process_xml:-python} --mgb2-dir $mgb2_dir"
6871
local/prepare_data.sh $options
6972

7073
echo "$0: Preparing lexicon and LM..."
+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash

# Check that the Python packages bs4 (BeautifulSoup4) and lxml can be imported
# by python3.
#
# Exit status: 0 when both imports succeed; 1 as soon as one of them fails
# (this also happens when python3 itself cannot be run). On failure an
# installation hint is printed to stderr.

# stderr of the import probes is discarded on purpose: only the exit status of
# python3 matters, the traceback would just be noise.
if ! python3 -c "import bs4" 2>/dev/null; then
  echo "$0: BeautifulSoup4 not installed, you can install it by 'pip install beautifulsoup4' if you prefer to use python to process xml file" >&2
  exit 1
fi

if ! python3 -c "import lxml" 2>/dev/null; then
  echo "$0: lxml not installed, you can install it by 'pip install lxml' if you prefer to use python to process xml file" >&2
  exit 1
fi

echo "both BeautifulSoup4 and lxml are installed in python"
exit 0

egs/mgb2_arabic/s5/local/mgb_data_prep.sh

+28-12
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
# 2016-2019 Vimal Manohar
55
# 2019 Dongji Gao
66

7-
if [ $# -ne 2 ]; then
8-
echo "Usage: $0 <DB-dir> <mer-sel>"
7+
if [ $# -ne 3 ]; then
8+
echo "Usage: $0 <DB-dir> <mer-sel> <process-xml>"
99
exit 1;
1010
fi
1111

@@ -23,12 +23,6 @@ for x in $train_dir $dev_dir; do
2323
fi
2424
done
2525

26-
if [ -z $(which xml) ]; then
27-
echo "$0: Could not find tool xml"
28-
echo "$0: Download and install it from xmlstar.sourceforge.net"
29-
exit 1
30-
fi
31-
3226
find $db_dir/train/wav -type f -name "*.wav" | \
3327
awk -F/ '{print $NF}' | perl -pe 's/\.wav//g' > \
3428
$train_dir/wav_list
@@ -39,11 +33,33 @@ head -500 $train_dir/wav_list > $train_dir/wav_list.short
3933
set -e -o pipefail
4034

4135
xmldir=$db_dir/train/xml/bw
42-
cat $train_dir/wav_list | while read basename; do
36+
# Convert every xml transcript listed in $train_dir/wav_list, using the tool
# named by process_xml:
#   python -> local/process_xml.py (needs bs4 and lxml)
#   xml    -> the "xml" command line binary
# Any other value is rejected.
# NOTE(review): no assignment of process_xml from the command line is visible
# around this block; fall back to the third positional argument (documented in
# the usage message as <process-xml>) when the variable is still empty.
process_xml=${process_xml:-${3:-}}
if [ "$process_xml" == "python" ]; then
  echo "using python to process xml file"
  # check if bs4 and lxml are installed in python; stop here if they are not
  local/check_tools.sh || exit 1
  # process xml file using python
  cat $train_dir/wav_list | while read basename; do
    [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
    local/process_xml.py $xmldir/$basename.xml - | local/add_to_datadir.py $basename $train_dir $mer
    # keep wav.scp in step with the 'xml' branch, which records every wav file
    echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
  done
elif [ "$process_xml" == 'xml' ]; then
  # check if xml binary exists
  if command -v xml >/dev/null 2>&1; then
    echo "using xml"
    cat $train_dir/wav_list | while read basename; do
      [ ! -e $xmldir/$basename.xml ] && echo "Missing $xmldir/$basename.xml" && exit 1
      xml sel -t -m '//segments[@annotation_id="transcript_align"]' -m "segment" -n -v "concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m "element" -v "concat(text(),' ')" $xmldir/$basename.xml | local/add_to_datadir.py $basename $train_dir $mer
      echo $basename $wavDir/$basename.wav >> $train_dir/wav.scp
    done
  else
    echo "xml not found, you may use python by '--process-xml python'"
    exit 1;
  fi
else
  # invalid option
  echo "$0: invalid option for --process-xml, choose from 'xml' or 'python'"
  exit 1;
fi
4763

4864
for x in text segments; do
4965
cp $db_dir/dev/${x}.all $dev_dir/${x}
+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python3

from bs4 import BeautifulSoup
import sys
import argparse


def get_args():
    """Parse the command line.

    Positional arguments:
      xml     path of the input xml file
      output  path of the output text file, or '-' to write to stdout
    """
    parser = argparse.ArgumentParser(description="""This script process xml file.""")
    parser.add_argument("xml", type=str, help="""Input xml file""")
    parser.add_argument("output", type=str, help="""output text file""")
    args = parser.parse_args()
    return args


def process_xml(xml_handle, output_handle):
    """Write one line per <segment> element found in the xml document.

    Each output line has the form
        "<who> <starttime> <endtime> <WMER> <text>"
    where the first four fields are attributes of the segment and <text> is
    the space-joined string content of its <element> descendants (elements
    without a single string child are skipped).

    Both handles are closed before returning, also when parsing or writing
    raises. A segment lacking one of the four attributes raises KeyError.
    """
    try:
        soup = BeautifulSoup(xml_handle, "xml")
        for segment in soup.find_all("segment"):
            who = segment["who"]
            starttime = segment["starttime"]
            endtime = segment["endtime"]
            wmer = segment["WMER"]
            words = [element.string for element in segment.find_all("element")
                     if element.string is not None]
            output_handle.write("{} {} {} {} {}\n".format(
                who, starttime, endtime, wmer, " ".join(words)))
    finally:
        xml_handle.close()
        output_handle.close()


def main():
    args = get_args()

    # Read raw bytes so the parser decodes the document itself instead of
    # relying on the locale's default encoding (which cannot decode Arabic
    # text under a C/POSIX locale).
    xml_handle = open(args.xml, 'rb')
    # Always emit UTF-8, independent of the locale. For stdout a fresh text
    # wrapper around the same file descriptor is used; closefd=False keeps
    # the descriptor itself open when the wrapper is closed.
    if args.output == '-':
        output_handle = open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False)
    else:
        output_handle = open(args.output, 'w', encoding='utf-8')

    process_xml(xml_handle, output_handle)


if __name__ == "__main__":
    main()

egs/mgb2_arabic/s5/run.sh

+4-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
stage=-1
88

9+
# preference on how to process xml file [python, xml]
10+
process_xml="python"
11+
912
. ./cmd.sh
1013
if [ -f ./path.sh ]; then . ./path.sh; fi
1114
. utils/parse_options.sh
@@ -50,7 +53,7 @@ fi
5053
if [ $stage -le 1 ]; then
5154
#DATA PREPARATION
5255
echo "Preparing training data"
53-
local/mgb_data_prep.sh DB $mer
56+
local/mgb_data_prep.sh DB $mer $process_xml
5457
fi
5558

5659
if [ $stage -le 2 ]; then

0 commit comments

Comments
 (0)