-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyse.sh
More file actions
executable file
·61 lines (53 loc) · 3.19 KB
/
analyse.sh
File metadata and controls
executable file
·61 lines (53 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
#
# Prepares the kal/dan corpus files used for the NMT experiments. Run it from
# the directory that holds corpus.kal.ids, corpus.dan.ana and the helper
# scripts (record-tags.py, remorph.py, validate.sh). Stages, in order:
#   1. analyse corpus.kal.ids into corpus.kal.ana;
#   2. render the analyses into one text file per representation;
#   3. hold out the last 5000 lines of every corpus for validation and test;
#   4. learn and apply BPE per representation and build a Marian vocabulary;
#   5. symlink shared helper files into each representation directory.

# Stage 1: tokenise corpus.kal.ids and run it through the grammar chain.
# Every stage reads the previous stage's stdout; the final result is written
# to corpus.kal.ana.
kal_root=~/langtech/kal
cg3_dir="${kal_root}/src/cg3"

cat corpus.kal.ids \
  | "${kal_root}/tools/shellscripts/kal-tokenise" \
      "${kal_root}/tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst" \
  | vislcg3 -g "${cg3_dir}/kal-pre1.cg3" \
  | "${kal_root}/tools/shellscripts/kal-hybrid-split" \
      "${kal_root}/src/fst/generator-gt-desc.hfstol" \
  | ~/langtech/katersat/apply-sems.py --last \
  | cg-sort \
  | vislcg3 -g "${cg3_dir}/kal-pre2.cg3" \
  | vislcg3 -g "${cg3_dir}/disambiguator.cg3" \
  | vislcg3 -g "${cg3_dir}/functions.cg3" \
  | vislcg3 -g "${cg3_dir}/dependency.cg3" \
  > corpus.kal.ana
# Stage 2: turn the analysed corpora into flat text files.
#
# Inputs:  corpus.kal.ana, corpus.dan.ana
# Outputs: tags.tsv, corpus.kal.{plain,tags,tags-pua,tags-sem,tags-sem-pua,
#          tags-syntax,tags-syntax-pua}, corpus.dan.plain
# Exits with status 1 if an input is missing/empty or a job fails, so that the
# later stages never run on truncated files.

for ana_file in corpus.kal.ana corpus.dan.ana; do
  if [[ ! -s "$ana_file" ]]; then
    echo "${0##*/}: ${ana_file} is missing or empty" >&2
    exit 1
  fi
done

# record-tags.py runs in the foreground, i.e. it has finished writing tags.tsv
# before any remorph.py job starts.
# NOTE(review): presumably remorph.py reads tags.tsv - confirm before
# reordering or backgrounding this step.
./record-tags.py < corpus.kal.ana > tags.tsv || {
  echo "${0##*/}: record-tags.py failed" >&2
  exit 1
}

#######################################
# Render corpus.kal.ana with remorph.py into corpus.kal.<suffix>.
# Arguments: $1      - suffix of the output file
#            $2...   - flags handed to remorph.py unchanged (their meaning is
#                      defined by remorph.py, which is not part of this file)
# Returns:   exit status of remorph.py
#######################################
remorph_kal() {
  local suffix=$1
  shift
  ./remorph.py "$@" < corpus.kal.ana > "corpus.kal.${suffix}"
}

# One entry per rendering: "<output suffix> [remorph.py flags...]".
kal_renderings=(
  'plain'
  'tags -t'
  'tags-pua -p'
  'tags-sem -s'
  'tags-sem-pua -s -p'
  'tags-syntax -f'
  'tags-syntax-pua -f -p'
)

job_pids=()
for rendering in "${kal_renderings[@]}"; do
  read -r -a spec <<< "$rendering"
  remorph_kal "${spec[@]}" &
  job_pids+=("$!")
done

# Danish side:
#   - drop every line containing <nl>, ß or ¶;
#   - first perl: replace leading whitespace before a double quote with one
#     tab, then turn each '=' that follows a quote plus at least two other
#     characters into a space, repeating until nothing changes;
#   - remorph.py without flags, as for corpus.kal.plain;
#   - second perl: delete a '$' that starts a token (at line start or after a
#     space) when a non-space character follows it.
grep -v -e '<nl>' -e 'ß' -e '¶' corpus.dan.ana \
  | perl -wpne 's/^\s+"/\t"/g; while (s/(".[^=]+)=/$1 /g) {}' \
  | ./remorph.py \
  | perl -wpne 's/(^| )\$(\S)/$1$2/g;' \
  > corpus.dan.plain &
job_pids+=("$!")

# Wait for every job individually: a bare `wait` always returns 0 and would
# hide failures. For the Danish pipeline only the status of its last stage is
# visible here, because pipefail is not enabled.
failed=0
for pid in "${job_pids[@]}"; do
  wait "$pid" || failed=1
done
if (( failed )); then
  echo "${0##*/}: at least one rendering job failed; see the errors above" >&2
  exit 1
fi
# Stage 3: hold out the tail of every corpus for validation and test.

#######################################
# Split a corpus into a training part and two held-out parts.
#
# The last HOLDOUT lines are removed from the training part; the first half of
# them becomes the validation set and the last half the test set. With an odd
# HOLDOUT the middle line ends up in neither held-out file.
#
# Arguments: $1 - input file
#            $2 - output: everything except the last HOLDOUT lines
#            $3 - output: validation set
#            $4 - output: test set
#            $5 - HOLDOUT, number of held-out lines (default 5000)
# Outputs:   a warning on stderr when the input has no more than HOLDOUT
#            lines; the files are still written, but the training part is then
#            empty and validation/test overlap if the input is shorter.
# Returns:   exit status of the last tail call
#######################################
split_corpus() {
  local src=$1 body_out=$2 val_out=$3 test_out=$4
  local holdout=${5:-5000}
  local half=$(( holdout / 2 ))
  local total

  if [[ -r "$src" ]]; then
    total=$(wc -l < "$src")
    if (( total <= holdout )); then
      printf '%s: warning: %s has only %s lines (hold-out is %s): training part is empty, validation/test may overlap\n' \
        "${0##*/}" "$src" "$((total))" "$holdout" >&2
    fi
  fi

  # Negative line count (GNU head): print all but the last $holdout lines.
  head -n "-${holdout}" -- "$src" > "$body_out"
  # Piping replaces the former scratch file; the selected lines are the same.
  tail -n "$holdout" -- "$src" | head -n "$half" > "$val_out"
  # The last $half lines of the hold-out are the last $half lines of the file.
  tail -n "$half" -- "$src" > "$test_out"
}

split_corpus corpus.dan.plain \
  corpus.dan.plain.body corpus.dan.plain.val corpus.dan.plain.test
# The combined hold-out file is not read again in this script, but it has
# always been produced, so it is kept for anything outside that relies on it.
tail -n 5000 corpus.dan.plain > corpus.dan.plain.val-test

for DIR in plain tags tags-pua tags-sem tags-sem-pua tags-syntax tags-syntax-pua; do
  mkdir -pv "$DIR"
  split_corpus "corpus.kal.${DIR}" \
    "${DIR}/corpus.kal" "${DIR}/corpus.kal.val" "${DIR}/corpus.kal.test"
done
# Stage 4a: learn one joint BPE model per representation from the kal training
# part of that representation plus the Danish training part (-s 10000 is
# passed to subword-nmt learn-bpe). All seven jobs run in parallel.
job_pids=()
for DIR in plain tags tags-pua tags-sem tags-sem-pua tags-syntax tags-syntax-pua; do
  cat "${DIR}/corpus.kal" corpus.dan.plain.body \
    | subword-nmt learn-bpe -s 10000 > "${DIR}/tokens.bpe" &
  job_pids+=("$!")
done

# Wait per PID so a failed learner is noticed; a bare `wait` always returns 0
# and the next stage would then encode with a broken tokens.bpe.
failed=0
for pid in "${job_pids[@]}"; do
  wait "$pid" || failed=1
done
if (( failed )); then
  echo "${0##*/}: learn-bpe failed for at least one representation" >&2
  exit 1
fi
# Stage 4b: encode the training, validation and test files of both languages
# with the BPE model of each representation.

#######################################
# Start one apply-bpe job in the background and remember its PID.
# Globals:   job_pids (appended to)
# Arguments: $1 - BPE codes file
#            $2 - input text file
#            $3 - output file
#######################################
start_apply_bpe() {
  subword-nmt apply-bpe -c "$1" < "$2" > "$3" &
  job_pids+=("$!")
}

# All jobs of all representations (6 per directory) are started at once.
job_pids=()
for DIR in plain tags tags-pua tags-sem tags-sem-pua tags-syntax tags-syntax-pua; do
  codes="${DIR}/tokens.bpe"
  start_apply_bpe "$codes" corpus.dan.plain.body "${DIR}/corpus.dan.bpe"
  start_apply_bpe "$codes" corpus.dan.plain.val "${DIR}/corpus.dan.val.bpe"
  start_apply_bpe "$codes" corpus.dan.plain.test "${DIR}/corpus.dan.test.bpe"
  start_apply_bpe "$codes" "${DIR}/corpus.kal" "${DIR}/corpus.kal.bpe"
  start_apply_bpe "$codes" "${DIR}/corpus.kal.val" "${DIR}/corpus.kal.val.bpe"
  start_apply_bpe "$codes" "${DIR}/corpus.kal.test" "${DIR}/corpus.kal.test.bpe"
done

# Wait per PID so that a failed or unreadable input is not silently turned
# into an empty .bpe file that the vocabulary step would then consume.
failed=0
for pid in "${job_pids[@]}"; do
  wait "$pid" || failed=1
done
if (( failed )); then
  echo "${0##*/}: apply-bpe failed for at least one file" >&2
  exit 1
fi
# Stage 4c: build one Marian vocabulary per representation from every
# corpus.*.bpe file in its directory (--max-size 32000 is passed to
# marian-vocab).
# NOTE(review): the glob also matches the .val.bpe and .test.bpe files, so the
# vocabulary is computed over the held-out data as well - confirm that this is
# intended. It also picks up stale corpus.*.bpe files left by earlier runs.
job_pids=()
for DIR in plain tags tags-pua tags-sem tags-sem-pua tags-syntax tags-syntax-pua; do
  cat "${DIR}"/corpus.*.bpe \
    | ~/oqaa/nmt/marian/build/marian-vocab --max-size 32000 > "${DIR}/vocab.yaml" &
  job_pids+=("$!")
done

# Wait per PID: a bare `wait` always returns 0 and would hide a failed
# marian-vocab run behind an empty or partial vocab.yaml.
failed=0
for pid in "${job_pids[@]}"; do
  wait "$pid" || failed=1
done
if (( failed )); then
  echo "${0##*/}: marian-vocab failed for at least one representation" >&2
  exit 1
fi
# Stage 5: give every representation directory the same set of helper links.

#######################################
# Create (or replace) the shared symlinks inside one representation directory.
#
# corpus.dan points at the Danish *training part* (corpus.dan.plain.body), so
# it lines up with <dir>/corpus.kal and <dir>/corpus.dan.bpe, which are built
# from the same lines. It used to point at the full corpus.dan.plain, which is
# 5000 lines longer and contains the validation and test sentences.
#
# Arguments: $1 - representation directory (must already exist)
# Returns:   exit status of the last ln call
#######################################
link_variant_files() {
  local dir=$1
  ln -sf ~/oqaa/nmt/mosesdecoder/scripts/generic/multi-bleu-detok.perl \
    "${dir}/multi-bleu-detok.perl"
  # Relative targets are resolved from inside ${dir}, hence the leading "../".
  ln -sf ../corpus.dan.plain.body "${dir}/corpus.dan"
  ln -sf ../corpus.dan.plain.val "${dir}/corpus.dan.val"
  ln -sf ../corpus.dan.plain.test "${dir}/corpus.dan.test"
  ln -sf ../validate.sh "${dir}/validate.sh"
}

for DIR in plain tags tags-pua tags-sem tags-sem-pua tags-syntax tags-syntax-pua; do
  link_variant_files "$DIR"
done