Skip to content

Commit 84734f2

Browse files
committed
experiment: bash script for bundling 08.2018 arxmliv
1 parent 7daf852 commit 84734f2

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

scripts/bundle-html-dataset.sh

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
2+
3+
# Prerequisites (environment variables that must be set before running):
4+
# PGPASSWORD=<the cortex user password>
5+
# PGADDRESS=<the postgresql ip|localhost>
6+
# DTPATH=<the path for the created dataset>
7+
# CORPUSNAME=<the name of the corpus>
8+
# CORPUSID=<PG database id of this corpus>
9+
# CORPUSBASE=<base file system path of the corpus>
10+
# SERVICEID=<PG database id of this service>
11+
#
12+
# Example:
13+
# PGPASSWORD=cortex PGADDRESS=10.188.48.220 CORPUSNAME=arxmliv CORPUSID=8 SERVICEID=3 CORPUSBASE=/data/arxmliv DTPATH=/data/datasets/dataset-arXMLiv-08-2018 ./scripts/bundle-html-dataset.sh
14+
15+
# Abort early, with a clear message, when a required setting is missing.
# Without these checks an unset DTPATH would make the commands below write
# to paths such as "/-no_problem-tasks.txt", and unset ids would produce
# malformed SQL.
: "${PGADDRESS:?PGADDRESS must be set to the PostgreSQL host}"
: "${DTPATH:?DTPATH must be set to the dataset output directory}"
: "${CORPUSNAME:?CORPUSNAME must be set to the corpus name}"
: "${CORPUSID:?CORPUSID must be set to the corpus database id}"
: "${SERVICEID:?SERVICEID must be set to the service database id}"

# The ids are interpolated into the SQL text below, so accept plain integers only.
for id_value in "$CORPUSID" "$SERVICEID"; do
  if [[ ! $id_value =~ ^[0-9]+$ ]]; then
    printf 'error: CORPUSID and SERVICEID must be non-negative integers (got "%s")\n' \
      "$id_value" >&2
    exit 1
  fi
done

mkdir -p -- "$DTPATH" || exit 1

## Obtain the task lists
# One "<severity>:<task status code>" pair per exported list. Each query
# writes the matching task entries, one per line, to
# "$DTPATH/$CORPUSNAME-<severity>-tasks.txt".
for task_list in no_problem:-1 warning:-2 error:-3; do
  list_severity=${task_list%%:*}
  list_status=${task_list##*:}

  # Stop on failure: continuing would bundle an empty or stale task list.
  if ! psql -h "$PGADDRESS" -U cortex -t \
    -o "$DTPATH/$CORPUSNAME-$list_severity-tasks.txt" \
    -c "SELECT entry FROM tasks WHERE corpus_id=$CORPUSID and service_id=$SERVICEID and status=$list_status"; then
    printf 'error: could not export the %s task list\n' "$list_severity" >&2
    exit 1
  fi
done
21+
22+
# For each severity, prepare a dataset archive of HTML files
#
# Reads:  $DTPATH/$CORPUSNAME-<severity>-tasks.txt (one task entry per line)
#         <entry directory>/tex_to_html.zip for every listed entry
# Writes: $DTPATH/$CORPUSNAME-<severity>.zip
# Exits non-zero if a required variable is missing, the staging directory
# cannot be created, or zip fails.

# Guard the variables used to build paths; in particular an unset DTPATH
# would otherwise turn the clean-up below into "rm -rf /<severity>".
: "${DTPATH:?DTPATH must be set to the dataset output directory}"
: "${CORPUSNAME:?CORPUSNAME must be set to the corpus name}"
: "${CORPUSBASE:?CORPUSBASE must be set to the corpus base path}"

severities=(no_problem warning error)

# Regex tail appended to the literal corpus base:
# "/<numeric year directory>/<paper directory>".
entry_tail='/([0-9]+)/([a-z0-9._-]+)'

for severity in "${severities[@]}"; do
  stage_dir="$DTPATH/$severity"
  mkdir -p -- "$stage_dir" || exit 1

  # Keep only the directory part (everything up to the last "/") of each entry.
  # "read" is deliberately run with the default IFS so that the leading blank
  # psql -t puts in front of every value is stripped.
  grep -Eo '.+/' "$DTPATH/$CORPUSNAME-$severity-tasks.txt" | while read -r line; do
    # CORPUSBASE is quoted so it is matched literally, not as a regex.
    if [[ ! $line =~ ^"$CORPUSBASE"$entry_tail ]]; then
      printf 'warning: skipping unrecognized task entry: %s\n' "$line" >&2
      continue
    fi
    YEARDIR=${BASH_REMATCH[1]}
    SUBDIR=${BASH_REMATCH[2]}
    FULLDIR="$CORPUSBASE/$YEARDIR/$SUBDIR"

    # '*.html' is quoted so unzip, not the shell, expands the wildcard.
    # -o and the /dev/null stdin keep unzip from prompting and thereby
    # swallowing task lines from the surrounding pipeline.
    # The extracted path is recovered from unzip's report; paths containing
    # whitespace are not supported by this parsing.
    FILENAME=$(unzip -o "$FULLDIR/tex_to_html.zip" '*.html' -d "$stage_dir" </dev/null |
      grep -Eo '\S*\.html')

    # An empty result (missing/failed archive, no HTML member) or several
    # extracted files (value contains newlines) does not name a single file.
    if [[ -z $FILENAME || ! -f $FILENAME ]]; then
      printf 'warning: no single HTML file extracted from %s/tex_to_html.zip\n' "$FULLDIR" >&2
      continue
    fi

    mkdir -p -- "$stage_dir/$YEARDIR"
    mv -- "$FILENAME" "$stage_dir/$YEARDIR/$SUBDIR.html"
  done

  # Create the final dataset archive
  zip -9 -r "$DTPATH/$CORPUSNAME-$severity.zip" "$stage_dir/" || exit 1

  # Remove the staging directory once it has been archived.
  rm -rf -- "${DTPATH:?}/$severity"
done

exit 0

0 commit comments

Comments
 (0)