Skip to content

Commit 4dd204b

Browse files
committed
dataset bundler updates
1 parent ea1c505 commit 4dd204b

File tree

2 files changed

+72
-1
lines changed

2 files changed

+72
-1
lines changed
+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
#!/bin/bash

# Task: Bundles a CorTeX dataset into 3 ZIP archives, one per severity
#       (no-problem, warning, error).
#
# Prerequisite (environment variables):
#   PGPASSWORD=<the cortex user password>   (consumed by psql itself)
#   PGADDRESS=<the postgresql ip|localhost>
#   DTPATH=<the path for the created dataset>
#   CORPUSNAME=<the name of the corpus>
#   CORPUSID=<PG database id of this corpus>
#   CORPUSBASE=<base file system path of the corpus>
#   SERVICEID=<PG database id of this service>
#
# Example:
#   PGPASSWORD=cortex PGADDRESS=10.188.48.220 CORPUSNAME=arxmliv CORPUSID=8 SERVICEID=3 CORPUSBASE=/data/arxmliv DTPATH=/data/datasets/dataset-arXMLiv-2022 ./scripts/bundle-html-dataset-by-severity.sh
#
# Resumability: a severity whose "$DTPATH/$CORPUSNAME-<severity>.zip" already
# exists is skipped, and already-extracted HTML files are not unpacked again.

set -u

# Print an error message to stderr and abort the script.
# NB: only call this from the top-level shell; inside a pipeline stage it
# would merely terminate that stage's subshell.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Refuse to run with missing configuration: several paths below are handed to
# "rm -rf", so an empty DTPATH must never be allowed through.
: "${PGADDRESS:?PGADDRESS must be set}"
: "${DTPATH:?DTPATH must be set}"
: "${CORPUSNAME:?CORPUSNAME must be set}"
: "${CORPUSID:?CORPUSID must be set}"
: "${CORPUSBASE:?CORPUSBASE must be set}"
: "${SERVICEID:?SERVICEID must be set}"

# The ids are interpolated into SQL text, so only plain integers are accepted.
[[ "$CORPUSID" =~ ^[0-9]+$ ]] || die "CORPUSID must be an integer, got '$CORPUSID'"
[[ "$SERVICEID" =~ ^[0-9]+$ ]] || die "SERVICEID must be an integer, got '$SERVICEID'"

# Tolerate a trailing slash on the corpus base when matching task entries.
corpus_base="${CORPUSBASE%/}"

mkdir -p -- "$DTPATH" || die "cannot create dataset directory $DTPATH"

echo "1. Obtain the task lists..."

# Write the entries of all tasks with the given status code into
# "$DTPATH/$CORPUSNAME-<severity>-tasks.txt".
# Arguments: $1 - severity name used in the file name
#            $2 - numeric task status to select
fetch_task_list() {
  local severity=$1
  local status=$2
  psql -h "$PGADDRESS" -U cortex -t \
    -o "$DTPATH/$CORPUSNAME-$severity-tasks.txt" \
    -c "SELECT entry FROM tasks WHERE corpus_id=$CORPUSID and service_id=$SERVICEID and status=$status order by entry" \
    || die "could not obtain the $severity task list from $PGADDRESS"
}

fetch_task_list no-problem -1
fetch_task_list warning -2
fetch_task_list error -3

echo "2. Unpack into yymm (year-month) directories..."

# Two-digit years from 1991 up to the current year, so the list no longer
# needs a manual bump every January. Never shorter than the former
# hard-coded list (91..24), even if the system clock is off.
last_year=$(date +%Y)
if ! [[ "$last_year" =~ ^[0-9]{4}$ ]] || (( last_year < 2024 )); then
  last_year=2024
fi
all_years=()
for (( year = 1991; year <= last_year; year++ )); do
  printf -v yy '%02d' "$(( year % 100 ))"
  all_years+=("$yy")
done
all_months=(01 02 03 04 05 06 07 08 09 10 11 12)
severities=(no-problem warning error)

for severity in "${severities[@]}"; do
  archive="$DTPATH/$CORPUSNAME-$severity.zip"
  staging="$DTPATH/$severity"
  tasks_file="$DTPATH/$CORPUSNAME-$severity-tasks.txt"

  # make it resumable: a finished archive means this severity is done
  if [[ -f "$archive" ]]; then
    continue
  fi
  mkdir -p -- "$staging" || die "cannot create staging directory $staging"

  for yy in "${all_years[@]}"; do
    for mm in "${all_months[@]}"; do
      yymm="$yy$mm"
      target_dir="$staging/$yymm"
      echo "-- copy papers for $severity:$yymm"
      # "read" is deliberately used without IFS= so that any blank padding
      # around the entries in the psql output file is trimmed away.
      grep -E -o ".+/$yymm/.+/" "$tasks_file" | while read -r line; do
        # Expected shape: <CORPUSBASE>/<digits>/<paper id>/...
        # The quoted base is matched literally, not as a regular expression.
        if ! [[ "$line" =~ ^("$corpus_base"/[0-9]*/([a-z0-9._-]+)) ]]; then
          printf 'warning: skipping unexpected task entry: %s\n' "$line" >&2
          continue
        fi
        paper_dir="${BASH_REMATCH[1]}"
        paper_id="${BASH_REMATCH[2]}"
        html_file="$target_dir/$paper_id.html"

        mkdir -p -- "$target_dir" || continue
        if [[ -f "$html_file" ]]; then # skip unzipping existing files
          continue
        fi

        # '*.html' is quoted so that unzip, not the shell, expands it.
        # stdin is detached so unzip can never swallow the task list.
        extracted=$(unzip -n "$paper_dir/tex_to_html.zip" '*.html' -d "$target_dir" < /dev/null \
          | grep -E -o '\S*\.html')
        # Normalize the extracted file name to <paper id>.html
        if [[ -f "$extracted" && "$extracted" != "$html_file" ]]; then
          mv -f -- "$extracted" "$html_file"
        fi
      done
    done
  done

  echo "-- create archive for $severity"
  # Run in a subshell: the working directory of the script stays untouched,
  # so a relative DTPATH keeps working for the next severity.
  ( cd -- "$DTPATH" && zip -9 -v -r "$CORPUSNAME-$severity.zip" "$severity" ) \
    || die "archiving $severity failed; extracted files were kept in $staging"

  # Only reached when the archive was written successfully.
  echo "-- free space for $severity"
  rm -rf -- "${staging:?}"
done

echo "Done!"
exit 0

scripts/bundle-html-dataset.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ psql -h $PGADDRESS -U cortex -t -o "$DTPATH/$CORPUSNAME-error-tasks.txt" -c "SEL
2222

2323
echo "2. Unpack into yymm (year-month) directories..."
2424

25-
all_years="91 92 93 94 95 96 97 98 99 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22"
25+
all_years="91 92 93 94 95 96 97 98 99 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24"
2626
all_months="01 02 03 04 05 06 07 08 09 10 11 12"
2727
severitylist="no_problem warning error"
2828

0 commit comments

Comments
 (0)