#!/bin/bash

# Task: Bundles a CorTeX dataset into 3 ZIP archives, one per severity
#       (no-problem, warning, error).
#
# Prerequisite (environment variables):
#   PGPASSWORD=<the cortex user password>
#   PGADDRESS=<the postgresql ip|localhost>
#   DTPATH=<the path for the created dataset>
#   CORPUSNAME=<the name of the corpus>
#   CORPUSID=<PG database id of this corpus>
#   CORPUSBASE=<base file system path of the corpus>
#   SERVICEID=<PG database id of this service>
#
# Example:
#   PGPASSWORD=cortex PGADDRESS=10.188.48.220 CORPUSNAME=arxmliv CORPUSID=8 SERVICEID=3 CORPUSBASE=/data/arxmliv DTPATH=/data/datasets/dataset-arXMLiv-2022 ./scripts/bundle-html-dataset-by-severity.sh
#
# Outputs (all inside $DTPATH):
#   $CORPUSNAME-<severity>-tasks.txt   task entries selected for that severity
#   $CORPUSNAME-<severity>.zip         archive of <severity>/<yymm>/<paper>.html
#
# Exit status: 0 on success; non-zero when the configuration is invalid, a
# task list cannot be fetched, or an archive cannot be created.

readonly SEVERITIES=(no-problem warning error)

# Prints a message to stderr and aborts the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Prints the task status code stored in the database for severity $1.
status_for_severity() {
  case "$1" in
    no-problem) printf '%s\n' '-1' ;;
    warning) printf '%s\n' '-2' ;;
    error) printf '%s\n' '-3' ;;
    *) return 1 ;;
  esac
}

# Aborts unless every variable the script depends on is set and well-formed.
check_config() {
  local name
  for name in PGADDRESS DTPATH CORPUSNAME CORPUSID CORPUSBASE SERVICEID; do
    [[ -n "${!name:-}" ]] || die "$name must be set (see the header of this script)"
  done
  # Both ids are pasted into the SQL text below (psql -c does not interpolate
  # psql variables), so only plain integers are accepted.
  [[ "$CORPUSID" =~ ^[0-9]+$ ]] || die "CORPUSID must be an integer, got '$CORPUSID'"
  [[ "$SERVICEID" =~ ^[0-9]+$ ]] || die "SERVICEID must be an integer, got '$SERVICEID'"
}

# Writes the entries of all tasks with severity $1 to
# $DTPATH/$CORPUSNAME-$1-tasks.txt; aborts if the query fails.
fetch_task_list() {
  local severity=$1
  local status
  status=$(status_for_severity "$severity") || die "unknown severity '$severity'"
  psql -h "$PGADDRESS" -U cortex -t \
    -o "$DTPATH/$CORPUSNAME-$severity-tasks.txt" \
    -c "SELECT entry FROM tasks WHERE corpus_id=$CORPUSID and service_id=$SERVICEID and status=$status order by entry" \
    || die "could not fetch the $severity task list from $PGADDRESS"
}

# Extracts the HTML from archive $1 into directory $2 and stores it as $3.
# Does nothing when $3 already exists, so interrupted runs can be resumed.
extract_html() {
  local source_zip=$1
  local target_dir=$2
  local html_file=$3
  local extracted

  if [[ -f "$html_file" ]]; then
    return 0
  fi
  # '*.html' is quoted so that unzip, not the shell, matches it against the
  # archive members. The extracted path is recovered from unzip's report.
  extracted=$(unzip -n "$source_zip" '*.html' -d "$target_dir" </dev/null \
    | grep -Eo '\S*\.html')
  # Rename only when exactly one file was reported; several names make this
  # test fail and the files keep the names they have inside the archive.
  if [[ -f "$extracted" && "$extracted" != "$html_file" ]]; then
    mv -f -- "$extracted" "$html_file"
  fi
}

# Unpacks the HTML of every task listed for severity $1 into
# $DTPATH/$1/<yymm>/<paper>.html. The yymm directory is taken from each entry
# (expected shape: $CORPUSBASE/<yymm>/<paper>/...), so any year is handled.
unpack_severity() {
  local severity=$1
  local tasks_file="$DTPATH/$CORPUSNAME-$severity-tasks.txt"
  local base=${CORPUSBASE%/}
  local entry_re='^([0-9]{2}(0[1-9]|1[0-2]))/([a-z0-9._-]+)/'
  local entry rel yymm paper month_dir
  local current_yymm=''

  # Default IFS on purpose: read trims the whitespace psql may pad rows with.
  while read -r entry || [[ -n "$entry" ]]; do
    if [[ -z "$entry" ]]; then
      continue
    fi
    # Strip the corpus base literally; it is a path, not a regex.
    rel=${entry#"$base"/}
    if [[ "$rel" == "$entry" ]] || ! [[ "$rel" =~ $entry_re ]]; then
      printf 'warning: skipping unrecognized entry: %s\n' "$entry" >&2
      continue
    fi
    yymm=${BASH_REMATCH[1]}
    paper=${BASH_REMATCH[3]}
    month_dir="$DTPATH/$severity/$yymm"

    if [[ "$yymm" != "$current_yymm" ]]; then
      echo "-- copy papers for $severity:$yymm"
      mkdir -p -- "$month_dir" || die "cannot create $month_dir"
      current_yymm=$yymm
    fi
    extract_html "$base/$yymm/$paper/tex_to_html.zip" "$month_dir" \
      "$month_dir/$paper.html"
  done <"$tasks_file"
}

# Zips $DTPATH/$1 into $DTPATH/$CORPUSNAME-$1.zip and removes the unpacked
# directory afterwards. Returns non-zero, keeping the directory, if zip fails.
archive_severity() {
  local severity=$1

  echo "-- create archive for $severity"
  # Subshell: the working directory of the script itself stays untouched, so
  # a relative DTPATH keeps resolving correctly for the next severity.
  if ! (cd "$DTPATH" && zip -9 -v -r "$CORPUSNAME-$severity.zip" "$severity"); then
    printf 'error: could not create the archive for %s; keeping %s\n' \
      "$severity" "$DTPATH/$severity" >&2
    return 1
  fi
  echo "-- free space for $severity"
  rm -rf -- "${DTPATH:?}/${severity:?}"
}

main() {
  local severity
  local failures=0

  check_config
  mkdir -p -- "$DTPATH" || die "cannot create $DTPATH"

  echo "1. Obtain the task lists..."
  for severity in "${SEVERITIES[@]}"; do
    fetch_task_list "$severity"
  done

  echo "2. Unpack into yymm (year-month) directories..."
  for severity in "${SEVERITIES[@]}"; do
    # make it resumable: a severity whose .zip already exists is complete
    if [[ -f "$DTPATH/$CORPUSNAME-$severity.zip" ]]; then
      continue
    fi
    mkdir -p -- "$DTPATH/$severity" || die "cannot create $DTPATH/$severity"
    unpack_severity "$severity"
    archive_severity "$severity" || failures=$((failures + 1))
  done

  if ((failures > 0)); then
    die "$failures archive(s) could not be created"
  fi
  echo "Done!"
}

main "$@"
exit "$?"
0 commit comments