#!/bin/bash
#
# Bundle the HTML results of one corpus/service pair into one dataset archive
# per result severity (no_problem, warning, error).
#
# For each severity the script:
#   1. queries the "tasks" table for the entries with the matching status and
#      saves them to $DTPATH/$CORPUSNAME-<severity>-tasks.txt;
#   2. extracts the HTML file from each entry's tex_to_html.zip and stages it
#      as $DTPATH/<severity>/<year dir>/<sub dir>.html;
#   3. zips the staging directory into $DTPATH/$CORPUSNAME-<severity>.zip and
#      removes the staging directory again.
#
# Prerequisite:
# PGPASSWORD=<the cortex user password>
# PGADDRESS=<the postgresql ip|localhost>
# DTPATH=<the path for the created dataset>
# CORPUSNAME=<the name of the corpus>
# CORPUSID=<PG database id of this corpus>
# CORPUSBASE=<base file system path of the corpus>
# SERVICEID=<PG database id of this service>
#
# Example:
# PGPASSWORD=cortex PGADDRESS=10.188.48.220 CORPUSNAME=arxmliv CORPUSID=8 SERVICEID=3 CORPUSBASE=/data/arxmliv DTPATH=/data/datasets/dataset-arXMLiv-08-2018 ./scripts/bundle-html-dataset.sh
#
# Exit status: 0 on success, 1 on a fatal error (missing configuration,
# failed query, failed archive creation). Entries that cannot be staged are
# reported on stderr and skipped.

set -u

# Print a message to stderr and abort with status 1.
die() {
  printf '%s: error: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print a non-fatal diagnostic to stderr.
warn() {
  printf '%s: warning: %s\n' "${0##*/}" "$*" >&2
}

# Map a severity name to the task status value used in the SQL query.
status_for_severity() {
  case "$1" in
    no_problem) printf '%s\n' -1 ;;
    warning) printf '%s\n' -2 ;;
    error) printf '%s\n' -3 ;;
    *) return 1 ;;
  esac
}

# Write the entries of all tasks with the given severity to a file.
# Arguments: $1 - severity name, $2 - output file
fetch_task_list() {
  local severity=$1 outfile=$2 status
  status=$(status_for_severity "$severity") || die "unknown severity: $severity"
  psql -h "$PGADDRESS" -U cortex -t -o "$outfile" \
    -c "SELECT entry FROM tasks WHERE corpus_id=$CORPUSID and service_id=$SERVICEID and status=$status" \
    || die "could not obtain the $severity task list from $PGADDRESS"
}

# Extract the HTML file of one task entry and move it into the staging tree.
# Globals:   corpus_base, entry_re, scratch (read)
# Arguments: $1 - staging directory, $2 - task entry (path below corpus_base)
# Returns:   0 always; entries that cannot be staged are skipped with a warning
stage_entry() {
  local stage_dir=$1 entry=$2
  local rel year_dir sub_dir source_zip work candidate
  local html_file='' html_count=0

  # Strip the corpus base as a literal prefix (not as a regex), then pick the
  # numeric year directory and the paper directory from what is left.
  rel=${entry#"$corpus_base"/}
  if [[ $rel == "$entry" || ! $rel =~ $entry_re ]]; then
    warn "skipping entry outside the expected <base>/<year>/<dir>/ layout: $entry"
    return 0
  fi
  year_dir=${BASH_REMATCH[1]}
  sub_dir=${BASH_REMATCH[2]}
  source_zip="$corpus_base/$year_dir/$sub_dir/tex_to_html.zip"

  # Extract into a private, empty directory so that the result can be found
  # on disk instead of being parsed out of unzip's progress output.
  work="$scratch/work"
  rm -rf -- "$work"
  mkdir -- "$work" || die "cannot create $work"

  # -o: never prompt; stdin is detached so unzip cannot eat the task list.
  # The pattern is quoted so unzip, not the shell, expands it.
  unzip -o -q "$source_zip" '*.html' -d "$work" </dev/null

  while IFS= read -r candidate; do
    html_count=$((html_count + 1))
    [[ -n $html_file ]] || html_file=$candidate
  done < <(find "$work" -type f -name '*.html' | LC_ALL=C sort)

  if [[ -z $html_file ]]; then
    warn "no HTML file extracted from $source_zip"
    return 0
  fi
  if ((html_count > 1)); then
    warn "$source_zip holds $html_count HTML files; using ${html_file#"$work"/}"
  fi

  mkdir -p -- "$stage_dir/$year_dir" || die "cannot create $stage_dir/$year_dir"
  mv -- "$html_file" "$stage_dir/$year_dir/$sub_dir.html" \
    || warn "could not stage $html_file"
  return 0
}

# --- Configuration checks ---------------------------------------------------
# PGPASSWORD is read by psql itself and is deliberately not enforced here.
: "${PGADDRESS:?must be set to the postgresql address}"
: "${DTPATH:?must be set to the dataset output path}"
: "${CORPUSNAME:?must be set to the corpus name}"
: "${CORPUSID:?must be set to the corpus database id}"
: "${CORPUSBASE:?must be set to the corpus base path}"
: "${SERVICEID:?must be set to the service database id}"

# Both ids are interpolated into SQL, so only plain integers are accepted.
[[ $CORPUSID =~ ^[0-9]+$ ]] || die "CORPUSID must be an integer, got '$CORPUSID'"
[[ $SERVICEID =~ ^[0-9]+$ ]] || die "SERVICEID must be an integer, got '$SERVICEID'"

# Tolerate a trailing slash on the base path.
corpus_base=${CORPUSBASE%/}
# <year dir>/<sub dir>/ relative to the corpus base.
entry_re='^([0-9]+)/([a-z0-9._-]+)/'

mkdir -p -- "$DTPATH" || die "cannot create $DTPATH"

# Scratch space lives next to the staging directories (same file system, so
# mv is a rename) but outside of them, so it never ends up in an archive.
scratch=$(mktemp -d "$DTPATH/.extract.XXXXXX") || die "cannot create scratch directory"
trap 'rm -rf -- "$scratch"' EXIT

# --- For each severity, prepare a dataset archive of HTML files --------------
for severity in no_problem warning error; do
  task_list="$DTPATH/$CORPUSNAME-$severity-tasks.txt"
  stage_dir="$DTPATH/$severity"

  ## Obtain the task list
  fetch_task_list "$severity" "$task_list"

  # Start from an empty staging directory; leftovers of an aborted run must
  # not leak into the new archive.
  rm -rf -- "${stage_dir:?}"
  mkdir -p -- "$stage_dir" || die "cannot create $stage_dir"

  # read (without IFS=) trims the padding psql adds around each value.
  while read -r entry || [[ -n $entry ]]; do
    [[ -n $entry ]] || continue
    stage_entry "$stage_dir" "$entry"
  done <"$task_list"

  # Create the final dataset archive
  zip -9 -r "$DTPATH/$CORPUSNAME-$severity.zip" "$stage_dir/" \
    || die "could not create $DTPATH/$CORPUSNAME-$severity.zip"

  rm -rf -- "${stage_dir:?}"
done

exit 0
0 commit comments