Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 49 additions & 22 deletions pdf2archive
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# PDF2ARCHIVE 0.4-alpha
# PDF2ARCHIVE 0.4
# (C) 2018 Matteo Seclì <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
Expand All @@ -19,14 +19,16 @@

#=====# INITIALIZE VARIABLES #=====#
unset CDPATH
VERSION="0.4-alpha"
VERSION="0.4"
INPUT=""
OUTPUT=""
QUALITYOPTS=""
DEBUG=false
VALIDATE=false
MSGOPTS="-dQUIET -sstdout=/dev/null"
VERAMSGOPTS=""
GSBIN="gs"
PDFTKBIN="pdftk"
#ERROPTS="2>/dev/null"

# TODO: REPLACE == WITH = FOR COMPATIBILITY?
Expand Down Expand Up @@ -78,6 +80,7 @@ OPTIONS:
--validate Validate the resulting file. The validation is done with
VeraPDF, you need a working Java installation.
--debug Write additional debug information on screen
--gspath=<value> Optional path to ghostscript binary. Example /usr/bin/gs
-v, --version Show the program version

LICENSE:
Expand Down Expand Up @@ -117,7 +120,7 @@ get_source_path() {
install() {
if [ "$(which tar)" == "" ]; then
echo " ERROR: tar binary not found!"
exit
exit 1
fi
echo "=== Welcome to the installer of PDF2ARCHIVE ==="
INSTALLPREFIX="$1"
Expand Down Expand Up @@ -234,7 +237,7 @@ download_latest() {
update() {
if [ "$(which curl)" == "" ]; then
echo " ERROR: curl binary not found!"
exit
exit 1
fi
LATESTVERSION=$(curl -s "https://github.com/matteosecli/pdf2archive/releases/latest" | grep -o 'tag/[v.0-9]*' | awk -F/v '{print $2}')
[ -z "$LATESTVERSION" ] && { echo " Could not determine the latest version! Check your internet connection."; exit; }
Expand Down Expand Up @@ -293,11 +296,6 @@ run() {
}


#=====# CHECKS #=====#
if [ "$(which gs)" == "" ]; then
echo " ERROR: Ghostscript is not installed or it's not in the path"
exit
fi


#=====# INPUT PARSER #=====#
Expand Down Expand Up @@ -350,6 +348,9 @@ while [ "$1" != "" ]; do
exit 1
fi
;;
--gspath)
GSBIN="$VALUE"
;;
--cleanmetadata)
[ -z ${PDFTITLE+x} ] && PDFTITLE=""
[ -z ${PDFAUTHOR+x} ] && PDFAUTHOR=""
Expand Down Expand Up @@ -380,7 +381,7 @@ while [ "$1" != "" ]; do
fi
VALIDATE=true
;;
*.pdf)
*)
if [ "$INPUT" == "" ]; then
INPUT=$PARAM
elif [ "$OUTPUT" == "" ]; then
Expand All @@ -391,27 +392,51 @@ while [ "$1" != "" ]; do
exit 1
fi
;;
*)
echo " ERROR: unknown parameter \"$PARAM\""
help
exit 1
;;
#*)
# echo " ERROR: unknown parameter \"$PARAM\""
# help
# exit 1
# ;;
esac
shift
done

#=====# CHECKS #=====#
# Abort early when the Ghostscript binary cannot be resolved. GSBIN defaults
# to "gs" and may be overridden with --gspath. It is quoted so that a path
# containing spaces is looked up as a single word, and the lookup uses the
# shell builtin `command -v` rather than the external `which`.
if ! command -v "$GSBIN" >/dev/null 2>&1; then
    echo " ERROR: Ghostscript is not installed or it's not in the path"
    exit 1
fi

#=====# SET UP ALL THE STUFF #=====#
echo "=== Welcome to PDF2ARCHIVE ==="
# When no output name was given, derive one from the input name by replacing
# a trailing ".pdf" (if present) with "-PDFA.pdf".
if [ "$OUTPUT" == "" ]; then
    OUTPUT="${INPUT%.pdf}-PDFA.pdf"
fi

get_source_path
# Create the scratch file and scratch directory. Stop immediately if mktemp
# fails, otherwise the later steps would run with empty paths.
TMPFILE=$(mktemp) || { echo " ERROR: could not create a temporary file"; exit 1; }
TMPDIR=$(mktemp -d) || { echo " ERROR: could not create a temporary directory"; exit 1; }
# Auxiliary files handed to Ghostscript live inside the scratch directory.
PSTMPFILE="$TMPDIR/PDFA_def.ps"
ICCTMPFILE="$TMPDIR/AdobeRGB1998.icc"
INFOTMPFILE="$TMPDIR/pdf_minimal_info.ps"
# Working copy of the input PDF; the template is quoted so that a scratch
# directory path containing spaces is not word-split.
TMPINPUTFILE=$(mktemp "$TMPDIR/XXXXXXXXXX.pdf") || { echo " ERROR: could not create a temporary file"; exit 1; }

# Prepare the working copy of the input PDF ($TMPINPUTFILE).
# Without pdftk the input is copied unchanged. With pdftk the metadata is
# dumped to $TMPFILE, the "&#8211;" (en dash) entity is replaced with a plain
# "-", and the edited metadata is written back into the working copy.
# All paths are quoted so that file names containing spaces work.
if ! command -v "$PDFTKBIN" >/dev/null 2>&1; then
    echo " WARN: pdftk is not installed or it's not in the path"
    cp -- "$INPUT" "$TMPINPUTFILE"
else
    echo " Clean pdf metadata from wrong encoding and invalid utf-8 characters"
    # Export metadata
    echo " Export metadata to $TMPFILE"
    "$PDFTKBIN" "$INPUT" dump_data output "$TMPFILE"
    # Replace en dash with dash.
    # `-i.bak` is accepted by both GNU and BSD sed; a bare `-i -e` makes BSD
    # sed treat "-e" as the backup suffix and leave a stray file behind.
    echo " Replace en dash in $TMPFILE"
    sed -i.bak -e 's/&#8211;/-/g' "$TMPFILE" && rm -f -- "$TMPFILE.bak"
    # Import metadata again
    echo " Import metadata into $TMPINPUTFILE"
    "$PDFTKBIN" "$INPUT" update_info "$TMPFILE" output "$TMPINPUTFILE"
fi

echo \
"%!PS
% Extract PDF info in a minimal way.
Expand Down Expand Up @@ -442,7 +467,7 @@ quit
# use 'LC_CTYPE=C && LANG=C && echo "$METADUMP" ...' in the
# variable assignments; however, this produces bad PDF files.
#
METADUMP=$(gs -dNODISPLAY -q -sFile="$INPUT" $INFOTMPFILE | iconv -f utf-8 -t utf-8 -c)
METADUMP=$($GSBIN -dNODISPLAY -q -sFile="$TMPINPUTFILE" $INFOTMPFILE | iconv -f utf-8 -t utf-8 -c)
[ -z ${PDFTITLE+x} ] && PDFTITLE=$(echo "$METADUMP" | grep "__knowninfoTitle: " | sed "s/^__knowninfoTitle: //g")
[ -z ${PDFAUTHOR+x} ] && PDFAUTHOR=$(echo "$METADUMP" | grep "__knowninfoAuthor: " | sed "s/^__knowninfoAuthor: //g")
[ -z ${PDFSUBJECT+x} ] && PDFSUBJECT=$(echo "$METADUMP" | grep "__knowninfoSubject: " | sed "s/^__knowninfoSubject: //g")
Expand All @@ -465,7 +490,7 @@ fi
#=====# PRINT DEBUG INFO #=====#
if $DEBUG; then
echo " DEBUG: running PDF2ARCHIVE, version $VERSION"
echo " DEBUG: using Ghostscript binary at $(which gs), version $(gs --version)"
echo " DEBUG: using Ghostscript binary at $(which $GSBIN), version $($GSBIN --version)"
echo " DEBUG: the input file is '$INPUT'"
echo " DEBUG: the output file is '$OUTPUT'"
echo " DEBUG: the intermediate processing file is $TMPFILE"
Expand Down Expand Up @@ -544,28 +569,30 @@ echo -n -e "\\x00\\x00\\x02\\x30\\x41\\x44\\x42\\x45\\x02\\x10\\x00\\x00\\x6d\\x


#=====# DO THE ACTUAL CONVERSION #=====#
# -dUseCIEColor is no longer passed to Ghostscript because it is deprecated.
echo " Compressing PDF & embedding fonts..."
run gs $MSGOPTS \
run $GSBIN $MSGOPTS \
-dBATCH -dNOPAUSE -dNOOUTERSAVE \
-dCompatibilityLevel=1.4 \
-dEmbedAllFonts=true -dSubsetFonts=true \
-dCompressFonts=true -dCompressPages=true \
-dUseCIEColor -sColorConversionStrategy=RGB \
-sColorConversionStrategy=RGB \
-dDownsampleMonoImages=false -dDownsampleGrayImages=false -dDownsampleColorImages=false \
-dAutoFilterColorImages=false -dAutoFilterGrayImages=false \
-sDEVICE=pdfwrite \
-sOutputFile=$TMPFILE $INPUT
-sOutputFile=$TMPFILE $TMPINPUTFILE
echo " Converting to PDF/A-1B..."
run gs $MSGOPTS \
run $GSBIN $MSGOPTS \
-dPDFA=1 -dBATCH -dNOPAUSE -dNOOUTERSAVE \
$QUALITYOPTS \
-dCompatibilityLevel=1.4 -dPDFACompatibilityPolicy=1 \
-dUseCIEColor -sProcessColorModel=DeviceRGB -sColorConversionStrategy=RGB \
-sProcessColorModel=DeviceRGB -sColorConversionStrategy=RGB \
-sOutputICCProfile=$ICCTMPFILE \
-sDEVICE=pdfwrite \
-sOutputFile=$OUTPUT $TMPFILE $PSTMPFILE
echo " Removing temporary files..."
rm $TMPFILE
rm $TMPINPUTFILE
echo " Done, now ESSE3 is happy! ;)"


Expand Down