Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ jobs:
homarus:
uses: ./.github/workflows/build.yml
with:
contexts: ${{ needs.crayfish.outputs.context }}
contexts: ${{ needs.scyllaridae.outputs.context }}
image: homarus
repository: ${{ vars.repository }}
tags: ${{ needs.tags.outputs.tags }}
Expand All @@ -141,7 +141,7 @@ jobs:
houdini:
uses: ./.github/workflows/build.yml
with:
contexts: ${{ needs.crayfish.outputs.context }}
contexts: ${{ needs.scyllaridae.outputs.context }}
image: houdini
repository: ${{ vars.repository }}
tags: ${{ needs.tags.outputs.tags }}
Expand All @@ -150,7 +150,7 @@ jobs:
hypercube:
uses: ./.github/workflows/build.yml
with:
contexts: ${{ needs.crayfish.outputs.context }}
contexts: ${{ needs.scyllaridae.outputs.context }}
image: hypercube
repository: ${{ vars.repository }}
tags: ${{ needs.tags.outputs.tags }}
Expand Down Expand Up @@ -192,6 +192,15 @@ jobs:
tags: ${{ needs.tags.outputs.tags }}
secrets: inherit
needs: [tags, base]
# Builds the mergepdf image through the reusable build workflow.
# NOTE(review): `needs` waits on scyllaridae, imagemagick and leptonica, but
# only the scyllaridae context is forwarded via `contexts` — confirm that
# build.yml can still resolve the imagemagick and leptonica images.
mergepdf:
uses: ./.github/workflows/build.yml
with:
contexts: ${{ needs.scyllaridae.outputs.context }}
image: mergepdf
repository: ${{ vars.repository }}
tags: ${{ needs.tags.outputs.tags }}
secrets: inherit
needs: [tags, scyllaridae, imagemagick, leptonica]
milliner:
uses: ./.github/workflows/build.yml
with:
Expand Down Expand Up @@ -412,6 +421,13 @@ jobs:
image: mariadb
secrets: inherit
needs: [mariadb]
# Runs the reusable test workflow against the digest produced by the
# mergepdf build job, so it only starts once that job has finished.
test-mergepdf:
uses: ./.github/workflows/test.yml
with:
digests: ${{ needs.mergepdf.outputs.digest }}
image: mergepdf
secrets: inherit
needs: [mergepdf]
test-milliner:
uses: ./.github/workflows/test.yml
with:
Expand Down
39 changes: 24 additions & 15 deletions alpaca/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
# syntax=docker/dockerfile:1.5.1
# Build stage: compiles Alpaca from source with a digest-pinned Gradle/JDK 11
# image. Only the jar produced here is copied into the final stage.
FROM gradle:6.9.4-jdk11@sha256:a21f8e1124fefb9b2affc85b7fa3bc856297056678b5cc882a61a8d6249da3a2 AS builder

# Git ref and repository to build from; both can be overridden with --build-arg.
# NOTE(review): a branch name is a moving target, so rebuilds are not
# reproducible unless this is set to a tag or commit — confirm this is intended.
ARG ALPACA_BRANCH="x-islandora-event-header"
ARG ALPACA_REPO="https://github.com/Islandora/Alpaca"

WORKDIR /build

# Clone into /build, check out the requested ref and build with the Gradle
# wrapper. /root/.gradle is a BuildKit cache mount (locked, so one build uses
# it at a time) and is not part of the resulting layer.
RUN --mount=type=cache,id=alpaca-gradle,sharing=locked,target=/root/.gradle \
git clone "${ALPACA_REPO}" . && \
git checkout "${ALPACA_BRANCH}" && \
./gradlew --stacktrace clean build shadowJar --no-daemon

FROM java

ARG TARGETARCH
ARG ALPACA_VERSION="2.2.0"
ARG ALPACA_FILE="islandora-alpaca-app-${ALPACA_VERSION}-all.jar"
ARG ALPACA_URL="https://repo1.maven.org/maven2/ca/islandora/alpaca/islandora-alpaca-app/${ALPACA_VERSION}/${ALPACA_FILE}"
ARG ALPACA_FILE_SHA256="5722306dd78f9fdc3d7a4248a527c439143a3472e5b2d4ea10601b0038b43923"

# Platform agnostic does not require arch specific identifier.
RUN --mount=type=cache,id=alpaca-downloads-${TARGETARCH},sharing=locked,target=/opt/downloads \
download.sh \
--url "${ALPACA_URL}" \
--sha256 "${ALPACA_FILE_SHA256}" \
&& \
mkdir -p /opt/alpaca && \
cp "${DOWNLOAD_CACHE_DIRECTORY}/${ALPACA_FILE}" "/opt/alpaca/alpaca.jar" && \
cleanup.sh
RUN mkdir -p /opt/alpaca

COPY --from=builder /build/islandora-alpaca-app/build/libs/islandora-alpaca-app-*-all.jar /opt/alpaca/alpaca.jar

RUN create-service-user.sh --name alpaca && \
RUN test -f /opt/alpaca/alpaca.jar && \
create-service-user.sh --name alpaca && \
cleanup.sh

ENV \
Expand Down Expand Up @@ -50,13 +53,19 @@ ENV \
ALPACA_DERIVATIVE_OCR_MAX_CONSUMERS=-1 \
ALPACA_DERIVATIVE_OCR_QUEUE=queue:islandora-connector-ocr \
ALPACA_DERIVATIVE_OCR_URL=http://hypercube:8080/ \
ALPACA_DERIVATIVE_SYSTEMS=fits,homarus,houdini,ocr,transkribus \
ALPACA_DERIVATIVE_SYSTEMS=fits,homarus,houdini,ocr,transkribus,mergepdf \
ALPACA_DERIVATIVE_TRANSKRIBUS_ASYNC_CONSUMER=true \
ALPACA_DERIVATIVE_TRANSKRIBUS_CONSUMERS=-1 \
ALPACA_DERIVATIVE_TRANSKRIBUS_ENABLED=false \
ALPACA_DERIVATIVE_TRANSKRIBUS_MAX_CONSUMERS=-1 \
ALPACA_DERIVATIVE_TRANSKRIBUS_QUEUE=queue:islandora-connector-transkribus \
ALPACA_DERIVATIVE_TRANSKRIBUS_URL=http://transkribus:5000/ \
ALPACA_DERIVATIVE_MERGEPDF_ASYNC_CONSUMER=true \
ALPACA_DERIVATIVE_MERGEPDF_CONSUMERS=-1 \
ALPACA_DERIVATIVE_MERGEPDF_ENABLED=false \
ALPACA_DERIVATIVE_MERGEPDF_MAX_CONSUMERS=-1 \
ALPACA_DERIVATIVE_MERGEPDF_QUEUE=queue:islandora-connector-mergepdf \
ALPACA_DERIVATIVE_MERGEPDF_URL=http://mergepdf:8080/ \
ALPACA_FCREPO_INDEXER_ASYNC_CONSUMER=true \
ALPACA_FCREPO_INDEXER_CONSUMERS=-1 \
ALPACA_FCREPO_INDEXER_ENABLED=true \
Expand Down
8 changes: 7 additions & 1 deletion alpaca/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,13 @@ additional settings, volumes, ports, etc.
| ALPACA_DERIVATIVE_OCR_MAX_CONSUMERS | -1 | |
| ALPACA_DERIVATIVE_OCR_QUEUE | queue:islandora-connector-ocr | ActiveMQ Queue to consume from |
| ALPACA_DERIVATIVE_OCR_URL | http://hypercube:8080/ | Url of micro-service |
| ALPACA_DERIVATIVE_SYSTEMS | fits,homarus,houdini,ocr | |
| ALPACA_DERIVATIVE_MERGEPDF_ASYNC_CONSUMER | true | |
| ALPACA_DERIVATIVE_MERGEPDF_CONSUMERS | -1 | |
| ALPACA_DERIVATIVE_MERGEPDF_ENABLED | false | |
| ALPACA_DERIVATIVE_MERGEPDF_MAX_CONSUMERS | -1 | |
| ALPACA_DERIVATIVE_MERGEPDF_QUEUE | queue:islandora-connector-mergepdf | ActiveMQ Queue to consume from |
| ALPACA_DERIVATIVE_MERGEPDF_URL | http://mergepdf:8080/ | Url of micro-service |
| ALPACA_DERIVATIVE_SYSTEMS | fits,homarus,houdini,ocr,transkribus,mergepdf | |
| ALPACA_FCREPO_INDEXER_ASYNC_CONSUMER | true | |
| ALPACA_FCREPO_INDEXER_CONSUMERS | -1 | |
| ALPACA_FCREPO_INDEXER_ENABLED | true | |
Expand Down
7 changes: 7 additions & 0 deletions alpaca/rootfs/etc/confd/templates/alpaca.properties.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,10 @@ derivative.transkribus.service.url={{ getenv "ALPACA_DERIVATIVE_TRANSKRIBUS_URL"
derivative.transkribus.concurrent-consumers={{ getenv "ALPACA_DERIVATIVE_TRANSKRIBUS_CONSUMERS" }}
derivative.transkribus.max-concurrent-consumers={{ getenv "ALPACA_DERIVATIVE_TRANSKRIBUS_MAX_CONSUMERS" }}
derivative.transkribus.async-consumer={{ getenv "ALPACA_DERIVATIVE_TRANSKRIBUS_ASYNC_CONSUMER" }}

# mergepdf derivative connector: every value is rendered from the matching
# ALPACA_DERIVATIVE_MERGEPDF_* environment variable.
derivative.mergepdf.enabled={{ getenv "ALPACA_DERIVATIVE_MERGEPDF_ENABLED" }}
derivative.mergepdf.in.stream={{ getenv "ALPACA_DERIVATIVE_MERGEPDF_QUEUE" }}
derivative.mergepdf.service.url={{ getenv "ALPACA_DERIVATIVE_MERGEPDF_URL" }}
derivative.mergepdf.concurrent-consumers={{ getenv "ALPACA_DERIVATIVE_MERGEPDF_CONSUMERS" }}
derivative.mergepdf.max-concurrent-consumers={{ getenv "ALPACA_DERIVATIVE_MERGEPDF_MAX_CONSUMERS" }}
derivative.mergepdf.async-consumer={{ getenv "ALPACA_DERIVATIVE_MERGEPDF_ASYNC_CONSUMER" }}
28 changes: 28 additions & 0 deletions docker-bake.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ IMAGES = [
"java",
"leptonica",
"mariadb",
"mergepdf",
"milliner",
"nginx",
"postgresql",
Expand Down Expand Up @@ -49,6 +50,7 @@ DEPENDENCIES = {
hypercube = ["scyllaridae", "leptonica"]
java = ["base"]
mariadb = ["base"]
mergepdf = ["scyllaridae", "imagemagick", "leptonica"]
milliner = ["crayfish"]
nginx = ["base"]
postgresql = ["base"]
Expand Down Expand Up @@ -280,6 +282,11 @@ target "mariadb-common" {
context = "mariadb"
}

# Settings shared by every mergepdf target: inherits the "common" target and
# uses the mergepdf/ directory as the build context.
target "mergepdf-common" {
inherits = ["common"]
context = "mergepdf"
}

target "milliner-common" {
inherits = ["common"]
context = "milliner"
Expand Down Expand Up @@ -451,6 +458,13 @@ target "mariadb" {
tags = tags("mariadb", "")
}

# Default mergepdf target (no architecture suffix).
# NOTE(review): dependencies(), cacheFrom(), tags() and hostArch() are defined
# outside this view — presumably dependencies() turns the DEPENDENCIES entry
# for mergepdf into named build contexts; confirm.
target "mergepdf" {
inherits = ["mergepdf-common"]
contexts = dependencies("mergepdf", "")
cache-from = cacheFrom("mergepdf", hostArch())
tags = tags("mergepdf", "")
}

target "milliner" {
inherits = ["milliner-common"]
contexts = dependencies("milliner", "")
Expand Down Expand Up @@ -640,6 +654,13 @@ target "mariadb-amd64" {
tags = tags("mariadb", "amd64")
}

# amd64 variant of the mergepdf target: same build as "mergepdf-common" plus
# the shared "amd64-common" settings, with amd64-specific contexts, cache and tags.
target "mergepdf-amd64" {
inherits = ["mergepdf-common", "amd64-common"]
contexts = dependencies("mergepdf", "amd64")
cache-from = cacheFrom("mergepdf", "amd64")
tags = tags("mergepdf", "amd64")
}

target "milliner-amd64" {
inherits = ["milliner-common", "amd64-common"]
contexts = dependencies("milliner", "amd64")
Expand Down Expand Up @@ -829,6 +850,13 @@ target "mariadb-arm64" {
tags = tags("mariadb", "arm64")
}

# arm64 variant of the mergepdf target: same build as "mergepdf-common" plus
# the shared "arm64-common" settings, with arm64-specific contexts, cache and tags.
target "mergepdf-arm64" {
inherits = ["mergepdf-common", "arm64-common"]
contexts = dependencies("mergepdf", "arm64")
cache-from = cacheFrom("mergepdf", "arm64")
tags = tags("mergepdf", "arm64")
}

target "milliner-arm64" {
inherits = ["milliner-common", "arm64-common"]
contexts = dependencies("milliner", "arm64")
Expand Down
4 changes: 4 additions & 0 deletions mergepdf/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
build.gradle.kts
README.md
tests
tests/**/*
45 changes: 45 additions & 0 deletions mergepdf/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# syntax=docker/dockerfile:1.5.1
# The directive above pins the BuildKit frontend (same version as
# alpaca/Dockerfile); `RUN --mount` and `COPY --link` below depend on it.
#
# "imagemagick", "leptonica" and "scyllaridae" carry no tag because they are
# not pulled from a registry.
# NOTE(review): presumably they are the named build contexts supplied by
# docker-bake.hcl / the build workflow — confirm. The first two stages are
# never built upon; they only provide /packages and /etc/apk/keys to the
# bind mounts below. The final image is based on scyllaridae.
FROM imagemagick
FROM leptonica
FROM scyllaridae

# Port documented for the service (matches the default
# ALPACA_DERIVATIVE_MERGEPDF_URL of http://mergepdf:8080/).
EXPOSE 8080

WORKDIR /app

# Pinned Alpine package versions; the renovate comments let Renovate bump them.
ARG \
    # renovate: datasource=repology depName=alpine_3_22/tesseract-ocr
    TESSERACT_VERSION=5.5.0-r2 \
    # renovate: datasource=repology depName=alpine_3_22/ghostscript
    GHOSTSCRIPT_VERSION=10.05.1-r0 \
    # renovate: datasource=repology depName=alpine_3_22/poppler-utils
    POPPLER_VERSION=25.04.0-r0

# Install the ImageMagick packages built by the imagemagick image. Its signing
# keys are bind-mounted for the duration of this step only.
# hadolint ignore=DL3018
RUN --mount=type=bind,from=imagemagick,source=/packages,target=/packages \
    --mount=type=bind,from=imagemagick,source=/etc/apk/keys,target=/etc/apk/keys \
    apk add --no-cache /packages/imagemagick-*.apk

# Install Leptonica (built by the leptonica image) plus Ghostscript, Poppler
# and Tesseract with its language data. `apk add --no-cache` fetches a fresh
# index itself, so no separate `apk update` is run: that would only leave
# index files behind in the layer.
RUN --mount=type=bind,from=leptonica,source=/packages,target=/packages \
    --mount=type=bind,from=leptonica,source=/etc/apk/keys,target=/etc/apk/keys \
    apk add --no-cache \
        /packages/leptonica-*.apk \
        ghostscript=="${GHOSTSCRIPT_VERSION}" \
        poppler-utils=="${POPPLER_VERSION}" \
        tesseract-ocr=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-deu=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-eng=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-fra=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-hin=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-ita=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-jpn=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-por=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-rus=="${TESSERACT_VERSION}" \
        tesseract-ocr-data-spa=="${TESSERACT_VERSION}"

# Maximum number of page images cmd.sh downloads and OCRs concurrently.
ENV \
    MAX_THREADS=5

COPY --link rootfs /
25 changes: 25 additions & 0 deletions mergepdf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Merge PDF

Docker image for mergepdf. Aggregates the page images listed in the IIIF manifest of a book/paged-content node into a single OCR'd PDF.

Built from [Islandora-DevOps/isle-buildkit mergepdf](https://github.com/Islandora-DevOps/isle-buildkit/tree/main/mergepdf)

## Dependencies

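Requires the `islandora/scyllaridae` Docker image to build, together with the
`imagemagick` and `leptonica` images, from which the ImageMagick and Leptonica
packages are installed. Please refer to the
[Scyllaridae Image README](../scyllaridae/README.md) for additional information including
additional settings, volumes, ports, etc.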

### IIIF Manifest

The Drupal site must provide a route at `/node/{node}/book-manifest`. This View is installed by default by the [views.view.iiif_manifest.yml](https://github.com/Islandora-Devops/islandora-starter-site/blob/main/config/sync/views.view.iiif_manifest.yml) config in the Islandora Starter Site.

### Taxonomy Term Name to TID

The Drupal site must provide a route at `/term_from_term_name`. This View is installed by default by the [views.view.term_from_term_name.yml](https://github.com/Islandora-Devops/islandora-starter-site/blob/main/config/sync/views.view.term_from_term_name.yml) config in the Islandora Starter Site.

## Settings

| Environment Variable | Default | Description |
| :------------------- | :-------------------------------------------------------- | :---------------------------------------------------------------------- |
| MAX_THREADS | 5 | How many images to download at once from a IIIF manifest |
103 changes: 103 additions & 0 deletions mergepdf/rootfs/app/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
#
# Build one OCR'd PDF from the page images listed in a node's IIIF manifest
# and PUT the result back to the site as a document media.
#
# Usage: cmd.sh <node-url>
#   <node-url>  URL of the node. "/book-manifest" is appended to fetch the
#               IIIF manifest and "?_format=json" to fetch the node title.
#
# Environment:
#   MAX_THREADS       Maximum number of concurrent download/OCR jobs (default 5).
#   SCYLLARIDAE_AUTH  Value sent as the Authorization header on the final PUT.

# Abort on errors, on unset variables and on a failure anywhere in a pipeline.
set -euo pipefail

# Fail with a usage message instead of a bare "unbound variable" error.
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  echo "Usage: $0 <node-url>" >&2
  exit 1
fi

URL="$1/book-manifest"
# Scratch directory for downloaded images, per-page PDFs and the merged PDF.
TMP_DIR=$(mktemp -d)
# Running page counter; also used to name the per-page files.
I=0
MAX_THREADS=${MAX_THREADS:-5}
# PIDs of the background download/OCR jobs currently tracked.
PIDS=()
# Number of download attempts per image.
RETRIES=3

# Remove the scratch directory and everything in it.
cleanup() {
  rm -rf "$TMP_DIR"
}

# Clean up however the script exits (success, error or `exit`).
trap cleanup EXIT

# Download an image and resize it with ImageMagick, retrying on failure.
#
# Arguments:
#   $1  URL of the image to download.
#   $2  Path the resized image is written to.
# Globals:
#   RETRIES  Total number of attempts before giving up.
# Returns:
#   0 as soon as one attempt succeeds, 1 once all attempts have failed.
download_and_process() {
  local url="$1"
  local output_file="$2"
  local attempt=1

  while (( attempt <= RETRIES )); do
    # -f makes curl fail on HTTP errors instead of piping an error page into
    # magick. The script-level pipefail makes a curl failure fail the pipeline.
    # "1000x\>" only shrinks images wider than 1000px.
    if curl -sf "$url" | magick - -resize 1000x\> "$output_file" > /dev/null 2>&1; then
      return 0
    fi
    # Announce a retry (and pause) only when another attempt actually follows.
    # Diagnostics go to stderr so they never mix with the script's stdout.
    if (( attempt < RETRIES )); then
      echo "Retrying ($attempt/$RETRIES) for $url..." >&2
      sleep 1
    fi
    attempt=$(( attempt + 1 ))
  done

  echo "Failed to process $url after $RETRIES attempts." >&2
  return 1
}

# Collect the source URL of every page image in the IIIF manifest.
# For each canvas, take the 7th "/"-separated field of the image resource
# "@id" and decode %2F and %3A.
# NOTE(review): presumably that field is the URL-encoded original file URL
# used as the IIIF identifier — confirm against the manifest format.
URLS=$(curl -sf "$URL" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | awk -F '/' '{print $7}' | sed -e 's/%2F/\//g' -e 's/%3A/:/g')

# An empty list would otherwise still run the loop once with an empty URL.
if [ -z "$URLS" ]; then
  echo "No images found in manifest $URL" >&2
  exit 1
fi

# Set to 1 as soon as any background job exits non-zero.
FAILED=0

while read -r IMAGE_URL; do
  # At the concurrency limit, wait for the oldest job before starting another.
  # Waiting on a specific PID (rather than `wait -n`) lets every job's exit
  # status be collected, so a failed page is never silently dropped.
  if [ "${#PIDS[@]}" -gt 0 ] && [ "${#PIDS[@]}" -ge "$MAX_THREADS" ]; then
    wait "${PIDS[0]}" || FAILED=1
    PIDS=("${PIDS[@]:1}")
  fi

  # Run each page in a background subshell.
  (
    local_img="$TMP_DIR/img_$I.jpg"

    # Download and resize the image with retry logic
    if ! download_and_process "$IMAGE_URL" "$local_img"; then
      exit 1
    fi

    # Make an OCR'd PDF from the image (tesseract appends ".pdf" to the base name)
    tesseract "$local_img" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
    rm "$local_img"
  ) &
  PIDS+=("$!")
  I=$(( I + 1 ))
done <<< "$URLS"

# Collect the remaining jobs and their exit statuses.
for pid in "${PIDS[@]}"; do
  wait "$pid" || FAILED=1
done

# Stop here with a clear message rather than letting Ghostscript fail later
# on a missing page PDF.
if [ "$FAILED" -ne 0 ]; then
  echo "One or more pages could not be downloaded or OCR'd." >&2
  exit 1
fi

# Per-page PDFs in manifest order.
FILES=()
for (( index = 0; index < I; index++ )); do
  FILES+=("$TMP_DIR/img_${index}.pdf")
done

# Make the node title the title of the PDF.
# The title is embedded in a PostScript string literal, so backslashes are
# escaped first and then the parentheses. -f aborts the script (via set -e and
# pipefail) if the node cannot be fetched.
TITLE=$(curl -sfL "$1?_format=json" | jq -r '.title[0].value' | sed 's/\\/\\\\/g; s/(/\\(/g; s/)/\\)/g')
# printf writes the backslashes verbatim regardless of shell echo settings.
printf '%s\n' "[ /Title ($TITLE)/DOCINFO pdfmark" > "$TMP_DIR/metadata.txt"

# Merge the per-page PDFs into one file and apply the title metadata.
gs -dBATCH \
  -dNOPAUSE \
  -dQUIET \
  -sDEVICE=pdfwrite \
  -dPDFA \
  -dNOOUTERSAVE \
  -dAutoRotatePages=/None \
  -sOutputFile="$TMP_DIR/ocr.pdf" \
  "${FILES[@]}" \
  "$TMP_DIR/metadata.txt"

# Instead of printing the PDF
# PUT it to the endpoint
NID=$(basename "$1")
# Strip the last two path components of the node URL to get the site base URL.
BASE_URL=$(dirname "$1" | xargs dirname)
# Look up the term ID of the "Original File" media-use term. "// empty" turns
# a missing value into an empty string instead of the literal "null".
TID=$(curl -sf "$BASE_URL/term_from_term_name?vocab=islandora_media_use&name=Original+File&_format=json" | jq -r '.[0].tid[0].value // empty')
if [ -z "$TID" ]; then
  echo "Could not resolve the term ID for 'Original File' from $BASE_URL/term_from_term_name" >&2
  exit 1
fi

# --fail makes an HTTP error abort the script, so "OK" is only printed after
# the upload was accepted.
curl \
  --fail \
  -H "Authorization: $SCYLLARIDAE_AUTH" \
  -H "Content-Type: application/pdf" \
  -H "Content-Location: private://derivatives/pc/pdf/$NID.pdf" \
  -T "$TMP_DIR/ocr.pdf" \
  "$1/media/document/$TID"

echo "OK"
Loading