Add the mergepdf example service: an image that OCRs every page image of a
IIIF manifest and merges the pages into one PDF, together with its CI build
matrix entry, Kubernetes Service/Deployment, ingress route and parry queue
middleware entry.

diff --git a/.github/workflows/lint-test-build.yml b/.github/workflows/lint-test-build.yml
index 0847ea4..12753d3 100644
--- a/.github/workflows/lint-test-build.yml
+++ b/.github/workflows/lint-test-build.yml
@@ -64,7 +64,7 @@ jobs:
       - name: Find docker files
         id: images
         run: |
-          dockerFiles=$(find examples -name Dockerfile | grep -v coverpage | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
+          dockerFiles=$(find examples -name Dockerfile | grep -v -E '(mergepdf|coverpage)' | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
           echo "dockerFiles=$dockerFiles" >> $GITHUB_OUTPUT
         env:
           GITHUB_REF: ${{ github.ref }}
@@ -86,7 +86,7 @@ jobs:
     needs: [build-push]
     strategy:
       matrix:
-        dockerFile: ["examples/coverpage/Dockerfile"]
+        dockerFile: ["examples/coverpage/Dockerfile", "examples/mergepdf/Dockerfile"]
     uses: ./.github/workflows/build-push.yml
     with:
       dockerFile: ${{ matrix.dockerFile }}
diff --git a/ci/k8s/ingress.yaml b/ci/k8s/ingress.yaml
index 9b28e43..71b5227 100644
--- a/ci/k8s/ingress.yaml
+++ b/ci/k8s/ingress.yaml
@@ -74,3 +74,11 @@ spec:
                 name: islandora-cache-warmer
                 port:
                   number: 8080
+          - path: /mergepdf(/|$)(.*)
+            pathType: Prefix
+            backend:
+              service:
+                name: islandora-mergepdf
+                port:
+                  # must be a port the Service exposes (8887), not the container port
+                  number: 8887
diff --git a/ci/k8s/mergepdf.yaml b/ci/k8s/mergepdf.yaml
new file mode 100644
index 0000000..7c6b27c
--- /dev/null
+++ b/ci/k8s/mergepdf.yaml
@@ -0,0 +1,49 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: islandora-mergepdf
+spec:
+  selector:
+    app: islandora-mergepdf
+  ports:
+    # exposed inside the cluster on 8887 and forwarded to the container's 8080
+    - protocol: TCP
+      port: 8887
+      targetPort: 8080
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: islandora-mergepdf
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: islandora-mergepdf
+  template:
+    metadata:
+      labels:
+        app: islandora-mergepdf
+    spec:
+      containers:
+        - name: scyllaridae-mergepdf
+          image: __DOCKER_REPOSITORY__/scyllaridae-mergepdf:main
+          imagePullPolicy: IfNotPresent
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "500m"
+            limits:
+              memory: "1Gi"
+          ports:
+            - containerPort: 8080
+              # NOTE(review): hostPort lets only one pod per node bind 8887, so
+              # replicas: 3 needs at least three schedulable nodes - confirm.
+              hostPort: 8887
+          readinessProbe:
+            httpGet:
+              path: /healthcheck
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 10
diff --git a/examples/mergepdf/Dockerfile b/examples/mergepdf/Dockerfile
new file mode 100644
index 0000000..edd437b
--- /dev/null
+++ b/examples/mergepdf/Dockerfile
@@ -0,0 +1,25 @@
+ARG TAG=main
+ARG DOCKER_REPOSITORY=local
+FROM ${DOCKER_REPOSITORY}/scyllaridae-imagemagick:${TAG} AS scyllaridae
+
+# cmd.sh shells out to jq, tesseract and ghostscript (gs) installed here
+RUN apk update && \
+    apk add --no-cache \
+      ghostscript==10.04.0-r0 \
+      jq==1.7.1-r0 \
+      leptonica-dev==1.84.1-r0 \
+      tesseract-ocr==5.3.4-r0 \
+      tesseract-ocr-data-eng==5.3.4-r0 \
+      tesseract-ocr-data-fra==5.3.4-r0 \
+      tesseract-ocr-data-spa==5.3.4-r0 \
+      tesseract-ocr-data-ita==5.3.4-r0 \
+      tesseract-ocr-data-por==5.3.4-r0 \
+      tesseract-ocr-data-hin==5.3.4-r0 \
+      tesseract-ocr-data-deu==5.3.4-r0 \
+      tesseract-ocr-data-jpn==5.3.4-r0 \
+      tesseract-ocr-data-rus==5.3.4-r0 \
+      poppler-utils==24.02.0-r1
+
+COPY . /app
+
+ENTRYPOINT ["/app/docker-entrypoint.sh"]
diff --git a/examples/mergepdf/cmd.sh b/examples/mergepdf/cmd.sh
new file mode 100755
index 0000000..44ccb65
--- /dev/null
+++ b/examples/mergepdf/cmd.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Build one OCR'd PDF from every page image listed in a IIIF manifest.
+#
+# Usage: cmd.sh <url>
+#   Reads <url>/book-manifest, OCRs each page image with tesseract, merges the
+#   per-page PDFs with ghostscript and writes the merged PDF to stdout.
+
+set -eou pipefail
+
+TMP_DIR=$(mktemp -d)
+# remove the scratch directory on every exit path, including failures under set -e
+trap 'rm -rf "$TMP_DIR"' EXIT
+I=0
+
+# iterate over all images in the IIIF manifest
+curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | while read -r URL; do
+  # zero-padded page name so glob (lexical) order equals manifest order
+  PAGE="$TMP_DIR/$(printf 'img_%06d' "$I")"
+
+  # resize image to max 1000px width
+  curl -s "$URL" | convert -[0] -resize 1000x\> "$PAGE" > /dev/null 2>&1
+
+  # make an OCR'd PDF from the image
+  tesseract "$PAGE" "$PAGE" pdf > /dev/null 2>&1
+
+  I="$(( I + 1))"
+done
+
+# collect the per-page PDFs in page order; globs expand sorted by name
+shopt -s nullglob
+FILES=("$TMP_DIR"/img_*.pdf)
+if [ "${#FILES[@]}" -eq 0 ]; then
+  echo "no page images could be processed from $1/book-manifest" >&2
+  exit 1
+fi
+
+gs -dBATCH \
+  -dNOPAUSE \
+  -dQUIET \
+  -sDEVICE=pdfwrite \
+  -dPDFA \
+  -dNOOUTERSAVE \
+  -dAutoRotatePages=/None \
+  -sOutputFile="$TMP_DIR/ocr.pdf" \
+  "${FILES[@]}"
+
+cat "$TMP_DIR/ocr.pdf"
diff --git a/examples/mergepdf/scyllaridae.yml b/examples/mergepdf/scyllaridae.yml
new file mode 100644
index 0000000..12cc2c6
--- /dev/null
+++ b/examples/mergepdf/scyllaridae.yml
@@ -0,0 +1,7 @@
+allowedMimeTypes:
+  - "*"
+cmdByMimeType:
+  default:
+    cmd: /app/cmd.sh
+    args:
+      - "%canonical"
diff --git a/examples/parry/scyllaridae.yml b/examples/parry/scyllaridae.yml
index f784813..4b1c553 100644
--- a/examples/parry/scyllaridae.yml
+++ b/examples/parry/scyllaridae.yml
@@ -6,3 +6,6 @@ queueMiddlewares:
     url: http://cache-warmer:8080
     consumers: 3
     noPut: true
+  - queueName: islandora-merge-pdf
+    url: http://mergepdf:8080
+    consumers: 3