Skip to content

Commit

Permalink
[minor] add microservice to aggregate all children into a PDF (#46)
Browse files Browse the repository at this point in the history
* [minor] add microservice to aggregate all children into a PDF

* COPY

* Silence tesseract and convert

* max width 1000px

* Add k8s yaml
  • Loading branch information
joecorall authored Oct 10, 2024
1 parent 3e9a730 commit 6f6c259
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/lint-test-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
- name: Find docker files
id: images
run: |
dockerFiles=$(find examples -name Dockerfile | grep -v coverpage | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
dockerFiles=$(find examples -name Dockerfile | grep -v -E '(mergepdf|coverpage)' | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
echo "dockerFiles=$dockerFiles" >> $GITHUB_OUTPUT
env:
GITHUB_REF: ${{ github.ref }}
Expand All @@ -86,7 +86,7 @@ jobs:
needs: [build-push]
strategy:
matrix:
dockerFile: ["examples/coverpage/Dockerfile"]
dockerFile: ["examples/coverpage/Dockerfile", "examples/mergepdf/Dockerfile"]
uses: ./.github/workflows/build-push.yml
with:
dockerFile: ${{ matrix.dockerFile }}
Expand Down
7 changes: 7 additions & 0 deletions ci/k8s/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,10 @@ spec:
name: islandora-cache-warmer
port:
number: 8080
- path: /mergepdf(/|$)(.*)
pathType: Prefix
backend:
service:
name: islandora-mergepdf
port:
number: 8080
46 changes: 46 additions & 0 deletions ci/k8s/mergepdf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
# Cluster-internal Service in front of the mergepdf pods.
apiVersion: v1
kind: Service
metadata:
  name: islandora-mergepdf
spec:
  selector:
    app: islandora-mergepdf
  ports:
    - protocol: TCP
      # Must equal the backend port number that ci/k8s/ingress.yaml uses for
      # the /mergepdf path (8080); a different value leaves the ingress
      # pointing at a Service port that does not exist.
      port: 8080
      targetPort: 8080
---
# Deployment running the scyllaridae-mergepdf container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: islandora-mergepdf
spec:
  replicas: 3
  selector:
    matchLabels:
      app: islandora-mergepdf
  template:
    metadata:
      labels:
        app: islandora-mergepdf
    spec:
      containers:
        - name: scyllaridae-mergepdf
          # __DOCKER_REPOSITORY__ is a placeholder; presumably substituted by
          # the deploy tooling before this manifest is applied - confirm.
          image: __DOCKER_REPOSITORY__/scyllaridae-mergepdf:main
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              memory: "128Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
          ports:
            - containerPort: 8080
              # NOTE(review): hostPort binds 8887 on the node itself, so at
              # most one replica can be scheduled per node. With replicas: 3
              # and the Service above, confirm this is still required.
              hostPort: 8887
          readinessProbe:
            httpGet:
              path: /healthcheck
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
24 changes: 24 additions & 0 deletions examples/mergepdf/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Image for the mergepdf example, layered on the scyllaridae-imagemagick image.
# TAG and DOCKER_REPOSITORY select which base image build to extend.
ARG TAG=main
ARG DOCKER_REPOSITORY=local
FROM ${DOCKER_REPOSITORY}/scyllaridae-imagemagick:${TAG} AS scyllaridae

# `apk add --no-cache` fetches the package index on the fly and does not store
# it, so a separate `apk update` is unnecessary and would only leave index
# files behind in the layer. Versions are pinned for reproducible builds.
RUN apk add --no-cache \
    ghostscript==10.04.0-r0 \
    jq==1.7.1-r0 \
    leptonica-dev==1.84.1-r0 \
    tesseract-ocr==5.3.4-r0 \
    tesseract-ocr-data-eng==5.3.4-r0 \
    tesseract-ocr-data-fra==5.3.4-r0 \
    tesseract-ocr-data-spa==5.3.4-r0 \
    tesseract-ocr-data-ita==5.3.4-r0 \
    tesseract-ocr-data-por==5.3.4-r0 \
    tesseract-ocr-data-hin==5.3.4-r0 \
    tesseract-ocr-data-deu==5.3.4-r0 \
    tesseract-ocr-data-jpn==5.3.4-r0 \
    tesseract-ocr-data-rus==5.3.4-r0 \
    poppler-utils==24.02.0-r1

# Copy the build context (this example's script and config) into /app.
COPY . /app

# NOTE(review): docker-entrypoint.sh is not part of this change; presumably it
# is provided by the base image - confirm.
ENTRYPOINT ["/app/docker-entrypoint.sh"]
31 changes: 31 additions & 0 deletions examples/mergepdf/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
#
# Build one OCR'd PDF from every page image listed in a IIIF book manifest.
#
# Usage: cmd.sh <base-url>
#   <base-url>  the manifest is fetched from "<base-url>/book-manifest"
#
# The merged PDF is written to stdout. Exits non-zero if any download,
# conversion or OCR step fails, or if the manifest yields no pages.

set -euo pipefail

TMP_DIR=$(mktemp -d)
# Remove the scratch directory on every exit path, including failures
# triggered by `set -e`, so aborted runs do not leak temp files.
trap 'rm -rf "$TMP_DIR"' EXIT

I=0

# iterate over all images in the IIIF manifest
curl -s "$1/book-manifest" \
  | jq -r '.sequences[0].canvases[].images[0].resource."@id"' \
  | while read -r URL; do
    # Zero-pad the page index so that a plain, lexically sorted glob returns
    # the pages in manifest order (img_00002 sorts before img_00010).
    printf -v PAGE '%s/img_%05d' "$TMP_DIR" "$I"

    # resize image to max 1000px width; '-[0]' reads stdin and keeps only the
    # first frame (quoted so the shell never treats it as a glob pattern)
    curl -s "$URL" | convert '-[0]' -resize '1000x>' "$PAGE" > /dev/null 2>&1

    # make an OCR'd PDF from the image (tesseract appends ".pdf" itself)
    tesseract "$PAGE" "$PAGE" pdf > /dev/null 2>&1

    I=$(( I + 1 ))
  done

# Collect the per-page PDFs via a glob instead of parsing `ls -rt` output:
# ordering no longer depends on file modification times.
shopt -s nullglob
FILES=("$TMP_DIR"/img_*.pdf)
shopt -u nullglob

# Without pages Ghostscript would be run with no input files; fail clearly.
if (( ${#FILES[@]} == 0 )); then
  echo "no page images found in $1/book-manifest" >&2
  exit 1
fi

gs -dBATCH \
  -dNOPAUSE \
  -dQUIET \
  -sDEVICE=pdfwrite \
  -dPDFA \
  -dNOOUTERSAVE \
  -dAutoRotatePages=/None \
  -sOutputFile="$TMP_DIR/ocr.pdf" \
  "${FILES[@]}"

cat "$TMP_DIR/ocr.pdf"
7 changes: 7 additions & 0 deletions examples/mergepdf/scyllaridae.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# scyllaridae configuration for the mergepdf example.

# A single wildcard entry; presumably this accepts any source MIME type -
# confirm against the scyllaridae documentation.
allowedMimeTypes: ['*']

cmdByMimeType:
  # Only a default entry is defined: run /app/cmd.sh with one argument.
  default:
    cmd: /app/cmd.sh
    # NOTE(review): "%canonical" looks like a scyllaridae placeholder for the
    # node's canonical URL (cmd.sh appends "/book-manifest" to it) - confirm.
    args: ['%canonical']
3 changes: 3 additions & 0 deletions examples/parry/scyllaridae.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ queueMiddlewares:
url: http://cache-warmer:8080
consumers: 3
noPut: true
- queueName: islandora-merge-pdf
url: http://mergepdf:8080
consumers: 3

0 comments on commit 6f6c259

Please sign in to comment.