Skip to content

Commit

Permalink
Merge pull request #4 from dlcs/feature/handle_pdf_err
Browse files Browse the repository at this point in the history
Handle 404 when fetching PDF + bump dependencies
  • Loading branch information
donaldgray authored Aug 17, 2023
2 parents b4e0633 + 9f8a88a commit 52c92b2
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 24 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ jobs:

- name: Set up Docker Buildx
id: docker-setup-buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v2

- name: Docker metadata
id: docker-meta
uses: docker/metadata-action@v3
uses: docker/metadata-action@v4
with:
images: ghcr.io/dlcs/pdf-to-alto
tags: |
Expand All @@ -36,15 +36,15 @@ jobs:
- name: Login to GitHub Container Registry
id: docker-login
uses: docker/login-action@v1
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build and push
id: docker-build-push
uses: docker/build-push-action@v2
uses: docker/build-push-action@v4
with:
context: .
builder: ${{ steps.docker-setup-buildx.outputs.name }}
Expand Down
22 changes: 6 additions & 16 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,19 @@
FROM debian:bullseye as build

# avoid issue with packages requiring interaction (e.g. tzdata)
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y wget cmake clang git autoconf pkg-config

# Change submodule to https as we're cloning only. Avoids issues with ssh
# 8bb209c0c21476ee904a is 0.4 with some bugfixes
RUN mkdir /home/pdfalto && cd /home/pdfalto \
&& git clone https://github.com/kermitt2/pdfalto.git && cd pdfalto && git checkout 8bb209c0c21476ee904a && ./install_deps.sh \
&& git submodule set-url xpdf-4.03 https://github.com/kermitt2/xpdf-4.03.git && git submodule update --init --recursive \
&& cmake ./ && make

FROM python:3.9-slim
FROM python:3.11-slim

LABEL maintainer="Donald Gray <[email protected]>"
LABEL org.opencontainers.image.source=https://github.com/dlcs/pdf-to-alto
LABEL org.opencontainers.image.description="Extract ALTO from PDF"

COPY --from=build /home/pdfalto/pdfalto/pdfalto /usr/bin/pdfalto
COPY /deps/pdfalto /usr/bin/pdfalto

COPY requirements.txt /opt/app/requirements.txt

WORKDIR /opt/app
RUN pip install --no-cache-dir -r requirements.txt
COPY . /opt/app

COPY app /opt/app/app
COPY monitor.py /opt/app/monitor.py
COPY wait-for-localstack.sh /opt/app/wait-for-localstack.sh

RUN chmod +x wait-for-localstack.sh

Expand Down
2 changes: 2 additions & 0 deletions app/pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ def _create_work_folder(self):
def _download_pdf(self, target_file: Path):
try:
download_request = requests.get(self.pdf_location, stream=True)
download_request.raise_for_status()

with open(target_file, "wb") as file:
for chunk in download_request.iter_content(DOWNLOAD_CHUNK_SIZE):
file.write(chunk)
Expand Down
4 changes: 2 additions & 2 deletions compose/localstack/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
FROM localstack/localstack
COPY seed-resources.sh /docker-entrypoint-initaws.d/
FROM localstack/localstack:2.2.0
COPY seed-resources.sh /etc/localstack/init/ready.d/
Binary file added deps/pdfalto
Binary file not shown.
18 changes: 18 additions & 0 deletions deps/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Dependencies

`pdfalto` is a required binary, built from https://github.com/kermitt2/pdfalto.git. The binary in this directory was built using the following Dockerfile:

```dockerfile
# Build stage: compiles the pdfalto binary from source on Debian bullseye.
# The build runs in-tree, so the resulting binary is produced at
# /home/pdfalto/pdfalto/pdfalto inside this stage.
FROM debian:bullseye as build

# avoid issue with packages requiring interaction (e.g. tzdata)
# Declared as ARG so it only applies while building, not in a running container.
ARG DEBIAN_FRONTEND=noninteractive
# Packages installed for fetching and building pdfalto.
RUN apt-get update && apt-get install -y wget cmake clang git autoconf pkg-config

# Change submodule to https as we're cloning only. Avoids issues with ssh
# 8bb209c0c21476ee904a is 0.4 with some bugfixes
# Steps, in order: clone pdfalto, check out the pinned commit, run the
# repository's own install_deps.sh, repoint the xpdf-4.03 submodule at its
# https URL, fetch all submodules, then configure and build with cmake + make.
# NOTE(review): install_deps.sh is not shown here - presumably it fetches or
# builds pdfalto's third-party libraries; confirm against the upstream repo.
RUN mkdir /home/pdfalto && cd /home/pdfalto \
&& git clone https://github.com/kermitt2/pdfalto.git && cd pdfalto && git checkout 8bb209c0c21476ee904a && ./install_deps.sh \
&& git submodule set-url xpdf-4.03 https://github.com/kermitt2/xpdf-4.03.git && git submodule update --init --recursive \
&& cmake ./ && make
```
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ colorama==0.4.4
idna==3.3
jmespath==0.10.0
logzero==1.7.0
lxml==4.7.1
lxml==4.9.3
pycryptodome==3.12.0
PyMuPDF==1.19.4
PyMuPDF==1.22.5
python-dateutil==2.8.2
requests==2.27.1
s3transfer==0.5.0
Expand Down

0 comments on commit 52c92b2

Please sign in to comment.