diff --git a/app/pdf_processor.py b/app/pdf_processor.py index d29ba65..fb8ba56 100644 --- a/app/pdf_processor.py +++ b/app/pdf_processor.py @@ -130,10 +130,16 @@ def _get_pdf_page_attributes(pdf: str) -> dict: pdf_addrs = {} for i in range(len(doc)): + found_dimensions = False for img in doc.get_page_images(i): xref = img[0] pix = fitz.Pixmap(doc, xref) pdf_addrs[i] = [pix.w, pix.h] + found_dimensions = True + + if not found_dimensions: + page = doc[i] + pdf_addrs[i] = [int(page.rect.width), int(page.rect.height)] return pdf_addrs @@ -236,6 +242,6 @@ def generate_guid(): if __name__ == "__main__": args = sys.argv[1:] - processor = PDFProcessor(args[0], args[1]) + processor = PDFProcessor(args[0], args[1], args[2]) processor.extract_alto() print(processor.generated_alto) diff --git a/requirements.txt b/requirements.txt index a385a68..9da8f66 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ jmespath==0.10.0 logzero==1.7.0 lxml==4.9.3 pycryptodome==3.12.0 -PyMuPDF==1.22.5 +PyMuPDF==1.23.6 python-dateutil==2.8.2 requests==2.27.1 s3transfer==0.7.0