Skip to content

Commit

Permalink
Merge pull request #6 from dlcs/feature/dimension_fix
Browse files Browse the repository at this point in the history
Better handling of fetching page dimensions
  • Loading branch information
donaldgray authored Nov 21, 2023
2 parents e9efbbd + 2a0c270 commit 3a3ab75
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
8 changes: 7 additions & 1 deletion app/pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,16 @@ def _get_pdf_page_attributes(pdf: str) -> dict:

pdf_addrs = {}
for i in range(len(doc)):
found_dimensions = False
for img in doc.get_page_images(i):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
pdf_addrs[i] = [pix.w, pix.h]
found_dimensions = True

if not found_dimensions:
page = doc[i]
pdf_addrs[i] = [int(page.rect.width), int(page.rect.height)]

return pdf_addrs

Expand Down Expand Up @@ -236,6 +242,6 @@ def generate_guid():

if __name__ == "__main__":
args = sys.argv[1:]
processor = PDFProcessor(args[0], args[1])
processor = PDFProcessor(args[0], args[1], args[2])
processor.extract_alto()
print(processor.generated_alto)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jmespath==0.10.0
logzero==1.7.0
lxml==4.9.3
pycryptodome==3.12.0
PyMuPDF==1.22.5
PyMuPDF==1.23.6
python-dateutil==2.8.2
requests==2.27.1
s3transfer==0.7.0
Expand Down

0 comments on commit 3a3ab75

Please sign in to comment.