From ac524f02adae0c42a726aee107f0c5f532e1cfdd Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 30 Sep 2024 08:23:34 -0400 Subject: [PATCH] Iteration over the page tree (#5) * feat: iterate over page tree * fix: put page creation where it belongs circular imports are not a fatality * fix: circular references also unnecessary --- playa/exceptions.py | 4 ++ playa/pdfdocument.py | 90 +++++++++++++++++++++++++++++ playa/pdfpage.py | 118 +------------------------------------- tests/test_open.py | 5 +- tests/test_pdfdocument.py | 6 ++ 5 files changed, 105 insertions(+), 118 deletions(-) diff --git a/playa/exceptions.py b/playa/exceptions.py index 6814ca3b..0698a4c7 100644 --- a/playa/exceptions.py +++ b/playa/exceptions.py @@ -71,6 +71,10 @@ class PDFNoPageLabels(PDFException): pass +class PDFNoPageTree(PDFException): + pass + + class PDFDestinationNotFound(PDFException): pass diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py index 64c92384..f2040963 100644 --- a/playa/pdfdocument.py +++ b/playa/pdfdocument.py @@ -33,6 +33,7 @@ PDFKeyError, PDFNoOutlines, PDFNoPageLabels, + PDFNoPageTree, PDFNoValidXRef, PDFObjectNotFound, PDFPasswordIncorrect, @@ -42,6 +43,7 @@ from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser from playa.pdftypes import ( DecipherCallable, + PDFObjRef, PDFStream, decipher_all, dict_value, @@ -52,6 +54,7 @@ uint_value, ) from playa.psparser import KWD, LIT, literal_name +from playa.pdfpage import PDFPage from playa.utils import ( choplist, decode_text, @@ -68,7 +71,10 @@ LITERAL_OBJSTM = LIT("ObjStm") LITERAL_XREF = LIT("XRef") LITERAL_CATALOG = LIT("Catalog") +LITERAL_PAGE = LIT("Page") +LITERAL_PAGES = LIT("Pages") KEYWORD_OBJ = KWD(b"obj") +INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"} class PDFBaseXRef: @@ -907,6 +913,90 @@ def get_page_labels(self) -> Iterator[str]: return page_labels.labels + PageType = Dict[Any, Dict[Any, Any]] + + def pages_from_xrefs(self) -> Iterator[Tuple[int, 
PageType]]: + """Find pages from the cross-reference tables if the page tree + is missing (note that this only happens in invalid PDFs, but + it happens.) + + Returns an iterator over (objid, dict) pairs. + """ + for xref in self.xrefs: + for object_id in xref.get_objids(): + try: + obj = self.getobj(object_id) + if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE: + yield object_id, obj + except PDFObjectNotFound: + pass + + def page_tree(self) -> Iterator[Tuple[int, PageType]]: + """Iterate over the flattened page tree in reading order, propagating + inheritable attributes. Returns an iterator over (objid, dict) pairs. + + Will raise PDFNoPageTree if there is no page tree. + """ + if "Pages" not in self.catalog: + raise PDFNoPageTree("No 'Pages' entry in catalog") + stack = [(self.catalog["Pages"], self.catalog)] + visited = set() + while stack: + (obj, parent) = stack.pop() + if isinstance(obj, PDFObjRef): + # The PDF specification *requires* both the Pages + # element of the catalog and the entries in Kids in + # the page tree to be indirect references. + object_id = obj.objid + elif isinstance(obj, int): + # Should not happen in a valid PDF, but probably does? 
+ log.warning("Page tree contains bare integer: %r in %r", obj, parent) + object_id = obj + else: + log.warning("Page tree contains unknown object: %r", obj) + page_object = dict_value(self.getobj(object_id)) + + # Avoid recursion errors by keeping track of visited nodes + # (again, this should never actually happen in a valid PDF) + if object_id in visited: + log.warning("Circular reference %r in page tree", obj) + continue + visited.add(object_id) + + # Propagate inheritable attributes + object_properties = page_object.copy() + for k, v in parent.items(): + if k in INHERITABLE_PAGE_ATTRS and k not in object_properties: + object_properties[k] = v + + # Recurse, depth-first + object_type = object_properties.get("Type") + if object_type is None and not settings.STRICT: # See #64 + object_type = object_properties.get("type") + if object_type is LITERAL_PAGES and "Kids" in object_properties: + log.debug("Pages: Kids=%r", object_properties["Kids"]) + for child in reversed(list_value(object_properties["Kids"])): + stack.append((child, object_properties)) + elif object_type is LITERAL_PAGE: + log.debug("Page: %r", object_properties) + yield object_id, object_properties + + def get_pages(self) -> Iterator[PDFPage]: + """Get an iterator over PDFPage objects, which contain + information about the pages in the document. 
+ """ + try: + page_labels: Iterator[Optional[str]] = self.get_page_labels() + except PDFNoPageLabels: + page_labels = itertools.repeat(None) + try: + page_tree = self.page_tree() + except PDFNoPageTree: + page_tree = self.pages_from_xrefs() + + for (objid, properties), label in zip(page_tree, page_labels): + yield PDFPage(objid, properties, label) + def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any: try: names = dict_value(self.catalog["Names"]) diff --git a/playa/pdfpage.py b/playa/pdfpage.py index f2064aea..02b2335a 100644 --- a/playa/pdfpage.py +++ b/playa/pdfpage.py @@ -1,18 +1,8 @@ -import itertools import logging -from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple +from typing import Dict, List, Optional -from playa import settings -from playa.exceptions import ( - PDFNoPageLabels, - PDFObjectNotFound, - PDFTextExtractionNotAllowed, - PDFValueError, -) -from playa.pdfdocument import ( - PDFDocument, -) -from playa.pdftypes import dict_value, int_value, list_value, resolve1 +from playa.exceptions import PDFValueError +from playa.pdftypes import dict_value, int_value, resolve1 from playa.psparser import LIT from playa.utils import parse_rect @@ -32,7 +22,6 @@ class PDFPage: Attributes ---------- - doc: a PDFDocument object. pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. contents: a list of PDFStream objects that represents the page content. @@ -49,7 +38,6 @@ class PDFPage: def __init__( self, - doc: PDFDocument, pageid: object, attrs: object, label: Optional[str], @@ -61,7 +49,6 @@ def __init__( attrs: a dictionary of page attributes. label: page label string. 
""" - self.doc = doc self.pageid = pageid self.attrs = dict_value(attrs) self.label = label @@ -100,102 +87,3 @@ def __init__( def __repr__(self) -> str: return f"" - - INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"} - - @classmethod - def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]: - def depth_first_search( - obj: Any, - parent: Dict[str, Any], - visited: Optional[Set[Any]] = None, - ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]: - if isinstance(obj, int): - object_id = obj - object_properties = dict_value(document.getobj(object_id)).copy() - else: - # This looks broken. obj.objid means obj could be either - # PDFObjRef or PDFStream, but neither is valid for dict_value. - object_id = obj.objid # type: ignore[attr-defined] - object_properties = dict_value(obj).copy() - - # Avoid recursion errors by keeping track of visited nodes - if visited is None: - visited = set() - if object_id in visited: - return - visited.add(object_id) - - for k, v in parent.items(): - if k in cls.INHERITABLE_ATTRS and k not in object_properties: - object_properties[k] = v - - object_type = object_properties.get("Type") - if object_type is None and not settings.STRICT: # See #64 - object_type = object_properties.get("type") - - if object_type is LITERAL_PAGES and "Kids" in object_properties: - log.debug("Pages: Kids=%r", object_properties["Kids"]) - for child in list_value(object_properties["Kids"]): - yield from depth_first_search(child, object_properties, visited) - - elif object_type is LITERAL_PAGE: - log.debug("Page: %r", object_properties) - yield (object_id, object_properties) - - try: - page_labels: Iterator[Optional[str]] = document.get_page_labels() - except PDFNoPageLabels: - page_labels = itertools.repeat(None) - - pages = False - if "Pages" in document.catalog: - objects = depth_first_search(document.catalog["Pages"], document.catalog) - for objid, tree in objects: - yield cls(document, objid, tree, next(page_labels)) - pages = 
True - if not pages: - # fallback when /Pages is missing. - for xref in document.xrefs: - for objid in xref.get_objids(): - try: - obj = document.getobj(objid) - if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE: - yield cls(document, objid, obj, next(page_labels)) - except PDFObjectNotFound: - pass - - @classmethod - def get_pages( - cls, - fp: BinaryIO, - pagenos: Optional[Container[int]] = None, - maxpages: int = 0, - password: str = "", - caching: bool = True, - check_extractable: bool = False, - ) -> Iterator["PDFPage"]: - # Create a PDF document object that stores the document structure. - doc = PDFDocument(fp, password=password) - # Check if the document allows text extraction. - # If not, warn the user and proceed. - if not doc.is_extractable: - if check_extractable: - error_msg = "Text extraction is not allowed: %r" % fp - raise PDFTextExtractionNotAllowed(error_msg) - else: - warning_msg = ( - "The PDF %r contains a metadata field " - "indicating that it should not allow " - "text extraction. Ignoring this field " - "and proceeding. Use the check_extractable " - "if you want to raise an error in this case" % fp - ) - log.warning(warning_msg) - # Process each page contained in the document. 
- for pageno, page in enumerate(cls.create_pages(doc)): - if pagenos and (pageno not in pagenos): - continue - yield page - if maxpages and maxpages <= pageno + 1: - break diff --git a/tests/test_open.py b/tests/test_open.py index 163e0c93..195fbf8a 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -11,7 +11,6 @@ # These APIs will go away soon from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager -from playa.pdfpage import PDFPage TESTDIR = Path(__file__).parent.parent / "samples" ALLPDFS = TESTDIR.glob("**/*.pdf") @@ -46,7 +45,7 @@ def test_inline_data(): rsrc = PDFResourceManager() agg = PDFPageAggregator(rsrc, pageno=1) interp = PDFPageInterpreter(rsrc, agg) - page = next(PDFPage.create_pages(doc)) + page = next(doc.get_pages()) interp.process_page(page) @@ -56,7 +55,7 @@ def test_multiple_contents(): rsrc = PDFResourceManager() agg = PDFPageAggregator(rsrc, pageno=1) interp = PDFPageInterpreter(rsrc, agg) - page = next(PDFPage.create_pages(doc)) + page = next(doc.get_pages()) assert len(page.contents) > 1 interp.process_page(page) diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index e82f576f..d790e09f 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -36,3 +36,9 @@ def test_page_labels(): with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc: labels = [label for _, label in zip(range(10), doc.get_page_labels())] assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"] + + +def test_pages(): + with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc: + page_objects = list(doc.get_pages()) + assert len(page_objects) == 15