Skip to content

Commit

Permalink
Iteration over the page tree (#5)
Browse files Browse the repository at this point in the history
* feat: iterate over page tree

* fix: put page creation where it belongs

circular imports are not a fatality

* fix: circular references also unnecessary
  • Loading branch information
dhdaines authored Sep 30, 2024
1 parent 91a8c37 commit ac524f0
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 118 deletions.
4 changes: 4 additions & 0 deletions playa/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class PDFNoPageLabels(PDFException):
pass


class PDFNoPageTree(PDFException):
    """Raised when the document catalog has no 'Pages' entry (no page tree)."""


class PDFDestinationNotFound(PDFException):
pass

Expand Down
90 changes: 90 additions & 0 deletions playa/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
PDFKeyError,
PDFNoOutlines,
PDFNoPageLabels,
PDFNoPageTree,
PDFNoValidXRef,
PDFObjectNotFound,
PDFPasswordIncorrect,
Expand All @@ -42,6 +43,7 @@
from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
from playa.pdftypes import (
DecipherCallable,
PDFObjRef,
PDFStream,
decipher_all,
dict_value,
Expand All @@ -52,6 +54,7 @@
uint_value,
)
from playa.psparser import KWD, LIT, literal_name
from playa.pdfpage import PDFPage
from playa.utils import (
choplist,
decode_text,
Expand All @@ -68,7 +71,10 @@
LITERAL_OBJSTM = LIT("ObjStm")
LITERAL_XREF = LIT("XRef")
LITERAL_CATALOG = LIT("Catalog")
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
KEYWORD_OBJ = KWD(b"obj")
INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}


class PDFBaseXRef:
Expand Down Expand Up @@ -907,6 +913,90 @@ def get_page_labels(self) -> Iterator[str]:

return page_labels.labels

PageType = Dict[Any, Dict[Any, Any]]

def pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
    """Find pages by scanning the cross-reference tables.

    This is a fallback for documents whose page tree is missing (which
    only happens in invalid PDFs, but it happens).  Every object ID
    listed in every cross-reference table is fetched, and those that
    are dictionaries whose "Type" is Page are yielded.

    Yields:
        (objid, dict) pairs, one per page object found.  An object ID
        that is listed in more than one cross-reference table is only
        yielded once.
    """
    seen = set()
    for xref in self.xrefs:
        for object_id in xref.get_objids():
            # The same ID may be listed by several xref tables; getobj()
            # would resolve it to the same object each time, so skip
            # repeats rather than yield duplicate pages.
            if object_id in seen:
                continue
            seen.add(object_id)
            try:
                obj = self.getobj(object_id)
            except PDFObjectNotFound:
                # Listed in the table but not actually retrievable.
                continue
            if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                yield object_id, obj

def page_tree(self) -> Iterator[Tuple[int, PageType]]:
    """Iterate over the flattened page tree in reading order.

    Inheritable attributes (see INHERITABLE_PAGE_ATTRS) are copied from
    each parent node into any child that does not define them itself.

    Because this is a generator, nothing runs until the first item is
    requested; in particular PDFNoPageTree is raised on the first
    iteration step, not when this method is called.

    Yields:
        (objid, dict) pairs, one per page, where dict is a copy of the
        page dictionary with inherited attributes filled in.

    Raises:
        PDFNoPageTree: if the catalog has no "Pages" entry.
    """
    if "Pages" not in self.catalog:
        raise PDFNoPageTree("No 'Pages' entry in catalog")
    # Explicit stack of (node, parent properties) instead of recursion.
    stack = [(self.catalog["Pages"], self.catalog)]
    visited = set()
    while stack:
        obj, parent = stack.pop()
        if isinstance(obj, PDFObjRef):
            # The PDF specification *requires* both the Pages
            # element of the catalog and the entries in Kids in
            # the page tree to be indirect references.
            object_id = obj.objid
        elif isinstance(obj, int):
            # Should not happen in a valid PDF, but probably does?
            log.warning("Page tree contains bare integer: %r in %r", obj, parent)
            object_id = obj
        else:
            # There is no object ID to look up, so skip this node.
            # Falling through would use an unbound (or stale) object_id.
            log.warning("Page tree contains unknown object: %r", obj)
            continue

        # Avoid infinite loops by keeping track of visited nodes
        # (again, this should never actually happen in a valid PDF)
        if object_id in visited:
            log.warning("Circular reference %r in page tree", obj)
            continue
        visited.add(object_id)

        page_object = dict_value(self.getobj(object_id))

        # Propagate inheritable attributes
        object_properties = page_object.copy()
        for k, v in parent.items():
            if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                object_properties[k] = v

        # Descend, depth-first
        object_type = object_properties.get("Type")
        if object_type is None and not settings.STRICT:  # See #64
            object_type = object_properties.get("type")
        if object_type is LITERAL_PAGES and "Kids" in object_properties:
            log.debug("Pages: Kids=%r", object_properties["Kids"])
            # Push in reverse so the first kid is popped (visited) first.
            for child in reversed(list_value(object_properties["Kids"])):
                stack.append((child, object_properties))
        elif object_type is LITERAL_PAGE:
            log.debug("Page: %r", object_properties)
            yield object_id, object_properties

def get_pages(self) -> Iterator[PDFPage]:
    """Get an iterator over PDFPage objects, which contain
    information about the pages in the document.

    Pages come from the page tree when the catalog has one; otherwise
    they are recovered from the cross-reference tables.  Each page is
    paired with its page label, or None if the document defines no
    page labels.

    Yields:
        One PDFPage per page found.
    """
    try:
        page_labels: Iterator[Optional[str]] = self.get_page_labels()
    except PDFNoPageLabels:
        page_labels = itertools.repeat(None)

    # page_tree() is lazy: a missing page tree is only reported once
    # iteration starts.  Pull the first item inside the try so that
    # PDFNoPageTree is actually caught here and the fallback is used.
    try:
        pages = self.page_tree()
        first_page = next(pages)
    except PDFNoPageTree:
        pages = self.pages_from_xrefs()
    except StopIteration:
        # Page tree exists but contains no pages.  (StopIteration must
        # not leak out of a generator body.)
        pages = iter(())
    else:
        # Put the item we consumed back in front of the rest.
        pages = itertools.chain((first_page,), pages)

    for (objid, properties), label in zip(pages, page_labels):
        yield PDFPage(objid, properties, label)

def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
try:
names = dict_value(self.catalog["Names"])
Expand Down
118 changes: 3 additions & 115 deletions playa/pdfpage.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,8 @@
import itertools
import logging
from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
from typing import Dict, List, Optional

from playa import settings
from playa.exceptions import (
PDFNoPageLabels,
PDFObjectNotFound,
PDFTextExtractionNotAllowed,
PDFValueError,
)
from playa.pdfdocument import (
PDFDocument,
)
from playa.pdftypes import dict_value, int_value, list_value, resolve1
from playa.exceptions import PDFValueError
from playa.pdftypes import dict_value, int_value, resolve1
from playa.psparser import LIT
from playa.utils import parse_rect

Expand All @@ -32,7 +22,6 @@ class PDFPage:
Attributes
----------
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
Expand All @@ -49,7 +38,6 @@ class PDFPage:

def __init__(
self,
doc: PDFDocument,
pageid: object,
attrs: object,
label: Optional[str],
Expand All @@ -61,7 +49,6 @@ def __init__(
attrs: a dictionary of page attributes.
label: page label string.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.label = label
Expand Down Expand Up @@ -100,102 +87,3 @@ def __init__(

def __repr__(self) -> str:
return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

@classmethod
def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
def depth_first_search(
obj: Any,
parent: Dict[str, Any],
visited: Optional[Set[Any]] = None,
) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
if isinstance(obj, int):
object_id = obj
object_properties = dict_value(document.getobj(object_id)).copy()
else:
# This looks broken. obj.objid means obj could be either
# PDFObjRef or PDFStream, but neither is valid for dict_value.
object_id = obj.objid # type: ignore[attr-defined]
object_properties = dict_value(obj).copy()

# Avoid recursion errors by keeping track of visited nodes
if visited is None:
visited = set()
if object_id in visited:
return
visited.add(object_id)

for k, v in parent.items():
if k in cls.INHERITABLE_ATTRS and k not in object_properties:
object_properties[k] = v

object_type = object_properties.get("Type")
if object_type is None and not settings.STRICT: # See #64
object_type = object_properties.get("type")

if object_type is LITERAL_PAGES and "Kids" in object_properties:
log.debug("Pages: Kids=%r", object_properties["Kids"])
for child in list_value(object_properties["Kids"]):
yield from depth_first_search(child, object_properties, visited)

elif object_type is LITERAL_PAGE:
log.debug("Page: %r", object_properties)
yield (object_id, object_properties)

try:
page_labels: Iterator[Optional[str]] = document.get_page_labels()
except PDFNoPageLabels:
page_labels = itertools.repeat(None)

pages = False
if "Pages" in document.catalog:
objects = depth_first_search(document.catalog["Pages"], document.catalog)
for objid, tree in objects:
yield cls(document, objid, tree, next(page_labels))
pages = True
if not pages:
# fallback when /Pages is missing.
for xref in document.xrefs:
for objid in xref.get_objids():
try:
obj = document.getobj(objid)
if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
yield cls(document, objid, obj, next(page_labels))
except PDFObjectNotFound:
pass

@classmethod
def get_pages(
cls,
fp: BinaryIO,
pagenos: Optional[Container[int]] = None,
maxpages: int = 0,
password: str = "",
caching: bool = True,
check_extractable: bool = False,
) -> Iterator["PDFPage"]:
# Create a PDF document object that stores the document structure.
doc = PDFDocument(fp, password=password)
# Check if the document allows text extraction.
# If not, warn the user and proceed.
if not doc.is_extractable:
if check_extractable:
error_msg = "Text extraction is not allowed: %r" % fp
raise PDFTextExtractionNotAllowed(error_msg)
else:
warning_msg = (
"The PDF %r contains a metadata field "
"indicating that it should not allow "
"text extraction. Ignoring this field "
"and proceeding. Use the check_extractable "
"if you want to raise an error in this case" % fp
)
log.warning(warning_msg)
# Process each page contained in the document.
for pageno, page in enumerate(cls.create_pages(doc)):
if pagenos and (pageno not in pagenos):
continue
yield page
if maxpages and maxpages <= pageno + 1:
break
5 changes: 2 additions & 3 deletions tests/test_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

# These APIs will go away soon
from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager
from playa.pdfpage import PDFPage

TESTDIR = Path(__file__).parent.parent / "samples"
ALLPDFS = TESTDIR.glob("**/*.pdf")
Expand Down Expand Up @@ -46,7 +45,7 @@ def test_inline_data():
rsrc = PDFResourceManager()
agg = PDFPageAggregator(rsrc, pageno=1)
interp = PDFPageInterpreter(rsrc, agg)
page = next(PDFPage.create_pages(doc))
page = next(doc.get_pages())
interp.process_page(page)


Expand All @@ -56,7 +55,7 @@ def test_multiple_contents():
rsrc = PDFResourceManager()
agg = PDFPageAggregator(rsrc, pageno=1)
interp = PDFPageInterpreter(rsrc, agg)
page = next(PDFPage.create_pages(doc))
page = next(doc.get_pages())
assert len(page.contents) > 1
interp.process_page(page)

Expand Down
6 changes: 6 additions & 0 deletions tests/test_pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,9 @@ def test_page_labels():
with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
labels = [label for _, label in zip(range(10), doc.get_page_labels())]
assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]


def test_pages():
    """Check that iterating over a sample document yields all 15 pages."""
    sample = TESTDIR / "contrib" / "PSC_Station.pdf"
    with playa.open(sample) as doc:
        page_count = sum(1 for _ in doc.get_pages())
        assert page_count == 15

0 comments on commit ac524f0

Please sign in to comment.