Skip to content

Commit

Permalink
Step 2 in updating the API (#10)
Browse files Browse the repository at this point in the history
* chore: ruff it up

* refactor!: begin the de-Javafication of the API

* refactor!: the great pythonification continues

* fix: make playa.open behave as expected in REPL for instance

* feat!: reimplement dests/names as NameTree

* docs: note a rather large problem with dests()

* fix: correct the types for name trees and destinations

* fix: seems that named destinations are "text strings"

* fix: halfway fix literal names (will make them `str` always soon)

* fix(test): needed this file

* fix: properly enforce bytes for KWD and str for LIT

* fix: try to enforce KWD/LIT types better

* refactor!: why have a base class if you do not actually use it omg

* feat: why are we falling back

* feat: benchmark

* fix: better messages for xref

* fix: no, not TESTDIR

* refactor!: make page_objects normal method again

* feat: slightly more useful outlines property

* chore: ruff n stuff

* feat: make dests more interesting (but not interesting enough)

* feat: use weakref for PDFObjRef backreferences

* test: test weak refs for obj refs

* fix: allow str as path too

* fix: restore pdfminer compatibility for now

* feat: expose stroking/non-stroking color on LTChar as pdfplumber wants

* feat: reuse weak reference

* refactor!: We are not a layout analyzer
  • Loading branch information
dhdaines authored Oct 23, 2024
1 parent a0783c7 commit a71c139
Show file tree
Hide file tree
Showing 24 changed files with 637 additions and 922 deletions.
23 changes: 13 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
# PLAYA is a LAYout Analyzer 🏖️
# PLAYA Ain't a LAYout Analyzer 🏖️

## About

This is not an experimental fork of
[pdfminer.six](https://github.com/pdfminer/pdfminer.six). Well, it's
kind of an experimental fork of pdfminer.six. The idea is to extract
just the part of pdfminer.six that gets used these days, namely the
layout analysis and low-level PDF access, see if it can be
reimplemented using other libraries such as pypdf or pikepdf, and make
its API more fun to use.
just the part of pdfminer.six that gets used by
[pdfplumber](https://github.com/jsvine/pdfplumber), namely the
low-level PDF access, optimize it for speed, see if it can be
reimplemented using other libraries such as pypdf or pikepdf,
benchmark it against those libraries, and improve its API.

There are already too many PDF libraries, unfortunately none of which
does everything that everybody wants it to do, and we probably don't
Expand All @@ -21,11 +22,13 @@ would be specifically one of these things and nothing else:
metadata.
2. Obtaining the absolute position and attributes of every character,
line, path, and image in every page of a PDF document.

Since most people *do not want to do these things*, ideally, this will
get merged into some other library, perhaps
[pypdf](https://github.com/py-pdf/pypdf). Did I mention this is
experimental?

Notably this does *not* include the largely undocumented heuristic
"layout analysis" done by pdfminer.six, because it is quite difficult
to understand due to a Java-damaged API based on deeply nested class
hierarchies, and because layout analysis is best done
probabilistically/visually. Also, pdfplumber does its own, much
nicer, layout analysis.

## Acknowledgement

Expand Down
13 changes: 6 additions & 7 deletions playa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,17 @@
"""

import builtins
from contextlib import contextmanager
from os import PathLike
from typing import Iterator
from typing import Union

from playa.pdfdocument import PDFDocument

__version__ = "0.0.1"


@contextmanager
def open(path: PathLike, password: str = "") -> Iterator[PDFDocument]: # noqa: A001
def open(path: Union[PathLike, str], password: str = "") -> PDFDocument:
"""Open a PDF document from a path on the filesystem."""
with builtins.open(path, "rb") as infh:
with PDFDocument(infh, password) as pdf:
yield pdf
fp = builtins.open(path, "rb")
pdf = PDFDocument(fp, password)
pdf._fp = fp
return pdf
30 changes: 15 additions & 15 deletions playa/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
)

from playa.layout import (
LAParams,
LTComponent,
LTChar,
LTComponent,
LTCurve,
LTFigure,
LTImage,
Expand All @@ -23,9 +22,9 @@
from playa.pdfdevice import PDFTextDevice
from playa.pdffont import PDFFont, PDFUnicodeNotDefined
from playa.pdfinterp import PDFGraphicState, PDFResourceManager, PDFStackT
from playa.psparser import PSLiteral
from playa.pdfpage import PDFPage
from playa.pdftypes import PDFStream
from playa.psparser import PSLiteral
from playa.utils import (
Matrix,
PathSegment,
Expand All @@ -49,11 +48,9 @@ def __init__(
self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None,
) -> None:
PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno
self.laparams = laparams
self._stack: List[LTLayoutContainer] = []

def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
Expand All @@ -66,8 +63,6 @@ def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
def end_page(self, page: PDFPage) -> None:
assert not self._stack, str(len(self._stack))
assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
if self.laparams is not None:
self.cur_item.analyze(self.laparams)
self.pageno += 1
self.receive_layout(self.cur_item)

Expand Down Expand Up @@ -178,7 +173,8 @@ def paint_path(
gstate.ncolor,
original_path=transformed_path,
dashing_style=gstate.dash,
ncs=ncs, scs=scs
ncs=ncs,
scs=scs,
)
self.add_item(line)

Expand All @@ -200,7 +196,8 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
ncs,
scs,
)
self.add_item(rect)
else:
Expand All @@ -214,7 +211,8 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
ncs,
scs,
)
self.add_item(curve)
else:
Expand All @@ -228,7 +226,8 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
ncs,
scs,
)
self.add_item(curve)

Expand All @@ -240,8 +239,8 @@ def render_char(
scaling: float,
rise: float,
cid: int,
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> float:
try:
Expand All @@ -260,9 +259,11 @@ def render_char(
text,
textwidth,
textdisp,
graphicstate,
ncs,
graphicstate,
scs,
graphicstate.scolor,
graphicstate.ncolor,
)
self.add_item(item)
return item.adv
Expand All @@ -280,9 +281,8 @@ def __init__(
self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None,
) -> None:
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno)
self.result: Optional[LTPage] = None

def receive_layout(self, ltpage: LTPage) -> None:
Expand Down
74 changes: 54 additions & 20 deletions playa/data_structures.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from typing import Any, Dict, Iterator, List, Tuple
from typing import Any, Dict, Iterator, Tuple, Union

from playa import settings
from playa.pdfparser import PDFSyntaxError
from playa.pdftypes import dict_value, int_value, list_value
from playa.pdftypes import dict_value, int_value, list_value, str_value
from playa.utils import choplist


def walk_number_tree(tree: Dict[str, Any]) -> Iterator[Tuple[int, Any]]:
def walk_number_tree(
tree: Dict[str, Any], key: Union[int, None] = None
) -> Iterator[Tuple[int, Any]]:
stack = [tree]
while stack:
item = dict_value(stack.pop())
if key is not None and "Limits" in item:
(k1, k2) = list_value(item["Limits"])
if key < k1 or k2 < key:
continue
if "Nums" in item:
for k, v in choplist(2, list_value(item["Nums"])):
yield int_value(k), v
Expand All @@ -29,26 +33,56 @@ def __init__(self, obj: Any):
def __iter__(self) -> Iterator[Tuple[int, Any]]:
return walk_number_tree(self._obj)

def __contains__(self, num) -> bool:
for idx, val in self:
def __contains__(self, num: int) -> bool:
for idx, val in walk_number_tree(self._obj, num):
if idx == num:
return True
return False

def __getitem__(self, num) -> Any:
for idx, val in self:
def __getitem__(self, num: int) -> Any:
for idx, val in walk_number_tree(self._obj, num):
if idx == num:
return val
raise IndexError(f"Number {num} not in tree")

@property
def values(self) -> List[Tuple[int, Any]]:
values = list(self)
# NOTE: They are supposed to be sorted! (but, I suppose, often aren't)
if settings.STRICT:
if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
raise PDFSyntaxError("Number tree elements are out of order")
else:
values.sort(key=lambda t: t[0])

return values

def walk_name_tree(
    tree: Dict[str, Any], key: Union[bytes, None] = None
) -> Iterator[Tuple[bytes, Any]]:
    """Iterate over the (name, value) pairs stored in a PDF name tree.

    The tree is traversed depth-first with an explicit stack, starting
    at ``tree``.  For every node visited, the entries of its ``Names``
    array (taken two at a time) are yielded before any of its ``Kids``
    are descended into.

    Args:
        tree: Root node of the name tree.
        key: Optional name being searched for.  When given, any node
            carrying a ``Limits`` pair whose range does not include
            ``key`` is skipped together with everything below it.

    Yields:
        ``(name, value)`` tuples, where ``name`` is the raw key passed
        through ``str_value`` and ``value`` is the entry as stored.
    """
    pending = [tree]
    while pending:
        node = dict_value(pending.pop())
        if key is not None and "Limits" in node:
            lower, upper = list_value(node["Limits"])
            # The sought key lies outside this subtree: prune it.
            if key < lower or upper < key:
                continue
        if "Names" in node:
            pairs = choplist(2, list_value(node["Names"]))
            for name, value in pairs:
                yield str_value(name), value
        if "Kids" in node:
            # Pushed in reverse so that pop() visits kids in listed order.
            pending.extend(reversed(list_value(node["Kids"])))


class NameTree:
    """A PDF name tree.

    See Section 7.9.6 of the PDF 1.7 Reference.

    Wraps the root node of a name tree and exposes it as an iterable of
    ``(name, value)`` pairs, with membership tests and lookup by name.
    """

    def __init__(self, obj: Any):
        """Wrap ``obj``, which is passed through ``dict_value`` first."""
        self._obj = dict_value(obj)

    def __iter__(self) -> Iterator[Tuple[bytes, Any]]:
        """Iterate over every ``(name, value)`` pair in the tree."""
        return walk_name_tree(self._obj, None)

    def _candidates(self, name: bytes) -> Iterator[Tuple[bytes, Any]]:
        """Iterate over the pairs from subtrees that may contain ``name``.

        Passing the name to the walker lets it skip every subtree whose
        ``Limits`` exclude it, instead of scanning the whole tree as a
        plain iteration would.  Note that this trusts ``Limits``: a
        subtree whose limits wrongly exclude ``name`` is not searched.
        """
        # The pruning test is an ordering comparison against the node's
        # Limits (presumably bytes, like the keys -- TODO confirm), which
        # would raise TypeError for a key of another type.  Such a key
        # can never equal a yielded name anyway, so fall back to the
        # unpruned walk and let the lookup simply fail to match.
        hint = name if isinstance(name, bytes) else None
        return walk_name_tree(self._obj, hint)

    def __contains__(self, name: bytes) -> bool:
        """Return True if ``name`` is a key in the tree."""
        return any(key == name for key, _ in self._candidates(name))

    def __getitem__(self, name: bytes) -> Any:
        """Return the value stored under ``name``.

        Raises:
            IndexError: If ``name`` is not found in the tree.
        """
        for key, value in self._candidates(name):
            if key == name:
                return value
        raise IndexError("Name %r not in tree" % name)
8 changes: 0 additions & 8 deletions playa/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,6 @@ class PDFValueError(PDFException, ValueError):
pass


class PDFObjectNotFound(PDFException):
pass


class PDFNotImplementedError(PDFException, NotImplementedError):
pass

Expand Down Expand Up @@ -79,10 +75,6 @@ class PDFNoStructTree(PDFException):
pass


class PDFDestinationNotFound(PDFException):
pass


class PDFEncryptionError(PDFException):
pass

Expand Down
Loading

0 comments on commit a71c139

Please sign in to comment.