Step 2 in updating the API (#10)

* chore: ruff it up
* refactor!: begin the de-Javafication of the API
* refactor!: the great pythonification continues
* fix: make playa.open behave as expected in REPL for instance
* feat!: reimplement dests/names as NameTree
* docs: note a rather large problem with dests()
* fix: correct the types for name trees and destinations
* fix: seems that named destinations are "text strings"
* fix: halfway fix literal names (will make them `str` always soon)
* fix(test): needed this file
* fix: properly enforce bytes for KWD and str for LIT
* fix: try to enforce KWD/LIT types better
* refactor!: why have a base class if you do not actually use it omg
* feat: why are we falling back
* feat: benchmark
* fix: better messages for xref
* fix: no, not TESTDIR
* refactor!: make page_objects normal method again
* feat: slightly more useful outlines property
* chore: ruff n stuff
* feat: make dests more interesting (but not interesting enough)
* feat: use weakref for PDFObjRef backreferences
* test: test weak refs for obj refs
* fix: allow str as path too
* fix: restore pdfminer compatibility for now
* feat: expose stroking/non-stroking color on LTChar as pdfplumber wants
* feat: reuse weak reference
* refactor!: We are not a layout analyzer
dhdaines · Oct 23, 2024 · a71c139 · a71c139
1 parent a0783c7
commit a71c139
Show file tree

Hide file tree

Showing 24 changed files with 637 additions and 922 deletions.
diff --git a/README.md b/README.md
@@ -1,14 +1,15 @@
-# PLAYA is a LAYout Analyzer 🏖️
+# PLAYA Ain't a LAYout Analyzer 🏖️
 
 ## About
 
 This is not an experimental fork of
 [pdfminer.six](https://github.com/pdfminer/pdfminer.six).  Well, it's
 kind of an experimental fork of pdfminer.six.  The idea is to extract
-just the part of pdfminer.six that gets used these days, namely the
-layout analysis and low-level PDF access, see if it can be
-reimplemented using other libraries such as pypdf or pikepdf, and make
-its API more fun to use.
+just the part of pdfminer.six that gets used by
+[pdfplumber](https://github.com/jsvine/pdfplumber), namely the
+low-level PDF access, optimize it for speed, see if it can be
+reimplemented using other libraries such as pypdf or pikepdf,
+benchmark it against those libraries, and improve its API.
 
 There are already too many PDF libraries, unfortunately none of which
 does everything that everybody wants it to do, and we probably don't
@@ -21,11 +22,13 @@ would be specifically one of these things and nothing else:
    metadata.
 2. Obtaining the absolute position and attributes of every character,
    line, path, and image in every page of a PDF document.
-
-Since most people *do not want to do these things*, ideally, this will
-get merged into some other library, perhaps
-[pypdf](https://github.com/py-pdf/pypdf).  Did I mention this is
-experimental?
+
+Notably this does *not* include the largely undocumented heuristic
+"layout analysis" done by pdfminer.six, because it is quite difficult
+to understand due to a Java-damaged API based on deeply nested class
+hierarchies, and because layout analysis is best done
+probabilistically/visually.  Also, pdfplumber does its own, much
+nicer, layout analysis.
 
 ## Acknowledgement
 

diff --git a/playa/__init__.py b/playa/__init__.py
@@ -7,18 +7,17 @@
 """
 
 import builtins
-from contextlib import contextmanager
 from os import PathLike
-from typing import Iterator
+from typing import Union
 
 from playa.pdfdocument import PDFDocument
 
 __version__ = "0.0.1"
 
 
-@contextmanager
-def open(path: PathLike, password: str = "") -> Iterator[PDFDocument]:  # noqa: A001
+def open(path: Union[PathLike, str], password: str = "") -> PDFDocument:
     """Open a PDF document from a path on the filesystem."""
-    with builtins.open(path, "rb") as infh:
-        with PDFDocument(infh, password) as pdf:
-            yield pdf
+    fp = builtins.open(path, "rb")
+    pdf = PDFDocument(fp, password)
+    pdf._fp = fp
+    return pdf
diff --git a/playa/converter.py b/playa/converter.py
@@ -8,9 +8,8 @@
 )
 
 from playa.layout import (
-    LAParams,
-    LTComponent,
     LTChar,
+    LTComponent,
     LTCurve,
     LTFigure,
     LTImage,
@@ -23,9 +22,9 @@
 from playa.pdfdevice import PDFTextDevice
 from playa.pdffont import PDFFont, PDFUnicodeNotDefined
 from playa.pdfinterp import PDFGraphicState, PDFResourceManager, PDFStackT
-from playa.psparser import PSLiteral
 from playa.pdfpage import PDFPage
 from playa.pdftypes import PDFStream
+from playa.psparser import PSLiteral
 from playa.utils import (
     Matrix,
     PathSegment,
@@ -49,11 +48,9 @@ def __init__(
         self,
         rsrcmgr: PDFResourceManager,
         pageno: int = 1,
-        laparams: Optional[LAParams] = None,
     ) -> None:
         PDFTextDevice.__init__(self, rsrcmgr)
         self.pageno = pageno
-        self.laparams = laparams
         self._stack: List[LTLayoutContainer] = []
 
     def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
@@ -66,8 +63,6 @@ def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
     def end_page(self, page: PDFPage) -> None:
         assert not self._stack, str(len(self._stack))
         assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
-        if self.laparams is not None:
-            self.cur_item.analyze(self.laparams)
         self.pageno += 1
         self.receive_layout(self.cur_item)
 
@@ -178,7 +173,8 @@ def paint_path(
                     gstate.ncolor,
                     original_path=transformed_path,
                     dashing_style=gstate.dash,
-                    ncs=ncs, scs=scs
+                    ncs=ncs,
+                    scs=scs,
                 )
                 self.add_item(line)
 
@@ -200,7 +196,8 @@ def paint_path(
                         gstate.ncolor,
                         transformed_path,
                         gstate.dash,
-                        ncs, scs
+                        ncs,
+                        scs,
                     )
                     self.add_item(rect)
                 else:
@@ -214,7 +211,8 @@ def paint_path(
                         gstate.ncolor,
                         transformed_path,
                         gstate.dash,
-                        ncs, scs
+                        ncs,
+                        scs,
                     )
                     self.add_item(curve)
             else:
@@ -228,7 +226,8 @@ def paint_path(
                     gstate.ncolor,
                     transformed_path,
                     gstate.dash,
-                    ncs, scs
+                    ncs,
+                    scs,
                 )
                 self.add_item(curve)
 
@@ -240,8 +239,8 @@ def render_char(
         scaling: float,
         rise: float,
         cid: int,
+        ncs: PDFColorSpace,
         graphicstate: PDFGraphicState,
-        ncs: Optional[PDFColorSpace] = None,
         scs: Optional[PDFColorSpace] = None,
     ) -> float:
         try:
@@ -260,9 +259,11 @@ def render_char(
             text,
             textwidth,
             textdisp,
-            graphicstate,
             ncs,
+            graphicstate,
             scs,
+            graphicstate.scolor,
+            graphicstate.ncolor,
         )
         self.add_item(item)
         return item.adv
@@ -280,9 +281,8 @@ def __init__(
         self,
         rsrcmgr: PDFResourceManager,
         pageno: int = 1,
-        laparams: Optional[LAParams] = None,
     ) -> None:
-        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
+        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno)
         self.result: Optional[LTPage] = None
 
     def receive_layout(self, ltpage: LTPage) -> None:

diff --git a/playa/data_structures.py b/playa/data_structures.py
@@ -1,15 +1,19 @@
-from typing import Any, Dict, Iterator, List, Tuple
+from typing import Any, Dict, Iterator, Tuple, Union
 
-from playa import settings
-from playa.pdfparser import PDFSyntaxError
-from playa.pdftypes import dict_value, int_value, list_value
+from playa.pdftypes import dict_value, int_value, list_value, str_value
 from playa.utils import choplist
 
 
-def walk_number_tree(tree: Dict[str, Any]) -> Iterator[Tuple[int, Any]]:
+def walk_number_tree(
+    tree: Dict[str, Any], key: Union[int, None] = None
+) -> Iterator[Tuple[int, Any]]:
     stack = [tree]
     while stack:
         item = dict_value(stack.pop())
+        if key is not None and "Limits" in item:
+            (k1, k2) = list_value(item["Limits"])
+            if key < k1 or k2 < key:
+                continue
         if "Nums" in item:
             for k, v in choplist(2, list_value(item["Nums"])):
                 yield int_value(k), v
@@ -29,26 +33,56 @@ def __init__(self, obj: Any):
     def __iter__(self) -> Iterator[Tuple[int, Any]]:
         return walk_number_tree(self._obj)
 
-    def __contains__(self, num) -> bool:
-        for idx, val in self:
+    def __contains__(self, num: int) -> bool:
+        for idx, val in walk_number_tree(self._obj, num):
             if idx == num:
                 return True
         return False
 
-    def __getitem__(self, num) -> Any:
-        for idx, val in self:
+    def __getitem__(self, num: int) -> Any:
+        for idx, val in walk_number_tree(self._obj, num):
             if idx == num:
                 return val
         raise IndexError(f"Number {num} not in tree")
 
-    @property
-    def values(self) -> List[Tuple[int, Any]]:
-        values = list(self)
-        # NOTE: They are supposed to be sorted! (but, I suppose, often aren't)
-        if settings.STRICT:
-            if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
-                raise PDFSyntaxError("Number tree elements are out of order")
-        else:
-            values.sort(key=lambda t: t[0])
-
-        return values
+
+def walk_name_tree(
+    tree: Dict[str, Any], key: Union[bytes, None] = None
+) -> Iterator[Tuple[bytes, Any]]:
+    stack = [tree]
+    while stack:
+        item = dict_value(stack.pop())
+        if key is not None and "Limits" in item:
+            (k1, k2) = list_value(item["Limits"])
+            if key < k1 or k2 < key:
+                continue
+        if "Names" in item:
+            for k, v in choplist(2, list_value(item["Names"])):
+                yield str_value(k), v
+        if "Kids" in item:
+            stack.extend(reversed(list_value(item["Kids"])))
+
+
+class NameTree:
+    """A PDF name tree.
+
+    See Section 7.9.6 of the PDF 1.7 Reference.
+    """
+
+    def __init__(self, obj: Any):
+        self._obj = dict_value(obj)
+
+    def __iter__(self) -> Iterator[Tuple[bytes, Any]]:
+        return walk_name_tree(self._obj, None)
+
+    def __contains__(self, name: bytes) -> bool:
+        for idx, val in self:
+            if idx == name:
+                return True
+        return False
+
+    def __getitem__(self, name: bytes) -> Any:
+        for idx, val in self:
+            if idx == name:
+                return val
+        raise IndexError("Name %r not in tree" % name)
diff --git a/playa/exceptions.py b/playa/exceptions.py
@@ -39,10 +39,6 @@ class PDFValueError(PDFException, ValueError):
     pass
 
 
-class PDFObjectNotFound(PDFException):
-    pass
-
-
 class PDFNotImplementedError(PDFException, NotImplementedError):
     pass
 
@@ -79,10 +75,6 @@ class PDFNoStructTree(PDFException):
     pass
 
 
-class PDFDestinationNotFound(PDFException):
-    pass
-
-
 class PDFEncryptionError(PDFException):
     pass
-Original file line number
+Diff line change
@@ Expand Up / @@ -39,10 +39,6 @@ class PDFValueError(PDFException, ValueError): @@
         pass
-    class PDFObjectNotFound(PDFException):
-        pass
     class PDFNotImplementedError(PDFException, NotImplementedError):
         pass
@@ Expand Down Expand Up / @@ -79,10 +75,6 @@ class PDFNoStructTree(PDFException): @@
         pass
-    class PDFDestinationNotFound(PDFException):
-        pass
     class PDFEncryptionError(PDFException):
         pass
@@ Expand Down @@