refactor!: PDFEliminate PDFExtra PDFCharacters PDFEverywhere PDFWe PDFHave PDFNamespaces PDFAfter PDFAll
dhdaines · Oct 29, 2024 · 2b375b8 · 2b375b8
1 parent 1a12046
commit 2b375b8
Show file tree

Hide file tree

Showing 14 changed files with 97 additions and 68 deletions.
diff --git a/playa/__init__.py b/playa/__init__.py
@@ -10,7 +10,7 @@
 from os import PathLike
 from typing import Union
 
-from playa.pdfdocument import PDFDocument
+from playa.document import PDFDocument
 
 __version__ = "0.0.1"
 

diff --git a/playa/pdfcolor.py → playa/color.py b/playa/pdfcolor.py → playa/color.py
diff --git a/playa/pdfdocument.py → playa/document.py b/playa/pdfdocument.py → playa/document.py
@@ -44,19 +44,19 @@
     PDFTypeError,
     PSException,
 )
-from playa.pdffont import (
+from playa.font import (
     PDFCIDFont,
     PDFFont,
     PDFTrueTypeFont,
     PDFType1Font,
     PDFType3Font,
 )
-from playa.pdfpage import PDFPage
-from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
+from playa.page import PDFPage
+from playa.parser import KEYWORD_XREF, PDFParser, ContentStreamParser
 from playa.pdftypes import (
     DecipherCallable,
-    PDFObjRef,
-    PDFStream,
+    ObjRef,
+    ContentStream,
     decipher_all,
     dict_value,
     int_value,
@@ -149,7 +149,8 @@ def _load(self, parser: PDFParser) -> None:
     def _load_trailer(self, parser: PDFParser) -> None:
         try:
             (_, kwd) = parser.nexttoken()
-            assert kwd is KWD(b"trailer"), str(kwd)
+            if kwd is not KWD(b"trailer"):
+                raise PDFSyntaxError("Expected b'trailer', got %r", kwd)
             (_, dic) = next(parser)
         except StopIteration:
             x = parser.pop(1)
@@ -200,7 +201,7 @@ def _load(self, parser: PDFParser) -> None:
             # expand ObjStm.
             parser.seek(pos)
             (_, obj) = next(parser)
-            if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
+            if isinstance(obj, ContentStream) and obj.get("Type") is LITERAL_OBJSTM:
                 stream = stream_value(obj)
                 try:
                     n = stream["N"]
@@ -211,7 +212,7 @@ def _load(self, parser: PDFParser) -> None:
                 doc = parser.doc()
                 if doc is None:
                     raise RuntimeError("Document no longer exists!")
-                parser1 = PDFStreamParser(stream.get_data(), doc)
+                parser1 = ContentStreamParser(stream.get_data(), doc)
                 objs: List = [obj for _, obj in parser1]
                 # FIXME: This is choplist
                 n = min(n, len(objs) // 2)
@@ -240,7 +241,7 @@ def _load(self, parser: PDFParser) -> None:
         (_, genno) = parser.nexttoken()  # ignored
         (_, kwd) = parser.nexttoken()
         (_, stream) = next(parser)
-        if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
+        if not isinstance(stream, ContentStream) or stream.get("Type") is not LITERAL_XREF:
             raise PDFNoValidXRef(f"Invalid PDF stream spec {stream!r}")
         size = stream["Size"]
         index_array = stream.get("Index", (0, size))
@@ -688,7 +689,7 @@ class OutlineItem(NamedTuple):
     # FIXME: Create Destination and Action types
     dest: Union[PSLiteral, bytes, list, None]
     action: Union[dict, None]
-    se: Union[PDFObjRef, None]
+    se: Union[ObjRef, None]
 
 
 class PDFDocument:
@@ -794,6 +795,8 @@ def __init__(
         if self.catalog.get("Type") is not LITERAL_CATALOG:
             if settings.STRICT:
                 raise PDFSyntaxError("Catalog not found!")
+        # Return to the start in the event that somebody wishes to
+        # iterate over top-level objects
         self.parser.seek(0)
 
     def _initialize_password(self, password: str = "") -> None:
@@ -828,7 +831,7 @@ def __iter__(self) -> Iterator[Tuple[int, object]]:
         """Iterate over positions and top-level PDF objects in the file."""
         return self.parser
 
-    def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
+    def _getobj_objstm(self, stream: ContentStream, index: int, objid: int) -> object:
         if stream.objid in self._parsed_objs:
             (objs, n) = self._parsed_objs[stream.objid]
         else:
@@ -842,7 +845,7 @@ def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
             raise PDFSyntaxError("index too big: %r" % index)
         return obj
 
-    def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
+    def _get_objects(self, stream: ContentStream) -> Tuple[List[object], int]:
         if stream.get("Type") is not LITERAL_OBJSTM:
             if settings.STRICT:
                 raise PDFSyntaxError("Not a stream object: %r" % stream)
@@ -852,7 +855,7 @@ def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
             if settings.STRICT:
                 raise PDFSyntaxError("N is not defined: %r" % stream)
             n = 0
-        parser = PDFStreamParser(stream.get_data(), self)
+        parser = ContentStreamParser(stream.get_data(), self)
         objs: List[object] = [obj for _, obj in parser]
         return (objs, n)
 
@@ -919,7 +922,7 @@ def __getitem__(self, objid: int) -> object:
                         if self.decipher:
                             obj = decipher_all(self.decipher, objid, genno, obj)
 
-                    if isinstance(obj, PDFStream):
+                    if isinstance(obj, ContentStream):
                         obj.set_objid(objid, genno)
                     break
                 except (StopIteration, PDFSyntaxError):
@@ -1047,7 +1050,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
         visited = set()
         while stack:
             (obj, parent) = stack.pop()
-            if isinstance(obj, PDFObjRef):
+            if isinstance(obj, ObjRef):
                 # The PDF specification *requires* both the Pages
                 # element of the catalog and the entries in Kids in
                 # the page tree to be indirect references.

diff --git a/playa/pdffont.py → playa/font.py b/playa/pdffont.py → playa/font.py
@@ -35,7 +35,7 @@
 )
 from playa.fontmetrics import FONT_METRICS
 from playa.pdftypes import (
-    PDFStream,
+    ContentStream,
     dict_value,
     int_value,
     list_value,
@@ -1069,7 +1069,7 @@ def __init__(
             ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
         self.unicode_map: Optional[UnicodeMap] = None
         if "ToUnicode" in spec:
-            if isinstance(spec["ToUnicode"], PDFStream):
+            if isinstance(spec["ToUnicode"], ContentStream):
                 strm = stream_value(spec["ToUnicode"])
                 self.unicode_map = FileUnicodeMap()
                 CMapParser(self.unicode_map, strm.get_data()).run()
@@ -1147,8 +1147,8 @@ def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
             if strict:
                 raise PDFFontError("Encoding is unspecified")
 
-        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
-            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
+        if type(cmap_name) is ContentStream:  # type: ignore[comparison-overlap]
+            cmap_name_stream: ContentStream = cast(ContentStream, cmap_name)
             if "CMapName" in cmap_name_stream:
                 cmap_name = cmap_name_stream.get("CMapName").name
             elif strict:

diff --git a/playa/image.py b/playa/image.py
@@ -7,7 +7,7 @@
 from playa.exceptions import PDFValueError
 from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
 from playa.layout import LTImage
-from playa.pdfcolor import (
+from playa.color import (
     LITERAL_DEVICE_CMYK,
     LITERAL_DEVICE_GRAY,
     LITERAL_DEVICE_RGB,

diff --git a/playa/layout.py b/playa/layout.py
@@ -12,9 +12,9 @@
 )
 
 from playa.exceptions import PDFValueError
-from playa.pdfcolor import PDFColorSpace
-from playa.pdffont import PDFFont
-from playa.pdftypes import PDFStream
+from playa.color import PDFColorSpace
+from playa.font import PDFFont
+from playa.pdftypes import ContentStream
 from playa.utils import (
     INF,
     Matrix,
@@ -298,7 +298,7 @@ class LTImage(LTComponent):
     Embedded images can be in JPEG, Bitmap or JBIG2.
     """
 
-    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
+    def __init__(self, name: str, stream: ContentStream, bbox: Rect) -> None:
         LTComponent.__init__(self, bbox)
         self.name = name
         self.stream = stream

diff --git a/playa/pdfpage.py → playa/page.py b/playa/pdfpage.py → playa/page.py
@@ -35,14 +35,14 @@
     LTRect,
     PDFGraphicState,
 )
-from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
-from playa.pdffont import (
+from playa.color import PREDEFINED_COLORSPACE, PDFColorSpace
+from playa.font import (
     PDFFont,
 )
 from playa.pdftypes import (
     LITERALS_ASCII85_DECODE,
-    PDFObjRef,
-    PDFStream,
+    ObjRef,
+    ContentStream,
     dict_value,
     int_value,
     list_value,
@@ -76,7 +76,7 @@
 )
 
 if TYPE_CHECKING:
-    from playa.pdfdocument import PDFDocument
+    from playa.document import PDFDocument
 
 log = logging.getLogger(__name__)
 
@@ -99,7 +99,7 @@ class PDFPage:
     ----------
       pageid: any Python object that can uniquely identify the page.
       attrs: a dictionary of page attributes.
-      contents: a list of PDFStream objects that represents the page content.
+      contents: a list of ContentStream objects that represents the page content.
       resources: a dictionary of resources used by the page.
       mediabox: the physical size of the page.
       cropbox: the crop rectangle of the page.
@@ -240,7 +240,7 @@ def reset(self) -> None:
 KEYWORD_EI = KWD(b"EI")
 
 
-class PDFContentParser(Parser[Union[PSKeyword, PDFStream]]):
+class PDFContentParser(Parser[Union[PSKeyword, ContentStream]]):
     """Parse the concatenation of multiple content streams, as
     described in the spec (PDF 1.7, p.86):
 
@@ -312,7 +312,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                     (pos, data) = self.get_inline_data(target=eos)
                 if pos == -1:
                     raise PDFSyntaxError("End of inline stream %r not found" % eos)
-                obj = PDFStream(d, data)
+                obj = ContentStream(d, data)
                 self.push((pos, obj))
                 # This was included in the data but we need to "parse" it
                 if eos == b"EI":
@@ -324,7 +324,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
             self.push((pos, token))
 
 
-PDFStackT = PSStackType[PDFStream]
+PDFStackT = PSStackType[ContentStream]
 """Types that may appear on the PDF argument stack."""
 
 
@@ -389,7 +389,7 @@ def add_item(self, item: LTComponent) -> None:
         item.tag = self.cur_tag
         self.cur_item.add(item)
 
-    def render_image(self, name: str, stream: PDFStream) -> None:
+    def render_image(self, name: str, stream: ContentStream) -> None:
         assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
         item = LTImage(
             name,
@@ -760,7 +760,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
             if k == "Font":
                 for fontid, spec in dict_value(v).items():
                     objid = None
-                    if isinstance(spec, PDFObjRef):
+                    if isinstance(spec, ObjRef):
                         objid = spec.objid
                     spec = dict_value(spec)
                     self.fontmap[fontid] = doc.get_font(objid, spec)
@@ -1292,7 +1292,7 @@ def do_ID(self) -> None:
 
     def do_EI(self, obj: PDFStackT) -> None:
         """End inline image object"""
-        if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
+        if isinstance(obj, ContentStream) and "W" in obj and "H" in obj:
             iobjid = str(id(obj))
             self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
             self.device.render_image(iobjid, obj)

diff --git a/playa/pdfparser.py → playa/parser.py b/playa/pdfparser.py → playa/parser.py
@@ -6,11 +6,11 @@
 from playa import settings
 from playa.casting import safe_int
 from playa.exceptions import PDFSyntaxError
-from playa.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
+from playa.pdftypes import ObjRef, ContentStream, dict_value, int_value
 from playa.psparser import KWD, Parser, PSKeyword
 
 if TYPE_CHECKING:
-    from playa.pdfdocument import PDFDocument
+    from playa.document import PDFDocument
 
 log = logging.getLogger(__name__)
 
@@ -24,8 +24,8 @@
 KEYWORD_OBJ = KWD(b"obj")
 
 
-# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
-class PDFParser(Parser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
+# PDFParser stack holds all the base types plus ContentStream, ObjRef, and None
+class PDFParser(Parser[Union[PSKeyword, ContentStream, ObjRef, None]]):
     """PDFParser fetches PDF objects from a file stream.
     It holds a weak reference to the document in order to
     resolve indirect references.  If the document is deleted
@@ -63,7 +63,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 (_, _object_id), _ = self.pop(2)
                 object_id = safe_int(_object_id)
                 if object_id is not None:
-                    obj = PDFObjRef(self.doc, object_id)
+                    obj = ObjRef(self.doc, object_id)
                     self.push((pos, obj))
 
         elif token is KEYWORD_STREAM:
@@ -102,7 +102,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
             self.seek(pos + objlen)
             # XXX limit objlen not to exceed object boundary
             log.debug(
-                "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
+                "ContentStream: pos=%d, objlen=%d, dic=%r, data=%r...",
                 pos,
                 objlen,
                 dic,
@@ -111,16 +111,16 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
             doc = self.doc()
             if doc is None:
                 raise RuntimeError("Document no longer exists!")
-            stream = PDFStream(dic, bytes(data), doc.decipher)
+            stream = ContentStream(dic, bytes(data), doc.decipher)
             self.push((pos, stream))
 
         else:
             # others
             self.push((pos, token))
 
 
-class PDFStreamParser(PDFParser):
-    """PDFStreamParser is used to parse PDF content streams
+class ContentStreamParser(PDFParser):
+    """ContentStreamParser is used to parse PDF content streams
     that is contained in each page and has instructions
     for rendering the page. A reference to a PDF document is
     needed because a PDF content stream can also have
@@ -144,7 +144,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 )
             object_id = safe_int(_object_id)
             if object_id is not None:
-                obj = PDFObjRef(self.doc, object_id)
+                obj = ObjRef(self.doc, object_id)
                 self.push((pos, obj))
             return