Skip to content

Commit

Permalink
refactor!: PDFEliminate PDFExtra PDFCharacters PDFEverywhere PDFWe PDF…
Browse files Browse the repository at this point in the history
…Have PDFNamespaces PDFAfter PDFAll
  • Loading branch information
dhdaines committed Oct 29, 2024
1 parent 1a12046 commit 2b375b8
Show file tree
Hide file tree
Showing 14 changed files with 97 additions and 68 deletions.
2 changes: 1 addition & 1 deletion playa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from os import PathLike
from typing import Union

from playa.pdfdocument import PDFDocument
from playa.document import PDFDocument

__version__ = "0.0.1"

Expand Down
File renamed without changes.
33 changes: 18 additions & 15 deletions playa/pdfdocument.py → playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,19 @@
PDFTypeError,
PSException,
)
from playa.pdffont import (
from playa.font import (
PDFCIDFont,
PDFFont,
PDFTrueTypeFont,
PDFType1Font,
PDFType3Font,
)
from playa.pdfpage import PDFPage
from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
from playa.page import PDFPage
from playa.parser import KEYWORD_XREF, PDFParser, ContentStreamParser
from playa.pdftypes import (
DecipherCallable,
PDFObjRef,
PDFStream,
ObjRef,
ContentStream,
decipher_all,
dict_value,
int_value,
Expand Down Expand Up @@ -149,7 +149,8 @@ def _load(self, parser: PDFParser) -> None:
def _load_trailer(self, parser: PDFParser) -> None:
try:
(_, kwd) = parser.nexttoken()
assert kwd is KWD(b"trailer"), str(kwd)
if kwd is not KWD(b"trailer"):
raise PDFSyntaxError("Expected b'trailer', got %r", kwd)
(_, dic) = next(parser)
except StopIteration:
x = parser.pop(1)
Expand Down Expand Up @@ -200,7 +201,7 @@ def _load(self, parser: PDFParser) -> None:
# expand ObjStm.
parser.seek(pos)
(_, obj) = next(parser)
if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
if isinstance(obj, ContentStream) and obj.get("Type") is LITERAL_OBJSTM:
stream = stream_value(obj)
try:
n = stream["N"]
Expand All @@ -211,7 +212,7 @@ def _load(self, parser: PDFParser) -> None:
doc = parser.doc()
if doc is None:
raise RuntimeError("Document no longer exists!")
parser1 = PDFStreamParser(stream.get_data(), doc)
parser1 = ContentStreamParser(stream.get_data(), doc)
objs: List = [obj for _, obj in parser1]
# FIXME: This is choplist
n = min(n, len(objs) // 2)
Expand Down Expand Up @@ -240,7 +241,7 @@ def _load(self, parser: PDFParser) -> None:
(_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken()
(_, stream) = next(parser)
if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
if not isinstance(stream, ContentStream) or stream.get("Type") is not LITERAL_XREF:
raise PDFNoValidXRef(f"Invalid PDF stream spec {stream!r}")
size = stream["Size"]
index_array = stream.get("Index", (0, size))
Expand Down Expand Up @@ -688,7 +689,7 @@ class OutlineItem(NamedTuple):
# FIXME: Create Destination and Action types
dest: Union[PSLiteral, bytes, list, None]
action: Union[dict, None]
se: Union[PDFObjRef, None]
se: Union[ObjRef, None]


class PDFDocument:
Expand Down Expand Up @@ -794,6 +795,8 @@ def __init__(
if self.catalog.get("Type") is not LITERAL_CATALOG:
if settings.STRICT:
raise PDFSyntaxError("Catalog not found!")
# Return to the start in the event that somebody wishes to
# iterate over top-level objects
self.parser.seek(0)

def _initialize_password(self, password: str = "") -> None:
Expand Down Expand Up @@ -828,7 +831,7 @@ def __iter__(self) -> Iterator[Tuple[int, object]]:
"""Iterate over positions and top-level PDF objects in the file."""
return self.parser

def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
def _getobj_objstm(self, stream: ContentStream, index: int, objid: int) -> object:
if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid]
else:
Expand All @@ -842,7 +845,7 @@ def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
raise PDFSyntaxError("index too big: %r" % index)
return obj

def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
def _get_objects(self, stream: ContentStream) -> Tuple[List[object], int]:
if stream.get("Type") is not LITERAL_OBJSTM:
if settings.STRICT:
raise PDFSyntaxError("Not a stream object: %r" % stream)
Expand All @@ -852,7 +855,7 @@ def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
if settings.STRICT:
raise PDFSyntaxError("N is not defined: %r" % stream)
n = 0
parser = PDFStreamParser(stream.get_data(), self)
parser = ContentStreamParser(stream.get_data(), self)
objs: List[object] = [obj for _, obj in parser]
return (objs, n)

Expand Down Expand Up @@ -919,7 +922,7 @@ def __getitem__(self, objid: int) -> object:
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)

if isinstance(obj, PDFStream):
if isinstance(obj, ContentStream):
obj.set_objid(objid, genno)
break
except (StopIteration, PDFSyntaxError):
Expand Down Expand Up @@ -1047,7 +1050,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
visited = set()
while stack:
(obj, parent) = stack.pop()
if isinstance(obj, PDFObjRef):
if isinstance(obj, ObjRef):
# The PDF specification *requires* both the Pages
# element of the catalog and the entries in Kids in
# the page tree to be indirect references.
Expand Down
8 changes: 4 additions & 4 deletions playa/pdffont.py → playa/font.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
)
from playa.fontmetrics import FONT_METRICS
from playa.pdftypes import (
PDFStream,
ContentStream,
dict_value,
int_value,
list_value,
Expand Down Expand Up @@ -1069,7 +1069,7 @@ def __init__(
ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
self.unicode_map: Optional[UnicodeMap] = None
if "ToUnicode" in spec:
if isinstance(spec["ToUnicode"], PDFStream):
if isinstance(spec["ToUnicode"], ContentStream):
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, strm.get_data()).run()
Expand Down Expand Up @@ -1147,8 +1147,8 @@ def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
if strict:
raise PDFFontError("Encoding is unspecified")

if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap]
cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
if type(cmap_name) is ContentStream: # type: ignore[comparison-overlap]
cmap_name_stream: ContentStream = cast(ContentStream, cmap_name)
if "CMapName" in cmap_name_stream:
cmap_name = cmap_name_stream.get("CMapName").name
elif strict:
Expand Down
2 changes: 1 addition & 1 deletion playa/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from playa.exceptions import PDFValueError
from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
from playa.layout import LTImage
from playa.pdfcolor import (
from playa.color import (
LITERAL_DEVICE_CMYK,
LITERAL_DEVICE_GRAY,
LITERAL_DEVICE_RGB,
Expand Down
8 changes: 4 additions & 4 deletions playa/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
)

from playa.exceptions import PDFValueError
from playa.pdfcolor import PDFColorSpace
from playa.pdffont import PDFFont
from playa.pdftypes import PDFStream
from playa.color import PDFColorSpace
from playa.font import PDFFont
from playa.pdftypes import ContentStream
from playa.utils import (
INF,
Matrix,
Expand Down Expand Up @@ -298,7 +298,7 @@ class LTImage(LTComponent):
Embedded images can be in JPEG, Bitmap or JBIG2.
"""

def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
def __init__(self, name: str, stream: ContentStream, bbox: Rect) -> None:
LTComponent.__init__(self, bbox)
self.name = name
self.stream = stream
Expand Down
24 changes: 12 additions & 12 deletions playa/pdfpage.py → playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,14 @@
LTRect,
PDFGraphicState,
)
from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
from playa.pdffont import (
from playa.color import PREDEFINED_COLORSPACE, PDFColorSpace
from playa.font import (
PDFFont,
)
from playa.pdftypes import (
LITERALS_ASCII85_DECODE,
PDFObjRef,
PDFStream,
ObjRef,
ContentStream,
dict_value,
int_value,
list_value,
Expand Down Expand Up @@ -76,7 +76,7 @@
)

if TYPE_CHECKING:
from playa.pdfdocument import PDFDocument
from playa.document import PDFDocument

log = logging.getLogger(__name__)

Expand All @@ -99,7 +99,7 @@ class PDFPage:
----------
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
contents: a list of ContentStream objects that represents the page content.
resources: a dictionary of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
Expand Down Expand Up @@ -240,7 +240,7 @@ def reset(self) -> None:
KEYWORD_EI = KWD(b"EI")


class PDFContentParser(Parser[Union[PSKeyword, PDFStream]]):
class PDFContentParser(Parser[Union[PSKeyword, ContentStream]]):
"""Parse the concatenation of multiple content streams, as
described in the spec (PDF 1.7, p.86):
Expand Down Expand Up @@ -312,7 +312,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
(pos, data) = self.get_inline_data(target=eos)
if pos == -1:
raise PDFSyntaxError("End of inline stream %r not found" % eos)
obj = PDFStream(d, data)
obj = ContentStream(d, data)
self.push((pos, obj))
# This was included in the data but we need to "parse" it
if eos == b"EI":
Expand All @@ -324,7 +324,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
self.push((pos, token))


PDFStackT = PSStackType[PDFStream]
PDFStackT = PSStackType[ContentStream]
"""Types that may appear on the PDF argument stack."""


Expand Down Expand Up @@ -389,7 +389,7 @@ def add_item(self, item: LTComponent) -> None:
item.tag = self.cur_tag
self.cur_item.add(item)

def render_image(self, name: str, stream: PDFStream) -> None:
def render_image(self, name: str, stream: ContentStream) -> None:
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
item = LTImage(
name,
Expand Down Expand Up @@ -760,7 +760,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
if k == "Font":
for fontid, spec in dict_value(v).items():
objid = None
if isinstance(spec, PDFObjRef):
if isinstance(spec, ObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = doc.get_font(objid, spec)
Expand Down Expand Up @@ -1292,7 +1292,7 @@ def do_ID(self) -> None:

def do_EI(self, obj: PDFStackT) -> None:
"""End inline image object"""
if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
if isinstance(obj, ContentStream) and "W" in obj and "H" in obj:
iobjid = str(id(obj))
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(iobjid, obj)
Expand Down
20 changes: 10 additions & 10 deletions playa/pdfparser.py → playa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from playa import settings
from playa.casting import safe_int
from playa.exceptions import PDFSyntaxError
from playa.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
from playa.pdftypes import ObjRef, ContentStream, dict_value, int_value
from playa.psparser import KWD, Parser, PSKeyword

if TYPE_CHECKING:
from playa.pdfdocument import PDFDocument
from playa.document import PDFDocument

log = logging.getLogger(__name__)

Expand All @@ -24,8 +24,8 @@
KEYWORD_OBJ = KWD(b"obj")


# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(Parser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
# PDFParser stack holds all the base types plus ContentStream, ObjRef, and None
class PDFParser(Parser[Union[PSKeyword, ContentStream, ObjRef, None]]):
"""PDFParser fetches PDF objects from a file stream.
It holds a weak reference to the document in order to
resolve indirect references. If the document is deleted
Expand Down Expand Up @@ -63,7 +63,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
(_, _object_id), _ = self.pop(2)
object_id = safe_int(_object_id)
if object_id is not None:
obj = PDFObjRef(self.doc, object_id)
obj = ObjRef(self.doc, object_id)
self.push((pos, obj))

elif token is KEYWORD_STREAM:
Expand Down Expand Up @@ -102,7 +102,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
self.seek(pos + objlen)
# XXX limit objlen not to exceed object boundary
log.debug(
"Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
"ContentStream: pos=%d, objlen=%d, dic=%r, data=%r...",
pos,
objlen,
dic,
Expand All @@ -111,16 +111,16 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
doc = self.doc()
if doc is None:
raise RuntimeError("Document no longer exists!")
stream = PDFStream(dic, bytes(data), doc.decipher)
stream = ContentStream(dic, bytes(data), doc.decipher)
self.push((pos, stream))

else:
# others
self.push((pos, token))


class PDFStreamParser(PDFParser):
"""PDFStreamParser is used to parse PDF content streams
class ContentStreamParser(PDFParser):
"""ContentStreamParser is used to parse PDF content streams
that are contained in each page and have instructions
for rendering the page. A reference to a PDF document is
needed because a PDF content stream can also have
Expand All @@ -144,7 +144,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
)
object_id = safe_int(_object_id)
if object_id is not None:
obj = PDFObjRef(self.doc, object_id)
obj = ObjRef(self.doc, object_id)
self.push((pos, obj))
return

Expand Down
Loading

0 comments on commit 2b375b8

Please sign in to comment.