diff --git a/playa/__init__.py b/playa/__init__.py index 20f87e01..1020e0d8 100644 --- a/playa/__init__.py +++ b/playa/__init__.py @@ -10,7 +10,7 @@ from os import PathLike from typing import Union -from playa.pdfdocument import PDFDocument +from playa.document import PDFDocument __version__ = "0.0.1" diff --git a/playa/pdfcolor.py b/playa/color.py similarity index 100% rename from playa/pdfcolor.py rename to playa/color.py diff --git a/playa/pdfdocument.py b/playa/document.py similarity index 97% rename from playa/pdfdocument.py rename to playa/document.py index a3468f54..fed82352 100644 --- a/playa/pdfdocument.py +++ b/playa/document.py @@ -44,19 +44,19 @@ PDFTypeError, PSException, ) -from playa.pdffont import ( +from playa.font import ( PDFCIDFont, PDFFont, PDFTrueTypeFont, PDFType1Font, PDFType3Font, ) -from playa.pdfpage import PDFPage -from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser +from playa.page import PDFPage +from playa.parser import KEYWORD_XREF, PDFParser, ContentStreamParser from playa.pdftypes import ( DecipherCallable, - PDFObjRef, - PDFStream, + ObjRef, + ContentStream, decipher_all, dict_value, int_value, @@ -149,7 +149,8 @@ def _load(self, parser: PDFParser) -> None: def _load_trailer(self, parser: PDFParser) -> None: try: (_, kwd) = parser.nexttoken() - assert kwd is KWD(b"trailer"), str(kwd) + if kwd is not KWD(b"trailer"): + raise PDFSyntaxError("Expected b'trailer', got %r", kwd) (_, dic) = next(parser) except StopIteration: x = parser.pop(1) @@ -200,7 +201,7 @@ def _load(self, parser: PDFParser) -> None: # expand ObjStm. 
parser.seek(pos) (_, obj) = next(parser) - if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM: + if isinstance(obj, ContentStream) and obj.get("Type") is LITERAL_OBJSTM: stream = stream_value(obj) try: n = stream["N"] @@ -211,7 +212,7 @@ def _load(self, parser: PDFParser) -> None: doc = parser.doc() if doc is None: raise RuntimeError("Document no longer exists!") - parser1 = PDFStreamParser(stream.get_data(), doc) + parser1 = ContentStreamParser(stream.get_data(), doc) objs: List = [obj for _, obj in parser1] # FIXME: This is choplist n = min(n, len(objs) // 2) @@ -240,7 +241,7 @@ def _load(self, parser: PDFParser) -> None: (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() (_, stream) = next(parser) - if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF: + if not isinstance(stream, ContentStream) or stream.get("Type") is not LITERAL_XREF: raise PDFNoValidXRef(f"Invalid PDF stream spec {stream!r}") size = stream["Size"] index_array = stream.get("Index", (0, size)) @@ -688,7 +689,7 @@ class OutlineItem(NamedTuple): # FIXME: Create Destination and Action types dest: Union[PSLiteral, bytes, list, None] action: Union[dict, None] - se: Union[PDFObjRef, None] + se: Union[ObjRef, None] class PDFDocument: @@ -794,6 +795,8 @@ def __init__( if self.catalog.get("Type") is not LITERAL_CATALOG: if settings.STRICT: raise PDFSyntaxError("Catalog not found!") + # Return to the start in the event that somebody wishes to + # iterate over top-level objects self.parser.seek(0) def _initialize_password(self, password: str = "") -> None: @@ -828,7 +831,7 @@ def __iter__(self) -> Iterator[Tuple[int, object]]: """Iterate over positions and top-level PDF objects in the file.""" return self.parser - def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object: + def _getobj_objstm(self, stream: ContentStream, index: int, objid: int) -> object: if stream.objid in self._parsed_objs: (objs, n) = 
self._parsed_objs[stream.objid] else: @@ -842,7 +845,7 @@ def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object: raise PDFSyntaxError("index too big: %r" % index) return obj - def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: + def _get_objects(self, stream: ContentStream) -> Tuple[List[object], int]: if stream.get("Type") is not LITERAL_OBJSTM: if settings.STRICT: raise PDFSyntaxError("Not a stream object: %r" % stream) @@ -852,7 +855,7 @@ def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: if settings.STRICT: raise PDFSyntaxError("N is not defined: %r" % stream) n = 0 - parser = PDFStreamParser(stream.get_data(), self) + parser = ContentStreamParser(stream.get_data(), self) objs: List[object] = [obj for _, obj in parser] return (objs, n) @@ -919,7 +922,7 @@ def __getitem__(self, objid: int) -> object: if self.decipher: obj = decipher_all(self.decipher, objid, genno, obj) - if isinstance(obj, PDFStream): + if isinstance(obj, ContentStream): obj.set_objid(objid, genno) break except (StopIteration, PDFSyntaxError): @@ -1047,7 +1050,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]: visited = set() while stack: (obj, parent) = stack.pop() - if isinstance(obj, PDFObjRef): + if isinstance(obj, ObjRef): # The PDF specification *requires* both the Pages # element of the catalog and the entries in Kids in # the page tree to be indirect references. 
diff --git a/playa/pdffont.py b/playa/font.py similarity index 99% rename from playa/pdffont.py rename to playa/font.py index ba75c5a1..e370d2ef 100644 --- a/playa/pdffont.py +++ b/playa/font.py @@ -35,7 +35,7 @@ ) from playa.fontmetrics import FONT_METRICS from playa.pdftypes import ( - PDFStream, + ContentStream, dict_value, int_value, list_value, @@ -1069,7 +1069,7 @@ def __init__( ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) self.unicode_map: Optional[UnicodeMap] = None if "ToUnicode" in spec: - if isinstance(spec["ToUnicode"], PDFStream): + if isinstance(spec["ToUnicode"], ContentStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, strm.get_data()).run() @@ -1147,8 +1147,8 @@ def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str: if strict: raise PDFFontError("Encoding is unspecified") - if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] - cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) + if type(cmap_name) is ContentStream: # type: ignore[comparison-overlap] + cmap_name_stream: ContentStream = cast(ContentStream, cmap_name) if "CMapName" in cmap_name_stream: cmap_name = cmap_name_stream.get("CMapName").name elif strict: diff --git a/playa/image.py b/playa/image.py index 714bd2af..11421201 100644 --- a/playa/image.py +++ b/playa/image.py @@ -7,7 +7,7 @@ from playa.exceptions import PDFValueError from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter from playa.layout import LTImage -from playa.pdfcolor import ( +from playa.color import ( LITERAL_DEVICE_CMYK, LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, diff --git a/playa/layout.py b/playa/layout.py index 0085b12f..8eed5ae3 100644 --- a/playa/layout.py +++ b/playa/layout.py @@ -12,9 +12,9 @@ ) from playa.exceptions import PDFValueError -from playa.pdfcolor import PDFColorSpace -from playa.pdffont import PDFFont -from playa.pdftypes import PDFStream +from playa.color import PDFColorSpace 
+from playa.font import PDFFont +from playa.pdftypes import ContentStream from playa.utils import ( INF, Matrix, @@ -298,7 +298,7 @@ class LTImage(LTComponent): Embedded images can be in JPEG, Bitmap or JBIG2. """ - def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None: + def __init__(self, name: str, stream: ContentStream, bbox: Rect) -> None: LTComponent.__init__(self, bbox) self.name = name self.stream = stream diff --git a/playa/pdfpage.py b/playa/page.py similarity index 98% rename from playa/pdfpage.py rename to playa/page.py index f9b2e4e0..f6be866c 100644 --- a/playa/pdfpage.py +++ b/playa/page.py @@ -35,14 +35,14 @@ LTRect, PDFGraphicState, ) -from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace -from playa.pdffont import ( +from playa.color import PREDEFINED_COLORSPACE, PDFColorSpace +from playa.font import ( PDFFont, ) from playa.pdftypes import ( LITERALS_ASCII85_DECODE, - PDFObjRef, - PDFStream, + ObjRef, + ContentStream, dict_value, int_value, list_value, @@ -76,7 +76,7 @@ ) if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument log = logging.getLogger(__name__) @@ -99,7 +99,7 @@ class PDFPage: ---------- pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. - contents: a list of PDFStream objects that represents the page content. + contents: a list of ContentStream objects that represents the page content. resources: a dictionary of resources used by the page. mediabox: the physical size of the page. cropbox: the crop rectangle of the page. 
@@ -240,7 +240,7 @@ def reset(self) -> None: KEYWORD_EI = KWD(b"EI") -class PDFContentParser(Parser[Union[PSKeyword, PDFStream]]): +class PDFContentParser(Parser[Union[PSKeyword, ContentStream]]): """Parse the concatenation of multiple content streams, as described in the spec (PDF 1.7, p.86): @@ -312,7 +312,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: (pos, data) = self.get_inline_data(target=eos) if pos == -1: raise PDFSyntaxError("End of inline stream %r not found" % eos) - obj = PDFStream(d, data) + obj = ContentStream(d, data) self.push((pos, obj)) # This was included in the data but we need to "parse" it if eos == b"EI": @@ -324,7 +324,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: self.push((pos, token)) -PDFStackT = PSStackType[PDFStream] +PDFStackT = PSStackType[ContentStream] """Types that may appear on the PDF argument stack.""" @@ -389,7 +389,7 @@ def add_item(self, item: LTComponent) -> None: item.tag = self.cur_tag self.cur_item.add(item) - def render_image(self, name: str, stream: PDFStream) -> None: + def render_image(self, name: str, stream: ContentStream) -> None: assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) item = LTImage( name, @@ -760,7 +760,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]: if k == "Font": for fontid, spec in dict_value(v).items(): objid = None - if isinstance(spec, PDFObjRef): + if isinstance(spec, ObjRef): objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = doc.get_font(objid, spec) @@ -1292,7 +1292,7 @@ def do_ID(self) -> None: def do_EI(self, obj: PDFStackT) -> None: """End inline image object""" - if isinstance(obj, PDFStream) and "W" in obj and "H" in obj: + if isinstance(obj, ContentStream) and "W" in obj and "H" in obj: iobjid = str(id(obj)) self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY) self.device.render_image(iobjid, obj) diff --git a/playa/pdfparser.py b/playa/parser.py similarity index 89% rename from 
playa/pdfparser.py rename to playa/parser.py index ef787b2a..0670fb9b 100644 --- a/playa/pdfparser.py +++ b/playa/parser.py @@ -6,11 +6,11 @@ from playa import settings from playa.casting import safe_int from playa.exceptions import PDFSyntaxError -from playa.pdftypes import PDFObjRef, PDFStream, dict_value, int_value +from playa.pdftypes import ObjRef, ContentStream, dict_value, int_value from playa.psparser import KWD, Parser, PSKeyword if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument log = logging.getLogger(__name__) @@ -24,8 +24,8 @@ KEYWORD_OBJ = KWD(b"obj") -# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None -class PDFParser(Parser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): +# PDFParser stack holds all the base types plus ContentStream, ObjRef, and None +class PDFParser(Parser[Union[PSKeyword, ContentStream, ObjRef, None]]): """PDFParser fetches PDF objects from a file stream. It holds a weak reference to the document in order to resolve indirect references. 
If the document is deleted @@ -63,7 +63,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: (_, _object_id), _ = self.pop(2) object_id = safe_int(_object_id) if object_id is not None: - obj = PDFObjRef(self.doc, object_id) + obj = ObjRef(self.doc, object_id) self.push((pos, obj)) elif token is KEYWORD_STREAM: @@ -102,7 +102,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: self.seek(pos + objlen) # XXX limit objlen not to exceed object boundary log.debug( - "Stream: pos=%d, objlen=%d, dic=%r, data=%r...", + "ContentStream: pos=%d, objlen=%d, dic=%r, data=%r...", pos, objlen, dic, @@ -111,7 +111,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: doc = self.doc() if doc is None: raise RuntimeError("Document no longer exists!") - stream = PDFStream(dic, bytes(data), doc.decipher) + stream = ContentStream(dic, bytes(data), doc.decipher) self.push((pos, stream)) else: @@ -119,8 +119,8 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: self.push((pos, token)) -class PDFStreamParser(PDFParser): - """PDFStreamParser is used to parse PDF content streams +class ContentStreamParser(PDFParser): + """ContentStreamParser is used to parse PDF content streams that is contained in each page and has instructions for rendering the page. 
A reference to a PDF document is needed because a PDF content stream can also have @@ -144,7 +144,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: ) object_id = safe_int(_object_id) if object_id is not None: - obj = PDFObjRef(self.doc, object_id) + obj = ObjRef(self.doc, object_id) self.push((pos, obj)) return diff --git a/playa/pdftypes.py b/playa/pdftypes.py index 43f71cff..f703ee80 100644 --- a/playa/pdftypes.py +++ b/playa/pdftypes.py @@ -30,7 +30,7 @@ from playa.utils import apply_png_predictor if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument logger = logging.getLogger(__name__) @@ -64,7 +64,7 @@ def __call__( _DEFAULT = object() -class PDFObjRef: +class ObjRef: def __init__( self, doc: weakref.ReferenceType["PDFDocument"], @@ -83,7 +83,7 @@ def __init__( self.objid = objid def __repr__(self) -> str: - return "" % (self.objid) + return "" % (self.objid) def resolve(self, default: object = None) -> Any: doc = self.doc() @@ -101,7 +101,7 @@ def resolve1(x: object, default: object = None) -> Any: If this is an array or dictionary, it may still contains some indirect objects inside. """ - while isinstance(x, PDFObjRef): + while isinstance(x, ObjRef): x = x.resolve(default=default) return x @@ -112,7 +112,7 @@ def resolve_all(x: object, default: object = None) -> Any: Make sure there is no indirect reference within the nested object. This procedure might be slow. 
""" - while isinstance(x, PDFObjRef): + while isinstance(x, ObjRef): x = x.resolve(default=default) if isinstance(x, list): x = [resolve_all(v, default=default) for v in x] @@ -200,12 +200,12 @@ def dict_value(x: object) -> Dict[Any, Any]: return x -def stream_value(x: object) -> "PDFStream": +def stream_value(x: object) -> "ContentStream": x = resolve1(x) - if not isinstance(x, PDFStream): + if not isinstance(x, ContentStream): if settings.STRICT: - raise PDFTypeError("PDFStream required: %r" % x) - return PDFStream({}, b"") + raise PDFTypeError("ContentStream required: %r" % x) + return ContentStream({}, b"") return x @@ -230,7 +230,7 @@ def decompress_corrupted(data: bytes) -> bytes: return result_str -class PDFStream: +class ContentStream: def __init__( self, attrs: Dict[str, Any], @@ -252,14 +252,14 @@ def set_objid(self, objid: int, genno: int) -> None: def __repr__(self) -> str: if self.data is None: assert self.rawdata is not None - return "" % ( + return "" % ( self.objid, len(self.rawdata), self.attrs, ) else: assert self.data is not None - return "" % ( + return "" % ( self.objid, len(self.data), self.attrs, diff --git a/playa/pdfstructtree.py b/playa/structtree.py similarity index 98% rename from playa/pdfstructtree.py rename to playa/structtree.py index 5f1c69e8..504103e9 100644 --- a/playa/pdfstructtree.py +++ b/playa/structtree.py @@ -17,16 +17,16 @@ from playa.data_structures import NumberTree from playa.exceptions import PDFNoStructTree -from playa.pdfpage import PDFPage -from playa.pdfparser import KEYWORD_NULL -from playa.pdftypes import PDFObjRef, resolve1 +from playa.page import PDFPage +from playa.parser import KEYWORD_NULL +from playa.pdftypes import ObjRef, resolve1 from playa.psparser import PSLiteral from playa.utils import decode_text logger = logging.getLogger(__name__) if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument MatchFunc = Callable[["PDFStructElement"], bool] @@ -368,7 +368,7 
@@ def _parse_struct_tree(self) -> None: child = obj["Obj"] elif "MCID" in obj: continue - if isinstance(child, PDFObjRef): + if isinstance(child, ObjRef): d.append(child) # Traverse depth-first, removing empty elements (unsure how to @@ -438,7 +438,7 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None: elif "Obj" in obj: child = obj["Obj"] # NOTE: if, not elif, in case of OBJR above - if isinstance(child, PDFObjRef): + if isinstance(child, ObjRef): child_element, _ = seen.get(repr(child), (None, None)) if child_element is not None: element.children.append(child_element) diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index c8e089a1..044fe584 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -304,7 +304,7 @@ def bench_mmap(): def bench_playa(): - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument bench_bytes() bench_mmap() diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index 9775ab70..646221be 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -11,7 +11,7 @@ import playa.settings from playa.data_structures import NameTree from playa.exceptions import PDFSyntaxError -from playa.pdfdocument import read_header +from playa.document import read_header from playa.utils import decode_text playa.settings.STRICT = True diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py new file mode 100644 index 00000000..bca21c37 --- /dev/null +++ b/tests/test_pdfparser.py @@ -0,0 +1,26 @@ +import logging +from typing import Any, List, Tuple +from pathlib import Path + +import pytest + +from playa.parser import PDFParser, ContentStream, ContentStreamParser + + +TESTDIR = Path(__file__).parent.parent / "samples" + + +class MockDoc: + def __call__(self): + return self + decipher = None + + +def test_indirect_objects(): + """Verify that indirect objects are parsed properly.""" + with open(TESTDIR / "simple2.pdf", "rb") as infh: + data = infh.read() + doc = 
MockDoc() + parser = PDFParser(data, doc) + for obj in parser: + print(obj) diff --git a/tests/test_pdfstructtree.py b/tests/test_pdfstructtree.py index ae5fe15b..8a45d519 100644 --- a/tests/test_pdfstructtree.py +++ b/tests/test_pdfstructtree.py @@ -3,7 +3,7 @@ from pathlib import Path import playa -from playa.pdfstructtree import PDFStructTree +from playa.structtree import PDFStructTree TESTDIR = Path(__file__).parent.parent / "samples"