From 8aaf9abde665474105de4a7123ef40d38229de5d Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 29 Oct 2024 14:38:15 -0400 Subject: [PATCH] refactor!: there can be only one (parser) --- playa/cmapdb.py | 2 +- playa/color.py | 2 +- playa/document.py | 38 ++- playa/encodingdb.py | 2 +- playa/font.py | 16 +- playa/image.py | 6 +- playa/layout.py | 2 +- playa/page.py | 26 +- playa/parser.py | 473 ++++++++++++++++++++++++++++++- playa/pdftypes.py | 118 +++++++- playa/psparser.py | 574 -------------------------------------- playa/structtree.py | 3 +- tests/benchmark_parser.py | 4 +- tests/test_pdfdocument.py | 2 +- tests/test_pdfparser.py | 8 +- tests/test_psparser.py | 8 +- 16 files changed, 642 insertions(+), 642 deletions(-) delete mode 100755 playa/psparser.py diff --git a/playa/cmapdb.py b/playa/cmapdb.py index 14d3979b..bd23e428 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -33,7 +33,7 @@ from playa.encodingdb import name2unicode from playa.exceptions import PDFException, PDFTypeError, PSSyntaxError -from playa.psparser import KWD, Parser, PSKeyword, PSLiteral, literal_name +from playa.parser import KWD, Parser, PSKeyword, PSLiteral, literal_name from playa.utils import choplist, nunpack log = logging.getLogger(__name__) diff --git a/playa/color.py b/playa/color.py index b4c2021f..1bc1bcb4 100644 --- a/playa/color.py +++ b/playa/color.py @@ -1,7 +1,7 @@ import collections from typing import Dict -from playa.psparser import LIT +from playa.parser import LIT LITERAL_DEVICE_GRAY = LIT("DeviceGray") LITERAL_DEVICE_RGB = LIT("DeviceRGB") diff --git a/playa/document.py b/playa/document.py index fed82352..acf4bc3d 100644 --- a/playa/document.py +++ b/playa/document.py @@ -44,19 +44,22 @@ PDFTypeError, PSException, ) -from playa.font import ( - PDFCIDFont, - PDFFont, - PDFTrueTypeFont, - PDFType1Font, - PDFType3Font, -) +from playa.font import PDFCIDFont, PDFFont, PDFTrueTypeFont, PDFType1Font, PDFType3Font from playa.page import PDFPage -from playa.parser import KEYWORD_XREF, PDFParser, ContentStreamParser +from playa.parser import ( + KEYWORD_OBJ, + KEYWORD_TRAILER, + KEYWORD_XREF, + LIT, + ContentStreamParser, + PDFParser, + PSLiteral, + literal_name, +) from playa.pdftypes import ( + ContentStream, DecipherCallable, ObjRef, - ContentStream, decipher_all, dict_value, int_value, @@ -66,7 +69,6 @@ stream_value, uint_value, ) -from playa.psparser import KWD, LIT, PSLiteral, literal_name from playa.utils import ( choplist, decode_text, @@ -88,7 +90,6 @@ LITERAL_CATALOG = LIT("Catalog") LITERAL_PAGE = LIT("Page") LITERAL_PAGES = LIT("Pages") -KEYWORD_OBJ = KWD(b"obj") INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"} @@ -149,8 +150,14 @@ def _load(self, parser: PDFParser) -> None: def _load_trailer(self, parser: PDFParser) -> None: try: (_, kwd) = parser.nexttoken() - if kwd is not KWD(b"trailer"): - raise PDFSyntaxError("Expected b'trailer', got %r", kwd) + if kwd is not KEYWORD_TRAILER: + raise PDFSyntaxError( + "Expected %r, got %r" + % ( + KEYWORD_TRAILER, + kwd, + ) + ) (_, dic) = next(parser) except StopIteration: x = parser.pop(1) @@ -241,7 +248,10 @@ def _load(self, parser: PDFParser) -> None: (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() (_, stream) = next(parser) - if not isinstance(stream, ContentStream) or stream.get("Type") is not LITERAL_XREF: + if ( + not isinstance(stream, ContentStream) + or stream.get("Type") is not LITERAL_XREF + ): raise PDFNoValidXRef(f"Invalid PDF stream spec {stream!r}") size = stream["Size"] index_array = stream.get("Index", (0, size)) diff --git a/playa/encodingdb.py b/playa/encodingdb.py index c44a2742..259f1f14 100644 --- a/playa/encodingdb.py +++ b/playa/encodingdb.py @@ -5,7 +5,7 @@ from playa.exceptions import PDFKeyError from playa.glyphlist import glyphname2unicode from playa.latin_enc import ENCODING -from playa.psparser import PSLiteral +from playa.parser import PSLiteral HEXADECIMAL = re.compile(r"[0-9a-fA-F]+") diff --git a/playa/font.py b/playa/font.py index e370d2ef..c35c5277 100644 --- a/playa/font.py +++ b/playa/font.py @@ -34,6 +34,14 @@ PDFValueError, ) from playa.fontmetrics import FONT_METRICS +from playa.parser import ( + KWD, + LIT, + Parser, + PSKeyword, + PSLiteral, + literal_name, +) from playa.pdftypes import ( ContentStream, dict_value, @@ -44,14 +52,6 @@ resolve_all, stream_value, ) -from playa.psparser import ( - KWD, - LIT, - Parser, - PSKeyword, - PSLiteral, - literal_name, -) from playa.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack log = logging.getLogger(__name__) diff --git a/playa/image.py b/playa/image.py index 11421201..5c35cfa4 100644 --- a/playa/image.py +++ b/playa/image.py @@ -4,9 +4,6 @@ from io import BytesIO from typing import BinaryIO, Literal, Tuple -from playa.exceptions import PDFValueError -from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter -from playa.layout import LTImage from playa.color import ( LITERAL_DEVICE_CMYK, LITERAL_DEVICE_GRAY, @@ -14,6 +11,9 @@ LITERAL_INLINE_DEVICE_GRAY, LITERAL_INLINE_DEVICE_RGB, ) +from playa.exceptions import PDFValueError +from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter +from playa.layout import LTImage from playa.pdftypes import ( LITERALS_DCT_DECODE, LITERALS_FLATE_DECODE, diff --git a/playa/layout.py b/playa/layout.py index 8eed5ae3..ccb96936 100644 --- a/playa/layout.py +++ b/playa/layout.py @@ -11,8 +11,8 @@ cast, ) -from playa.exceptions import PDFValueError from playa.color import PDFColorSpace +from playa.exceptions import PDFValueError from playa.font import PDFFont from playa.pdftypes import ContentStream from playa.utils import ( diff --git a/playa/page.py b/playa/page.py index f6be866c..42f8a80e 100644 --- a/playa/page.py +++ b/playa/page.py @@ -15,6 +15,7 @@ from playa import settings from playa.casting import safe_float +from playa.color import PREDEFINED_COLORSPACE, PDFColorSpace from playa.exceptions import ( PDFInterpreterError, PDFSyntaxError, @@ -22,6 +23,7 @@ PDFValueError, PSTypeError, ) +from playa.font import PDFFont from playa.layout import ( Color, LTChar, @@ -35,31 +37,23 @@ LTRect, PDFGraphicState, ) -from playa.color import PREDEFINED_COLORSPACE, PDFColorSpace -from playa.font import ( - PDFFont, -) +from playa.parser import Parser, PSBaseParserToken, PSStackType from playa.pdftypes import ( + KWD, + LIT, LITERALS_ASCII85_DECODE, - ObjRef, ContentStream, + ObjRef, + PSKeyword, + PSLiteral, dict_value, int_value, + keyword_name, list_value, + literal_name, resolve1, stream_value, ) -from playa.psparser import ( - KWD, - LIT, - Parser, - PSBaseParserToken, - PSKeyword, - PSLiteral, - PSStackType, - keyword_name, - literal_name, -) from playa.utils import ( MATRIX_IDENTITY, Matrix, diff --git a/playa/parser.py b/playa/parser.py index 0670fb9b..96999fb4 100644 --- a/playa/parser.py +++ b/playa/parser.py @@ -1,20 +1,51 @@ import logging import mmap +import re import weakref -from typing import TYPE_CHECKING, Union +from binascii import unhexlify +from collections import deque +from typing import ( + TYPE_CHECKING, + Deque, + Dict, + Generic, + Iterator, + List, + Optional, + Tuple, + TypeVar, + Union, +) from playa import settings from playa.casting import safe_int -from playa.exceptions import PDFSyntaxError -from playa.pdftypes import ObjRef, ContentStream, dict_value, int_value -from playa.psparser import KWD, Parser, PSKeyword +from playa.exceptions import PDFSyntaxError, PSException, PSSyntaxError, PSTypeError +from playa.pdftypes import ( + KWD, + LIT, + ContentStream, + ObjRef, + PSKeyword, + PSLiteral, + dict_value, + int_value, + literal_name, + name_str, +) +from playa.utils import choplist +log = logging.getLogger(__name__) if TYPE_CHECKING: from playa.document import PDFDocument -log = logging.getLogger(__name__) - -# Important keywords +# Intern a bunch of important keywords +KEYWORD_PROC_BEGIN = KWD(b"{") +KEYWORD_PROC_END = KWD(b"}") +KEYWORD_ARRAY_BEGIN = KWD(b"[") +KEYWORD_ARRAY_END = KWD(b"]") +KEYWORD_DICT_BEGIN = KWD(b"<<") +KEYWORD_DICT_END = KWD(b">>") +KEYWORD_GT = KWD(b">") KEYWORD_R = KWD(b"R") KEYWORD_NULL = KWD(b"null") KEYWORD_ENDOBJ = KWD(b"endobj") @@ -22,6 +53,434 @@ KEYWORD_XREF = KWD(b"xref") KEYWORD_STARTXREF = KWD(b"startxref") KEYWORD_OBJ = KWD(b"obj") +KEYWORD_TRAILER = KWD(b"trailer") + + +EOL = b"\r\n" +WHITESPACE = b" \t\n\r\f\v" +NUMBER = b"0123456789" +HEX = NUMBER + b"abcdef" + b"ABCDEF" +NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE +NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE +NOTSTRING = b"()\\" +OCTAL = b"01234567" +ESC_STRING = { + b"b": 8, + b"t": 9, + b"n": 10, + b"f": 12, + b"r": 13, + b"(": 40, + b")": 41, + b"\\": 92, +} + + +PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] +LEXER = re.compile( + rb"""(?: + (?P \s+) + | (?P %[^\r\n]*[\r\n]) + | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) + | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) + | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) + | (?P \([^()\\]*) + | (?P <[A-Fa-f\d\s]*>) + | (?P <<) + | (?P >>) + | (?P .) +) +""", + re.VERBOSE, +) +STRLEXER = re.compile( + rb"""(?: + (?P \\[0-7]{1,3}) + | (?P \\(?:\r\n?|\n)) + | (?P \\.) + | (?P \() + | (?P \)) + | (?P \r\n?|\n) + | (?P .) +)""", + re.VERBOSE, +) +HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") +EOLR = re.compile(rb"\r\n?|\n") +SPC = re.compile(rb"\s") + + +class Lexer: + """Lexer for PDF data.""" + + def __init__(self, data: Union[bytes, mmap.mmap]) -> None: + self.data = data + self.pos = 0 + self.end = len(data) + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() + + def seek(self, pos: int) -> None: + """Seek to a position and reinitialize parser state.""" + self.pos = pos + self._curtoken = b"" + self._curtokenpos = 0 + self._tokens.clear() + + def tell(self) -> int: + """Get the current position in the buffer.""" + return self.pos + + def read(self, objlen: int) -> bytes: + """Read data from current position, advancing to the end of + this data.""" + pos = self.pos + self.pos = min(pos + objlen, len(self.data)) + return self.data[pos : self.pos] + + def iter_lines(self) -> Iterator[Tuple[int, bytes]]: + r"""Iterate over lines that end either with \r, \n, or \r\n, + starting at the current position.""" + while self.pos < self.end: + linepos = self.pos + m = EOLR.search(self.data, self.pos) + if m is None: + self.pos = self.end + else: + self.pos = m.end() + yield (linepos, self.data[linepos : self.pos]) + + def reverse_iter_lines(self) -> Iterator[bytes]: + """Iterate backwards over lines starting at the current position. + + This is used to locate the trailers at the end of a file. + """ + endline = self.pos + while True: + nidx = self.data.rfind(b"\n", 0, self.pos) + ridx = self.data.rfind(b"\r", 0, self.pos) + best = max(nidx, ridx) + if best == -1: + yield self.data[:endline] + break + yield self.data[best + 1 : endline] + endline = best + 1 + self.pos = best + if self.pos > 0 and self.data[self.pos - 1 : self.pos + 1] == b"\r\n": + self.pos -= 1 + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = -1 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + tpos = self.data.find(target, self.pos) + if tpos != -1: + nextpos = tpos + len(target) + result = (tpos, self.data[self.pos : nextpos]) + self.pos = nextpos + return result + return (-1, b"") + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + while True: + m = LEXER.match(self.data, self.pos) + if m is None: # can only happen at EOS + raise StopIteration + self._curtokenpos = m.start() + self.pos = m.end() + if m.lastgroup not in ("whitespace", "comment"): # type: ignore + # Okay, we got a token or something + break + self._curtoken = m[0] + if m.lastgroup == "name": # type: ignore + self._curtoken = m[0][1:] + self._curtoken = HEXDIGIT.sub( + lambda x: bytes((int(x[1], 16),)), self._curtoken + ) + tok = LIT(name_str(self._curtoken)) + return (self._curtokenpos, tok) + if m.lastgroup == "number": # type: ignore + if b"." in self._curtoken: + return (self._curtokenpos, float(self._curtoken)) + else: + return (self._curtokenpos, int(self._curtoken)) + if m.lastgroup == "startdict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_BEGIN) + if m.lastgroup == "enddict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_END) + if m.lastgroup == "startstr": # type: ignore + return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end()) + if m.lastgroup == "hexstr": # type: ignore + self._curtoken = SPC.sub(b"", self._curtoken[1:-1]) + if len(self._curtoken) % 2 == 1: + self._curtoken += b"0" + return (self._curtokenpos, unhexlify(self._curtoken)) + # Anything else is treated as a keyword (whether explicitly matched or not) + if self._curtoken == b"true": + return (self._curtokenpos, True) + elif self._curtoken == b"false": + return (self._curtokenpos, False) + else: + return (self._curtokenpos, KWD(self._curtoken)) + + def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: + """Parse the remainder of a string.""" + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts = [EOLR.sub(b"\n", start)] + paren = 1 + for m in STRLEXER.finditer(self.data, pos): + self.pos = m.end() + if m.lastgroup == "parenright": # type: ignore + paren -= 1 + if paren == 0: + # By far the most common situation! + break + parts.append(m[0]) + elif m.lastgroup == "parenleft": # type: ignore + parts.append(m[0]) + paren += 1 + elif m.lastgroup == "escape": # type: ignore + chr = m[0][1:2] + if chr not in ESC_STRING: + log.warning("Unrecognized escape %r", m[0]) + parts.append(chr) + else: + parts.append(bytes((ESC_STRING[chr],))) + elif m.lastgroup == "octal": # type: ignore + chrcode = int(m[0][1:], 8) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be + # ignored." + log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) + else: + parts.append(bytes((chrcode,))) + elif m.lastgroup == "newline": # type: ignore + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts.append(b"\n") + elif m.lastgroup == "linebreak": # type: ignore + pass + else: + parts.append(m[0]) + if paren != 0: + log.warning("Unterminated string at %d", pos) + raise StopIteration + return (self._curtokenpos, b"".join(parts)) + + +# Stack slots may by occupied by any of: +# * the name of a literal +# * the PSBaseParserToken types +# * list (via KEYWORD_ARRAY) +# * dict (via KEYWORD_DICT) +# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT +ExtraT = TypeVar("ExtraT") +PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT] +PSStackEntry = Tuple[int, PSStackType[ExtraT]] + + +class Parser(Generic[ExtraT]): + """Basic parser for PDF objects in a bytes-like object.""" + + def __init__(self, data: Union[bytes, mmap.mmap]) -> None: + self.reinit(data) + + def reinit(self, data: Union[bytes, mmap.mmap]) -> None: + """Reinitialize with new data (FIXME: Should go away, use a + new parser for each stream as it's clearer and safer)""" + self._lexer = Lexer(data) + self.reset() + + def reset(self) -> None: + """Reset parser state.""" + self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] + self.curtype: Optional[str] = None + self.curstack: List[PSStackEntry[ExtraT]] = [] + self.results: List[PSStackEntry[ExtraT]] = [] + + def push(self, *objs: PSStackEntry[ExtraT]) -> None: + """Push some objects onto the stack.""" + self.curstack.extend(objs) + + def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: + """Pop some objects off the stack.""" + objs = self.curstack[-n:] + self.curstack[-n:] = [] + return objs + + def popall(self) -> List[PSStackEntry[ExtraT]]: + """Pop all the things off the stack.""" + objs = self.curstack + self.curstack = [] + return objs + + def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: + """Move some objects to the output.""" + try: + log.debug("add_results: %r", objs) + except Exception: + log.debug("add_results: (unprintable object)") + self.results.extend(objs) + + def start_type(self, pos: int, type: str) -> None: + """Start a composite object (array, dict, etc).""" + self.context.append((pos, self.curtype, self.curstack)) + (self.curtype, self.curstack) = (type, []) + log.debug("start_type: pos=%r, type=%r", pos, type) + + def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: + """End a composite object (array, dict, etc).""" + if self.curtype != type: + raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") + objs = [obj for (_, obj) in self.curstack] + (pos, self.curtype, self.curstack) = self.context.pop() + log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs) + return (pos, objs) + + def do_keyword(self, pos: int, token: PSKeyword) -> None: + """Handle a PDF keyword.""" + pass + + def flush(self) -> None: + """Add objects from stack to output (or, actually, not).""" + return + + def __next__(self) -> PSStackEntry[ExtraT]: + """Return the next object, raising StopIteration at EOF. + + Arrays and dictionaries are represented as Python lists and + dictionaries. + """ + while not self.results: + (pos, token) = self.nexttoken() + if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): + # normal token + self.push((pos, token)) + elif token == KEYWORD_ARRAY_BEGIN: + # begin array + self.start_type(pos, "a") + elif token == KEYWORD_ARRAY_END: + # end array + try: + self.push(self.end_type("a")) + except PSTypeError: + if settings.STRICT: + raise + elif token == KEYWORD_DICT_BEGIN: + # begin dictionary + self.start_type(pos, "d") + elif token == KEYWORD_DICT_END: + # end dictionary + try: + (pos, objs) = self.end_type("d") + if len(objs) % 2 != 0: + error_msg = "Invalid dictionary construct: %r" % objs + raise PSSyntaxError(error_msg) + d = { + literal_name(k): v + for (k, v) in choplist(2, objs) + if v is not None + } + self.push((pos, d)) + except PSTypeError: + if settings.STRICT: + raise + elif token == KEYWORD_PROC_BEGIN: + # begin proc + self.start_type(pos, "p") + elif token == KEYWORD_PROC_END: + # end proc + try: + self.push(self.end_type("p")) + except PSTypeError: + if settings.STRICT: + raise + elif isinstance(token, PSKeyword): + log.debug( + "do_keyword: pos=%r, token=%r, stack=%r", + pos, + token, + self.curstack, + ) + self.do_keyword(pos, token) + else: + log.error( + "unknown token: pos=%r, token=%r, stack=%r", + pos, + token, + self.curstack, + ) + self.do_keyword(pos, token) + raise PSException + if self.context: + continue + else: + self.flush() + pos, obj = self.results.pop(0) + try: + log.debug("__next__: object at %d: %r", pos, obj) + except Exception: + log.debug("__next__: (unprintable object) at %d", pos) + return pos, obj + + def __iter__(self) -> Iterator[PSStackEntry[ExtraT]]: + """Iterate over objects, raising StopIteration at EOF.""" + return self + + # Delegation follows + def seek(self, pos: int) -> None: + """Seek to a position and reset parser state.""" + self._lexer.seek(pos) + self.reset() + + def tell(self) -> int: + """Get the current position in the file.""" + return self._lexer.tell() + + @property + def end(self) -> int: + """End (or size) of file, for use with seek().""" + return self._lexer.end + + def iter_lines(self) -> Iterator[Tuple[int, bytes]]: + r"""Iterate over lines that end either with \r, \n, or \r\n.""" + return self._lexer.iter_lines() + + def reverse_iter_lines(self) -> Iterator[bytes]: + """Iterate over lines starting at the end of the file + + This is used to locate the trailers at the end of a file. + """ + return self._lexer.reverse_iter_lines() + + def read(self, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + return self._lexer.read(objlen) + + def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker.""" + return self._lexer.get_inline_data(target) + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + return next(self._lexer) # PDFParser stack holds all the base types plus ContentStream, ObjRef, and None diff --git a/playa/pdftypes.py b/playa/pdftypes.py index f703ee80..f52d9b42 100644 --- a/playa/pdftypes.py +++ b/playa/pdftypes.py @@ -6,11 +6,14 @@ TYPE_CHECKING, Any, Dict, + Generic, Iterable, List, Optional, Protocol, Tuple, + Type, + TypeVar, Union, cast, ) @@ -23,9 +26,9 @@ PDFNotImplementedError, PDFTypeError, PDFValueError, + PSTypeError, ) from playa.lzw import lzwdecode -from playa.psparser import LIT from playa.runlength import rldecode from playa.utils import apply_png_predictor @@ -34,8 +37,74 @@ logger = logging.getLogger(__name__) -LITERAL_CRYPT = LIT("Crypt") +class PSLiteral: + """A class that represents a PostScript literal. + + Postscript literals are used as identifiers, such as + variable names, property names and dictionary keys. + Literals are case sensitive and denoted by a preceding + slash sign (e.g. "/Name") + + Note: Do not create an instance of PSLiteral directly. + Always use PSLiteralTable.intern(). + """ + + def __init__(self, name: str) -> None: + self.name = name + + def __repr__(self) -> str: + return "/%r" % self.name + + +class PSKeyword: + """A class that represents a PostScript keyword. + + PostScript keywords are a dozen of predefined words. + Commands and directives in PostScript are expressed by keywords. + They are also used to denote the content boundaries. + + Note: Do not create an instance of PSKeyword directly. + Always use PSKeywordTable.intern(). + """ + + def __init__(self, name: bytes) -> None: + self.name = name + + def __repr__(self) -> str: + return "/%r" % self.name + + +_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword) +_NameT = TypeVar("_NameT", str, bytes) + + +class PSSymbolTable(Generic[_SymbolT, _NameT]): + """Store globally unique name objects or language keywords.""" + + def __init__(self, table_type: Type[_SymbolT], name_type: Type[_NameT]) -> None: + self.dict: Dict[_NameT, _SymbolT] = {} + self.table_type: Type[_SymbolT] = table_type + self.name_type: Type[_NameT] = name_type + + def intern(self, name: _NameT) -> _SymbolT: + if not isinstance(name, self.name_type): + raise ValueError(f"{self.table_type} can only store {self.name_type}") + if name in self.dict: + lit = self.dict[name] + else: + lit = self.table_type(name) # type: ignore + self.dict[name] = lit + return lit + + +PSLiteralTable = PSSymbolTable(PSLiteral, str) +PSKeywordTable = PSSymbolTable(PSKeyword, bytes) +LIT = PSLiteralTable.intern +KWD = PSKeywordTable.intern + +# Intern a bunch of important literals +LITERAL_CRYPT = LIT("Crypt") # Abbreviation of Filter names in PDF 4.8.6. "Inline Images" LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl")) LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW")) @@ -48,6 +117,51 @@ LITERALS_JPX_DECODE = (LIT("JPXDecode"),) +def name_str(x: bytes) -> str: + """Get the string representation for a name object. + + According to the PDF 1.7 spec (p.18): + + > Ordinarily, the bytes making up the name are never treated as + > text to be presented to a human user or to an application + > external to a conforming reader. However, occasionally the need + > arises to treat a name object as text... In such situations, the + > sequence of bytes (after expansion of NUMBER SIGN sequences, if + > any) should be interpreted according to UTF-8. + + Accordingly, if they *can* be decoded to UTF-8, then they *will* + be, and if not, we will just decode them as ISO-8859-1 since that + gives a unique (if possibly nonsensical) value for an 8-bit string. + """ + try: + return x.decode("utf-8") + except UnicodeDecodeError: + return x.decode("iso-8859-1") + + +def literal_name(x: Any) -> str: + if not isinstance(x, PSLiteral): + if settings.STRICT: + raise PSTypeError(f"Literal required: {x!r}") + return str(x) + else: + return x.name + + +def keyword_name(x: Any) -> str: + if not isinstance(x, PSKeyword): + if settings.STRICT: + raise PSTypeError("Keyword required: %r" % x) + else: + return str(x) + else: + # PDF keywords are *not* UTF-8 (they aren't ISO-8859-1 either, + # but this isn't very important, we just want some + # unique representation of 8-bit characters, as above) + name = x.name.decode("iso-8859-1") + return name + + class DecipherCallable(Protocol): """Fully typed a decipher callback, with optional parameter.""" diff --git a/playa/psparser.py b/playa/psparser.py deleted file mode 100755 index 103ceee9..00000000 --- a/playa/psparser.py +++ /dev/null @@ -1,574 +0,0 @@ -#!/usr/bin/env python3 -import logging -import mmap -import re -from binascii import unhexlify -from collections import deque -from typing import ( - Any, - Deque, - Dict, - Generic, - Iterator, - List, - Optional, - Tuple, - Type, - TypeVar, - Union, -) - -from playa import settings -from playa.exceptions import ( - PSException, - PSSyntaxError, - PSTypeError, -) -from playa.utils import choplist - -log = logging.getLogger(__name__) - - -class PSLiteral: - """A class that represents a PostScript literal. - - Postscript literals are used as identifiers, such as - variable names, property names and dictionary keys. - Literals are case sensitive and denoted by a preceding - slash sign (e.g. "/Name") - - Note: Do not create an instance of PSLiteral directly. - Always use PSLiteralTable.intern(). - """ - - def __init__(self, name: str) -> None: - self.name = name - - def __repr__(self) -> str: - return "/%r" % self.name - - -class PSKeyword: - """A class that represents a PostScript keyword. - - PostScript keywords are a dozen of predefined words. - Commands and directives in PostScript are expressed by keywords. - They are also used to denote the content boundaries. - - Note: Do not create an instance of PSKeyword directly. - Always use PSKeywordTable.intern(). - """ - - def __init__(self, name: bytes) -> None: - self.name = name - - def __repr__(self) -> str: - return "/%r" % self.name - - -_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword) -_NameT = TypeVar("_NameT", str, bytes) - - -class PSSymbolTable(Generic[_SymbolT, _NameT]): - """Store globally unique name objects or language keywords.""" - - def __init__(self, table_type: Type[_SymbolT], name_type: Type[_NameT]) -> None: - self.dict: Dict[_NameT, _SymbolT] = {} - self.table_type: Type[_SymbolT] = table_type - self.name_type: Type[_NameT] = name_type - - def intern(self, name: _NameT) -> _SymbolT: - if not isinstance(name, self.name_type): - raise ValueError(f"{self.table_type} can only store {self.name_type}") - if name in self.dict: - lit = self.dict[name] - else: - lit = self.table_type(name) # type: ignore - self.dict[name] = lit - return lit - - -PSLiteralTable = PSSymbolTable(PSLiteral, str) -PSKeywordTable = PSSymbolTable(PSKeyword, bytes) -LIT = PSLiteralTable.intern -KWD = PSKeywordTable.intern -KEYWORD_PROC_BEGIN = KWD(b"{") -KEYWORD_PROC_END = KWD(b"}") -KEYWORD_ARRAY_BEGIN = KWD(b"[") -KEYWORD_ARRAY_END = KWD(b"]") -KEYWORD_DICT_BEGIN = KWD(b"<<") -KEYWORD_DICT_END = KWD(b">>") -KEYWORD_GT = KWD(b">") - - -def name_str(x: bytes) -> str: - """Get the string representation for a name object. - - According to the PDF 1.7 spec (p.18): - - > Ordinarily, the bytes making up the name are never treated as - > text to be presented to a human user or to an application - > external to a conforming reader. However, occasionally the need - > arises to treat a name object as text... In such situations, the - > sequence of bytes (after expansion of NUMBER SIGN sequences, if - > any) should be interpreted according to UTF-8. - - Accordingly, if they *can* be decoded to UTF-8, then they *will* - be, and if not, we will just decode them as ISO-8859-1 since that - gives a unique (if possibly nonsensical) value for an 8-bit string. - """ - try: - return x.decode("utf-8") - except UnicodeDecodeError: - return x.decode("iso-8859-1") - - -def literal_name(x: Any) -> str: - if not isinstance(x, PSLiteral): - if settings.STRICT: - raise PSTypeError(f"Literal required: {x!r}") - return str(x) - else: - return x.name - - -def keyword_name(x: Any) -> str: - if not isinstance(x, PSKeyword): - if settings.STRICT: - raise PSTypeError("Keyword required: %r" % x) - else: - return str(x) - else: - # PDF keywords are *not* UTF-8 (they aren't ISO-8859-1 either, - # but this isn't very important, we just want some - # unique representation of 8-bit characters, as above) - name = x.name.decode("iso-8859-1") - return name - - -EOL = b"\r\n" -WHITESPACE = b" \t\n\r\f\v" -NUMBER = b"0123456789" -HEX = NUMBER + b"abcdef" + b"ABCDEF" -NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE -NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE -NOTSTRING = b"()\\" -OCTAL = b"01234567" -ESC_STRING = { - b"b": 8, - b"t": 9, - b"n": 10, - b"f": 12, - b"r": 13, - b"(": 40, - b")": 41, - b"\\": 92, -} - - -PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] -LEXER = re.compile( - rb"""(?: - (?P \s+) - | (?P %[^\r\n]*[\r\n]) - | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) - | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) - | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) - | (?P \([^()\\]*) - | (?P <[A-Fa-f\d\s]*>) - | (?P <<) - | (?P >>) - | (?P .) -) -""", - re.VERBOSE, -) -STRLEXER = re.compile( - rb"""(?: - (?P \\[0-7]{1,3}) - | (?P \\(?:\r\n?|\n)) - | (?P \\.) - | (?P \() - | (?P \)) - | (?P \r\n?|\n) - | (?P .) -)""", - re.VERBOSE, -) -HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") -EOLR = re.compile(rb"\r\n?|\n") -SPC = re.compile(rb"\s") - - -class Lexer: - """Lexer for PDF data.""" - - def __init__(self, data: Union[bytes, mmap.mmap]) -> None: - self.data = data - self.pos = 0 - self.end = len(data) - self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() - - def seek(self, pos: int) -> None: - """Seek to a position and reinitialize parser state.""" - self.pos = pos - self._curtoken = b"" - self._curtokenpos = 0 - self._tokens.clear() - - def tell(self) -> int: - """Get the current position in the buffer.""" - return self.pos - - def read(self, objlen: int) -> bytes: - """Read data from current position, advancing to the end of - this data.""" - pos = self.pos - self.pos = min(pos + objlen, len(self.data)) - return self.data[pos : self.pos] - - def iter_lines(self) -> Iterator[Tuple[int, bytes]]: - r"""Iterate over lines that end either with \r, \n, or \r\n, - starting at the current position.""" - while self.pos < self.end: - linepos = self.pos - m = EOLR.search(self.data, self.pos) - if m is None: - self.pos = self.end - else: - self.pos = m.end() - yield (linepos, self.data[linepos : self.pos]) - - def reverse_iter_lines(self) -> Iterator[bytes]: - """Iterate backwards over lines starting at the current position. - - This is used to locate the trailers at the end of a file. - """ - endline = self.pos - while True: - nidx = self.data.rfind(b"\n", 0, self.pos) - ridx = self.data.rfind(b"\r", 0, self.pos) - best = max(nidx, ridx) - if best == -1: - yield self.data[:endline] - break - yield self.data[best + 1 : endline] - endline = best + 1 - self.pos = best - if self.pos > 0 and self.data[self.pos - 1 : self.pos + 1] == b"\r\n": - self.pos -= 1 - - def get_inline_data( - self, target: bytes = b"EI", blocksize: int = -1 - ) -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker. - - Returns a tuple of the position of the target in the data and the - data *including* the end of stream marker. Advances the file - pointer to a position after the end of the stream. - - The caller is responsible for removing the end-of-stream if - necessary (this depends on the filter being used) and parsing - the end-of-stream token (likewise) if necessary. - """ - tpos = self.data.find(target, self.pos) - if tpos != -1: - nextpos = tpos + len(target) - result = (tpos, self.data[self.pos : nextpos]) - self.pos = nextpos - return result - return (-1, b"") - - def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: - """Iterate over tokens.""" - return self - - def __next__(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising StopIteration when - done.""" - while True: - m = LEXER.match(self.data, self.pos) - if m is None: # can only happen at EOS - raise StopIteration - self._curtokenpos = m.start() - self.pos = m.end() - if m.lastgroup not in ("whitespace", "comment"): # type: ignore - # Okay, we got a token or something - break - self._curtoken = m[0] - if m.lastgroup == "name": # type: ignore - self._curtoken = m[0][1:] - self._curtoken = HEXDIGIT.sub( - lambda x: bytes((int(x[1], 16),)), self._curtoken - ) - tok = LIT(name_str(self._curtoken)) - return (self._curtokenpos, tok) - if m.lastgroup == "number": # type: ignore - if b"." in self._curtoken: - return (self._curtokenpos, float(self._curtoken)) - else: - return (self._curtokenpos, int(self._curtoken)) - if m.lastgroup == "startdict": # type: ignore - return (self._curtokenpos, KEYWORD_DICT_BEGIN) - if m.lastgroup == "enddict": # type: ignore - return (self._curtokenpos, KEYWORD_DICT_END) - if m.lastgroup == "startstr": # type: ignore - return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end()) - if m.lastgroup == "hexstr": # type: ignore - self._curtoken = SPC.sub(b"", self._curtoken[1:-1]) - if len(self._curtoken) % 2 == 1: - self._curtoken += b"0" - return (self._curtokenpos, unhexlify(self._curtoken)) - # Anything else is treated as a keyword (whether explicitly matched or not) - if self._curtoken == b"true": - return (self._curtokenpos, True) - elif self._curtoken == b"false": - return (self._curtokenpos, False) - else: - return (self._curtokenpos, KWD(self._curtoken)) - - def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: - """Parse the remainder of a string.""" - # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) - parts = [EOLR.sub(b"\n", start)] - paren = 1 - for m in STRLEXER.finditer(self.data, pos): - self.pos = m.end() - if m.lastgroup == "parenright": # type: ignore - paren -= 1 - if paren == 0: - # By far the most common situation! - break - parts.append(m[0]) - elif m.lastgroup == "parenleft": # type: ignore - parts.append(m[0]) - paren += 1 - elif m.lastgroup == "escape": # type: ignore - chr = m[0][1:2] - if chr not in ESC_STRING: - log.warning("Unrecognized escape %r", m[0]) - parts.append(chr) - else: - parts.append(bytes((ESC_STRING[chr],))) - elif m.lastgroup == "octal": # type: ignore - chrcode = int(m[0][1:], 8) - if chrcode >= 256: - # PDF1.7 p.16: "high-order overflow shall be - # ignored." - log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) - else: - parts.append(bytes((chrcode,))) - elif m.lastgroup == "newline": # type: ignore - # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) - parts.append(b"\n") - elif m.lastgroup == "linebreak": # type: ignore - pass - else: - parts.append(m[0]) - if paren != 0: - log.warning("Unterminated string at %d", pos) - raise StopIteration - return (self._curtokenpos, b"".join(parts)) - - -# Stack slots may by occupied by any of: -# * the name of a literal -# * the PSBaseParserToken types -# * list (via KEYWORD_ARRAY) -# * dict (via KEYWORD_DICT) -# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT -ExtraT = TypeVar("ExtraT") -PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT] -PSStackEntry = Tuple[int, PSStackType[ExtraT]] - - -class Parser(Generic[ExtraT]): - """Basic parser for PDF objects in a bytes-like object.""" - - def __init__(self, data: Union[bytes, mmap.mmap]) -> None: - self.reinit(data) - - def reinit(self, data: Union[bytes, mmap.mmap]) -> None: - """Reinitialize with new data (FIXME: Should go away, use a - new parser for each stream as it's clearer and safer)""" - self._lexer = Lexer(data) - self.reset() - - def reset(self) -> None: - """Reset parser state.""" - self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] - self.curtype: Optional[str] = None - self.curstack: List[PSStackEntry[ExtraT]] = [] - self.results: List[PSStackEntry[ExtraT]] = [] - - def push(self, *objs: PSStackEntry[ExtraT]) -> None: - """Push some objects onto the stack.""" - self.curstack.extend(objs) - - def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: - """Pop some objects off the stack.""" - objs = self.curstack[-n:] - self.curstack[-n:] = [] - return objs - - def popall(self) -> List[PSStackEntry[ExtraT]]: - """Pop all the things off the stack.""" - objs = self.curstack - self.curstack = [] - return objs - - def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: - """Move some objects to the output.""" - try: - log.debug("add_results: %r", objs) - except Exception: - log.debug("add_results: (unprintable object)") - self.results.extend(objs) - - def start_type(self, pos: int, type: str) -> None: - """Start a composite object (array, dict, etc).""" - self.context.append((pos, self.curtype, self.curstack)) - (self.curtype, self.curstack) = (type, []) - log.debug("start_type: pos=%r, type=%r", pos, type) - - def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: - """End a composite object (array, dict, etc).""" - if self.curtype != type: - raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") - objs = [obj for (_, obj) in self.curstack] - (pos, self.curtype, self.curstack) = self.context.pop() - log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs) - return (pos, objs) - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - """Handle a PDF keyword.""" - pass - - def flush(self) -> None: - """Add objects from stack to output (or, actually, not).""" - return - - def __next__(self) -> PSStackEntry[ExtraT]: - """Return the next object, raising StopIteration at EOF. - - Arrays and dictionaries are represented as Python lists and - dictionaries. - """ - while not self.results: - (pos, token) = self.nexttoken() - if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): - # normal token - self.push((pos, token)) - elif token == KEYWORD_ARRAY_BEGIN: - # begin array - self.start_type(pos, "a") - elif token == KEYWORD_ARRAY_END: - # end array - try: - self.push(self.end_type("a")) - except PSTypeError: - if settings.STRICT: - raise - elif token == KEYWORD_DICT_BEGIN: - # begin dictionary - self.start_type(pos, "d") - elif token == KEYWORD_DICT_END: - # end dictionary - try: - (pos, objs) = self.end_type("d") - if len(objs) % 2 != 0: - error_msg = "Invalid dictionary construct: %r" % objs - raise PSSyntaxError(error_msg) - d = { - literal_name(k): v - for (k, v) in choplist(2, objs) - if v is not None - } - self.push((pos, d)) - except PSTypeError: - if settings.STRICT: - raise - elif token == KEYWORD_PROC_BEGIN: - # begin proc - self.start_type(pos, "p") - elif token == KEYWORD_PROC_END: - # end proc - try: - self.push(self.end_type("p")) - except PSTypeError: - if settings.STRICT: - raise - elif isinstance(token, PSKeyword): - log.debug( - "do_keyword: pos=%r, token=%r, stack=%r", - pos, - token, - self.curstack, - ) - self.do_keyword(pos, token) - else: - log.error( - "unknown token: pos=%r, token=%r, stack=%r", - pos, - token, - self.curstack, - ) - self.do_keyword(pos, token) - raise PSException - if self.context: - continue - else: - self.flush() - pos, obj = self.results.pop(0) - try: - log.debug("__next__: object at %d: %r", pos, obj) - except Exception: - log.debug("__next__: (unprintable object) at %d", pos) - return pos, obj - - def __iter__(self) -> Iterator[PSStackEntry[ExtraT]]: - """Iterate over objects, raising StopIteration at EOF.""" - return self - - # Delegation follows - def seek(self, pos: int) -> None: - """Seek to a position and reset parser state.""" - self._lexer.seek(pos) - self.reset() - - def tell(self) -> int: - """Get the current position in the file.""" - return self._lexer.tell() - - @property - def end(self) -> int: - """End (or size) of file, for use with seek().""" - return self._lexer.end - - def iter_lines(self) -> Iterator[Tuple[int, bytes]]: - r"""Iterate over lines that end either with \r, \n, or \r\n.""" - return self._lexer.iter_lines() - - def reverse_iter_lines(self) -> Iterator[bytes]: - """Iterate over lines starting at the end of the file - - This is used to locate the trailers at the end of a file. - """ - return self._lexer.reverse_iter_lines() - - def read(self, objlen: int) -> bytes: - """Read data from a specified position, moving the current - position to the end of this data.""" - return self._lexer.read(objlen) - - def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker.""" - return self._lexer.get_inline_data(target) - - def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising StopIteration when - done.""" - return next(self._lexer) diff --git a/playa/structtree.py b/playa/structtree.py index 504103e9..016f0eff 100644 --- a/playa/structtree.py +++ b/playa/structtree.py @@ -18,9 +18,8 @@ from playa.data_structures import NumberTree from playa.exceptions import PDFNoStructTree from playa.page import PDFPage -from playa.parser import KEYWORD_NULL +from playa.parser import KEYWORD_NULL, PSLiteral from playa.pdftypes import ObjRef, resolve1 -from playa.psparser import PSLiteral from playa.utils import decode_text logger = logging.getLogger(__name__) diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index 044fe584..f67385d3 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -272,7 +272,7 @@ def bench_bytes(): - from playa.psparser import Lexer + from playa.parser import Lexer runs = 100 start = time.time() @@ -286,7 +286,7 @@ def bench_bytes(): def bench_mmap(): import mmap - from playa.psparser import Lexer + from playa.parser import Lexer with tempfile.NamedTemporaryFile() as tf: runs = 100 diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index 646221be..7dd1e2f3 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -10,8 +10,8 @@ import playa import playa.settings from playa.data_structures import NameTree -from playa.exceptions import PDFSyntaxError from playa.document import read_header +from playa.exceptions import PDFSyntaxError from playa.utils import decode_text playa.settings.STRICT = True diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index bca21c37..e603fa05 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -1,11 +1,6 @@ -import logging -from typing import Any, List, Tuple from pathlib import Path -import pytest - -from playa.parser import PDFParser, ContentStream, ContentStreamParser - +from playa.parser import PDFParser TESTDIR = Path(__file__).parent.parent / "samples" @@ -13,6 +8,7 @@ class MockDoc: def __call__(self): return self + decipher = None diff --git a/tests/test_psparser.py b/tests/test_psparser.py index 6b5d3a2f..ce05d9f1 100644 --- a/tests/test_psparser.py +++ b/tests/test_psparser.py @@ -3,13 +3,15 @@ import pytest -from playa.psparser import ( +from playa.parser import ( KEYWORD_DICT_BEGIN, KEYWORD_DICT_END, - KWD, - LIT, Lexer, Parser, +) +from playa.pdftypes import ( + KWD, + LIT, keyword_name, literal_name, )