From 37f3b2e183091ebaddceb4dd051bbc98c022c70d Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 18 Sep 2024 22:30:25 -0400 Subject: [PATCH] Implement in-memory parsing (#6) * refactor: put get_inline_data where it can be overridden * fix: handle case where eos not found * feat: in-memory lexer which is Very Fast * test: fully test both parsers * fix: make it work again * fix: eliminate inheritance * feat: accept bytes directly * chore: format * fix: simplify and fix PDFContentParser * feat: implement switch between file/memory parsers --- playa/cmapdb.py | 5 +- playa/pdffont.py | 10 +- playa/pdfinterp.py | 122 +++++--------- playa/pdfparser.py | 10 +- playa/psparser.py | 334 ++++++++++++++++++++++++++++++++++++-- tests/benchmark_parser.py | 41 ++++- tests/test_open.py | 12 +- tests/test_pdfparser.py | 246 ++++++++++++++++------------ 8 files changed, 569 insertions(+), 211 deletions(-) diff --git a/playa/cmapdb.py b/playa/cmapdb.py index 43e7708f..bde0dde3 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -18,7 +18,6 @@ import sys from typing import ( Any, - BinaryIO, Dict, Iterable, Iterator, @@ -277,8 +276,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, cmap: CMapBase, data: bytes) -> None: + super().__init__(data) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. self._in_cmap = True diff --git a/playa/pdffont.py b/playa/pdffont.py index 2399e129..5ad3b329 100644 --- a/playa/pdffont.py +++ b/playa/pdffont.py @@ -114,8 +114,8 @@ class Type1FontHeaderParser(PSStackParser[int]): KEYWORD_READONLY = KWD(b"readonly") KEYWORD_FOR = KWD(b"for") - def __init__(self, data: BinaryIO) -> None: - PSStackParser.__init__(self, data) + def __init__(self, data: bytes) -> None: + super().__init__(data) self._cid2unicode: Dict[int, str] = {} def get_encoding(self) -> Dict[int, str]: @@ -968,7 +968,7 @@ def __init__( if "ToUnicode" in spec: strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() PDFFont.__init__(self, descriptor, widths) def to_unichr(self, cid: int) -> str: @@ -1008,7 +1008,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No self.fontfile = stream_value(descriptor.get("FontFile")) length1 = int_value(self.fontfile["Length1"]) data = self.fontfile.get_data()[:length1] - parser = Type1FontHeaderParser(BytesIO(data)) + parser = Type1FontHeaderParser(data) self.cid2unicode = parser.get_encoding() def __repr__(self) -> str: @@ -1079,7 +1079,7 @@ def __init__( if isinstance(spec["ToUnicode"], PDFStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() else: cmap_name = literal_name(spec["ToUnicode"]) encoding = literal_name(spec["Encoding"]) diff --git a/playa/pdfinterp.py b/playa/pdfinterp.py index 471092b7..52b19932 100644 --- a/playa/pdfinterp.py +++ b/playa/pdfinterp.py @@ -1,12 +1,16 @@ -import io import logging -from io import BytesIO -from typing import BinaryIO, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast +from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast from playa import settings from playa.casting import 
safe_float from playa.cmapdb import CMap, CMapBase, CMapDB -from playa.exceptions import PSEOF, PDFException, PDFValueError, PSTypeError +from playa.exceptions import ( + PSEOF, + PDFException, + PDFSyntaxError, + PDFValueError, + PSTypeError, +) from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace from playa.pdfdevice import PDFDevice, PDFTextSeq from playa.pdffont import ( @@ -30,6 +34,7 @@ from playa.psparser import ( KWD, LIT, + PSBaseParserToken, PSKeyword, PSLiteral, PSStackParser, @@ -252,85 +257,38 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: KEYWORD_EI = KWD(b"EI") -def get_inline_data( - fp: BinaryIO, target: bytes = b"EI", blocksize: int = 4096 -) -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker. - - Returns a tuple of the position of the target in the data and the - data *including* the end of stream marker. Advances the file - pointer to a position after the end of the stream. - - The caller is responsible for removing the end-of-stream if - necessary (this depends on the filter being used) and parsing - the end-of-stream token (likewise) if necessary. +class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): + """Parse the concatenation of multiple content streams, as + described in the spec (PDF 1.7, p.86): + + ...the effect shall be as if all of the streams in the array were + concatenated, in order, to form a single stream. Conforming + writers can create image objects and other resources as they + occur, even though they interrupt the content stream. The division + between streams may occur only at the boundaries between lexical + tokens (see 7.2, "Lexical Conventions") but shall be unrelated to + the page’s logical content or organization. """ - # PDF 1.7, p. 216: The bytes between the ID and EI operators - # shall be treated the same as a stream object’s data (see - # 7.3.8, "Stream Objects"), even though they do not follow the - # standard stream syntax. - data = [] # list of blocks - partial = b"" # partially seen target - pos = 0 - while True: - # Did we see part of the target at the end of the last - # block? Then scan ahead and try to find the rest (we - # assume the stream is buffered) - if partial: - extra_len = len(target) - len(partial) - extra = fp.read(extra_len) - if partial + extra == target: - pos -= len(partial) - data.append(extra) - break - # Put it back (assume buffering!) - fp.seek(-extra_len, io.SEEK_CUR) - partial = b"" - # Fall through (the target could be at the beginning) - buf = fp.read(blocksize) - tpos = buf.find(target) - if tpos != -1: - data.append(buf[: tpos + len(target)]) - # Put the extra back (assume buffering!) - fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) - pos += tpos - break - else: - pos += len(buf) - # look for the longest partial match at the end - plen = len(target) - 1 - while plen > 0: - ppos = len(buf) - plen - if buf[ppos:] == target[:plen]: - partial = buf[ppos:] - break - plen -= 1 - data.append(buf) - return (pos, b"".join(data)) - -class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): def __init__(self, streams: Sequence[object]) -> None: - self.streams = streams - self.istream = 0 - # PSStackParser.__init__(fp=None) is safe only because we've overloaded - # all the methods that would attempt to access self.fp without first - # calling self.fillfp(). 
- PSStackParser.__init__(self, None) # type: ignore[arg-type] - - def fillfp(self) -> None: - if not self.fp: - if self.istream < len(self.streams): - strm = stream_value(self.streams[self.istream]) - self.istream += 1 - else: - raise PSEOF("Unexpected EOF, file truncated?") - self.fp = BytesIO(strm.get_data()) + self.streamiter = iter(streams) + try: + stream = stream_value(next(self.streamiter)) + except StopIteration: + raise PSEOF + log.debug("PDFContentParser starting stream %r", stream) + super().__init__(stream.get_data()) - def seek(self, pos: int) -> None: - self.fillfp() - super().seek(pos) + def __next__(self) -> Tuple[int, PSBaseParserToken]: + while True: + try: + return super().__next__() + except StopIteration: + # Will also raise StopIteration if there are no more, + # which is exactly what we want + stream = stream_value(next(self.streamiter)) + log.debug("PDFContentParser starting stream %r", stream) + self.reinit(stream.get_data()) def flush(self) -> None: self.add_results(*self.popall()) @@ -360,7 +318,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: # interpreted as the first byte of image data. if eos == b"EI": self.seek(pos + len(token.name) + 1) - (pos, data) = get_inline_data(self.fp, target=eos) + (pos, data) = self.get_inline_data(target=eos) # FIXME: it is totally unspecified what to do with # a newline between the end of the data and "EI", # since there is no explicit stream length. (PDF @@ -371,7 +329,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: data = data[: -len(eos)] else: self.seek(pos + len(token.name)) - (pos, data) = get_inline_data(self.fp, target=eos) + (pos, data) = self.get_inline_data(target=eos) + if pos == -1: + raise PDFSyntaxError("End of inline stream %r not found" % eos) obj = PDFStream(d, data) self.push((pos, obj)) # This was included in the data but we need to "parse" it diff --git a/playa/pdfparser.py b/playa/pdfparser.py index 3603c79f..d5057238 100644 --- a/playa/pdfparser.py +++ b/playa/pdfparser.py @@ -1,5 +1,4 @@ import logging -from io import BytesIO from typing import TYPE_CHECKING, BinaryIO, Optional, Union from playa import settings @@ -40,8 +39,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """ - def __init__(self, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, data: Union[BinaryIO, bytes]) -> None: + super().__init__(data) self.doc: Optional[PDFDocument] = None self.fallback = False @@ -89,8 +88,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: raise PDFSyntaxError("Unexpected EOF") return pos += len(line) - self.fp.seek(pos) - data = bytearray(self.fp.read(objlen)) + data = bytearray(self.read(pos, objlen)) self.seek(pos + objlen) while True: try: @@ -135,7 +133,7 @@ class PDFStreamParser(PDFParser): """ def __init__(self, data: bytes) -> None: - PDFParser.__init__(self, BytesIO(data)) + super().__init__(data) def flush(self) -> None: self.add_results(*self.popall()) diff --git a/playa/psparser.py b/playa/psparser.py index 1eb6d990..1c4e9431 100755 --- a/playa/psparser.py +++ b/playa/psparser.py @@ -139,7 +139,6 @@ def keyword_name(x: Any) -> Any: EOL = b"\r\n" -SPC = re.compile(rb"\s") WHITESPACE = b" \t\n\r\f\v" NUMBER = b"0123456789" HEX = NUMBER + b"abcdef" + b"ABCDEF" @@ -162,14 +161,19 @@ def keyword_name(x: Any) -> Any: PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] -class PSBaseParser: +class PSFileParser: + """ + Parser (actually a lexer) for PDF data from a buffered file object. 
+ """ + def __init__(self, fp: BinaryIO): self.fp = fp self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() self.seek(0) - def flush(self) -> None: - pass + def reinit(self, fp: BinaryIO): + self.fp = fp + self.seek(0) def seek(self, pos: int) -> None: self.fp.seek(pos) @@ -181,6 +185,10 @@ def seek(self, pos: int) -> None: def tell(self) -> int: return self.fp.tell() + def read(self, pos: int, objlen: int) -> bytes: + self.fp.seek(pos) + return self.fp.read(objlen) + def nextline(self) -> Tuple[int, bytes]: r"""Fetches a next line that ends either with \r, \n, or \r\n.""" linepos = self.fp.tell() @@ -227,6 +235,65 @@ def revreadlines(self) -> Iterator[bytes]: buf = c + buf yield buf + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = 4096 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + # PDF 1.7, p. 216: The bytes between the ID and EI operators + # shall be treated the same as a stream object’s data (see + # 7.3.8, "Stream Objects"), even though they do not follow the + # standard stream syntax. + data = [] # list of blocks + partial = b"" # partially seen target + pos = 0 + while True: + # Did we see part of the target at the end of the last + # block? Then scan ahead and try to find the rest (we + # assume the stream is buffered) + if partial: + extra_len = len(target) - len(partial) + extra = self.fp.read(extra_len) + if partial + extra == target: + pos -= len(partial) + data.append(extra) + break + # Put it back (assume buffering!) + self.fp.seek(-extra_len, io.SEEK_CUR) + partial = b"" + # Fall through (the target could be at the beginning) + buf = self.fp.read(blocksize) + if not buf: + return (-1, b"") + tpos = buf.find(target) + if tpos != -1: + data.append(buf[: tpos + len(target)]) + # Put the extra back (assume buffering!) + self.fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) + pos += tpos + break + else: + pos += len(buf) + # look for the longest partial match at the end + plen = len(target) - 1 + while plen > 0: + ppos = len(buf) - plen + if buf[ppos:] == target[:plen]: + partial = buf[ppos:] + break + plen -= 1 + data.append(buf) + return (pos, b"".join(data)) + def __iter__(self): return self @@ -458,7 +525,7 @@ def _parse_string_octal(self): chrcode = int(self.oct, 8) if chrcode >= 256: # PDF1.7 p.16: "high-order overflow shall be ignored." - log.warning("Invalid octal %s (%d)", repr(self.oct), chrcode) + log.warning("Invalid octal %r (%d)", self.oct, chrcode) else: self._curtoken += bytes((chrcode,)) # Back to normal string parsing @@ -509,6 +576,218 @@ def _parse_hexstring(self): return c +LEXER = re.compile( + rb"""(?: + (?P \s+) + | (?P %[^\r\n]*[\r\n]) + | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) + | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) + | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) + | (?P \([^()\\]*) + | (?P <[A-Fa-f\d\s]+>) + | (?P <<) + | (?P >>) + | (?P .) +) +""", + re.VERBOSE, +) +STRLEXER = re.compile( + rb"""(?: + (?P \\\d{1,3}) + | (?P \\(?:\r\n?|\n)) + | (?P \\.) + | (?P \() + | (?P \)) + | (?P .) 
+)""", + re.VERBOSE, +) +HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") +EOLR = re.compile(rb"\r\n?|\n") +SPC = re.compile(rb"\s") + + +class PSInMemoryParser: + """ + Parser for in-memory data streams. + """ + + def __init__(self, data: bytes): + self.data = data + self.pos = 0 + self.end = len(data) + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() + + def reinit(self, data: bytes): + self.data = data + self.seek(0) + + def seek(self, pos: int) -> None: + self.pos = pos + self._curtoken = b"" + self._curtokenpos = 0 + self._tokens.clear() + + def tell(self) -> int: + return self.pos + + def read(self, pos: int, objlen: int) -> bytes: + self.pos = max(pos + objlen, len(self.data)) + return self.data[pos : self.pos] + + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or \r\n.""" + if self.pos == self.end: + raise PSEOF + linepos = self.pos + m = EOLR.search(self.data, self.pos) + if m is None: + self.pos = self.end + else: + self.pos = m.end() + return (linepos, self.data[linepos : self.pos]) + + def revreadlines(self) -> Iterator[bytes]: + """Fetches a next line backwards. + + This is used to locate the trailers at the end of a file. So, + it isn't actually used in PSInMemoryParser, but is here for + completeness. + """ + endline = pos = self.end + while True: + nidx = self.data.rfind(ord(b"\n"), 0, pos) + ridx = self.data.rfind(ord(b"\r"), 0, pos) + best = max(nidx, ridx) + if best == -1: + yield self.data[:endline] + break + yield self.data[best + 1 : endline] + endline = best + 1 + pos = best + if pos > 0 and self.data[pos - 1 : pos + 1] == b"\r\n": + pos -= 1 + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = -1 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + tpos = self.data.find(target, self.pos) + if tpos != -1: + nextpos = tpos + len(target) + result = (tpos, self.data[self.pos : nextpos]) + self.pos = nextpos + return result + return (-1, b"") + + def __iter__(self): + return self + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Lexer (most of the work is done in regular expressions, but + PDF syntax is not entirely regular due to the use of balanced + parentheses in strings).""" + while True: + m = LEXER.match(self.data, self.pos) + if m is None: # can only happen at EOS + raise StopIteration + self._curtokenpos = m.start() + self.pos = m.end() + if m.lastgroup not in ("whitespace", "comment"): + # Okay, we got a token or something + break + self._curtoken = m[0] + if m.lastgroup == "name": + self._curtoken = m[0][1:] + self._curtoken = HEXDIGIT.sub( + lambda x: bytes((int(x[1], 16),)), self._curtoken + ) + try: + tok = LIT(self._curtoken.decode("utf-8")) + except UnicodeDecodeError: + tok = LIT(self._curtoken) + return (self._curtokenpos, tok) + if m.lastgroup == "number": + if b"." 
 # Stack slots may be occupied by any of:
 #  * the name of a literal
 #  * the PSBaseParserToken types
@@ -520,9 +799,17 @@
 PSStackEntry = Tuple[int, PSStackType[ExtraT]]
 
 
-class PSStackParser(PSBaseParser, Generic[ExtraT]):
-    def __init__(self, reader: BinaryIO) -> None:
-        PSBaseParser.__init__(self, reader)
+class PSStackParser(Generic[ExtraT]):
+    def __init__(self, reader: Union[BinaryIO, bytes]) -> None:
+        self.reinit(reader)
+
+    def reinit(self, reader: Union[BinaryIO, bytes]) -> None:
+        if isinstance(reader, bytes):
+            self._parser: Union[PSInMemoryParser, PSFileParser] = PSInMemoryParser(
+                reader
+            )
+        else:
+            self._parser = PSFileParser(reader)
         self.reset()
 
     def reset(self) -> None:
@@ -532,7 +819,7 @@ def reset(self) -> None:
         self.results: List[PSStackEntry[ExtraT]] = []
 
     def seek(self, pos: int) -> None:
-        PSBaseParser.seek(self, pos)
+        self._parser.seek(pos)
         self.reset()
 
     def push(self, *objs: PSStackEntry[ExtraT]) -> None:
@@ -571,6 +858,9 @@ def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
         pass
 
+    def flush(self) -> None:
+        pass
+
     def nextobject(self) -> PSStackEntry[ExtraT]:
         """Yields a list of objects.
 
         :return: keywords, literals, strings, numbers, arrays and dictionaries.
""" while not self.results: - (pos, token) = self.nexttoken() + (pos, token) = self._parser.nexttoken() if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): # normal token self.push((pos, token)) @@ -643,10 +933,32 @@ def nextobject(self) -> PSStackEntry[ExtraT]: if self.context: continue else: - self.flush() # FIXME: what does it do? + self.flush() # Does nothing here, but in subclasses... (ugh) obj = self.results.pop(0) try: log.debug("nextobject: %r", obj) except Exception: log.debug("nextobject: (unprintable object)") return obj + + # Delegation follows + def nextline(self) -> Tuple[int, bytes]: + return self._parser.nextline() + + def revreadlines(self) -> Iterator[bytes]: + return self._parser.revreadlines() + + def read(self, pos: int, objlen: int) -> bytes: + return self._parser.read(pos, objlen) + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + return self._parser.nexttoken() + + def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: + return self._parser.get_inline_data(target) + + def __iter__(self): + return self + + def __next__(self): + return next(self._parser) diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index a4ba03c0..cb84a20f 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -271,27 +271,62 @@ """ +def bench_bytes(): + from playa.psparser import PSInMemoryParser + + runs = 100 + start = time.time() + parser = PSInMemoryParser(DATA * runs) + _ = list(parser) + print( + "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), + ) + + +def bench_bytesio(): + from pdfminer.psparser import PSEOF, PSBaseParser + + runs = 100 + start = time.time() + parser = PSBaseParser(BytesIO(DATA * runs)) + while True: + try: + _ = parser.nexttoken() + except PSEOF: + break + print( + "pdfminer.six Parser (BytesIO): %fms / run" + % ((time.time() - start) / runs * 1000), + ) + + def bench_playa(): from playa.converter import PDFPageAggregator from playa.pdfdocument import PDFDocument from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager from playa.pdfpage import PDFPage - from playa.psparser import PSBaseParser + from playa.psparser import PSFileParser, PSInMemoryParser runs = 100 start = time.time() - parser = PSBaseParser(BytesIO(DATA * runs)) + parser = PSFileParser(BytesIO(DATA * runs)) _ = list(parser) print( "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), ) + start = time.time() + parser = PSInMemoryParser(DATA * runs) + _ = list(parser) + print( + "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), + ) with tempfile.NamedTemporaryFile() as tf: runs = 100 with open(tf.name, "wb") as outfh: outfh.write(DATA * runs) with open(tf.name, "rb") as infh: start = time.time() - parser = PSBaseParser(infh) + parser = PSFileParser(infh) _ = list(parser) print( "PLAYA Parser (BinaryIO): %fms / run" diff --git a/tests/test_open.py b/tests/test_open.py index df57513f..163e0c93 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -34,7 +34,6 @@ def test_open(path: Path): for password in passwords: with playa.open(TESTDIR / path, password=password) as pdf: pass - assert pdf.parser.fp.closed assert pdf.parser.doc is None @@ -51,5 +50,16 @@ def test_inline_data(): interp.process_page(page) +def test_multiple_contents(): + # See above... 
+ with playa.open(TESTDIR / "jo.pdf") as doc: + rsrc = PDFResourceManager() + agg = PDFPageAggregator(rsrc, pageno=1) + interp = PDFPageInterpreter(rsrc, agg) + page = next(PDFPage.create_pages(doc)) + assert len(page.contents) > 1 + interp.process_page(page) + + if __name__ == "__main__": test_open(TESTDIR / "simple5.pdf") diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index b4d9df97..109b5536 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -2,16 +2,16 @@ Test the PDF parser """ -from io import BytesIO +import tempfile from playa.exceptions import PSEOF -from playa.pdfinterp import get_inline_data from playa.psparser import ( KEYWORD_DICT_BEGIN, KEYWORD_DICT_END, KWD, LIT, - PSBaseParser, + PSFileParser, + PSInMemoryParser, ) TESTDATA = b""" @@ -31,24 +31,54 @@ ] -def test_nextline(): - """Verify that we replicate the old nextline method.""" - parser = PSBaseParser(BytesIO(TESTDATA)) - lines = [] +def run_parsers(data: bytes, expected: list, makefunc): + """Test stuff on both BytesIO and BinaryIO.""" + bp = PSInMemoryParser(data) + output = [] + func = makefunc(bp) while True: try: - linepos, line = parser.nextline() + output.append(func()) except PSEOF: break - lines.append((linepos, line)) - assert lines == EXPECTED + assert output == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + func = makefunc(fp) + output = [] + while True: + try: + output.append(func()) + except PSEOF: + break + assert output == expected + + +def test_nextline(): + """Verify that we replicate the old nextline method.""" + run_parsers(TESTDATA, EXPECTED, lambda foo: foo.nextline) def test_revreadlines(): """Verify that we replicate the old revreadlines method.""" - parser = PSBaseParser(BytesIO(TESTDATA)) - lines = list(parser.revreadlines()) - assert lines == list(reversed([line for pos, line in EXPECTED])) + expected = list(reversed([line for pos, line in EXPECTED])) + + def make_next(parser): + itor = parser.revreadlines() + + def nextor(): + try: + line = next(itor) + except StopIteration: + raise PSEOF + return line + + return nextor + + run_parsers(TESTDATA, expected, make_next) SIMPLE1 = b"""1 0 obj @@ -79,18 +109,33 @@ def test_revreadlines(): ] +def list_parsers(data: bytes, expected: list, discard_pos=False): + bp = PSInMemoryParser(data) + if discard_pos: + tokens = [tok for pos, tok in list(bp)] + else: + tokens = list(bp) + assert tokens == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + if discard_pos: + tokens = [tok for pos, tok in list(fp)] + else: + tokens = list(fp) + assert tokens == expected + + def test_new_parser(): # Do a lot of them to make sure buffering works correctly - parser = PSBaseParser(BytesIO(SIMPLE1 * 100)) - tokens = [tok for pos, tok in list(parser)] - assert tokens == SIMPLETOK * 100 + list_parsers(SIMPLE1 * 100, SIMPLETOK * 100, discard_pos=True) def test_new_parser_eof(): # Make sure we get a keyword at eof - parser = PSBaseParser(BytesIO(SIMPLE1[:-1])) - tokens = [tok for pos, tok in list(parser)] - assert tokens == SIMPLETOK + list_parsers(SIMPLE1[:-1], SIMPLETOK, discard_pos=True) PAGE17 = b""" @@ -101,105 +146,104 @@ def test_new_parser_eof(): def test_new_parser1(): - parser = PSBaseParser(BytesIO(b"123.456")) - assert list(parser) == [(0, 123.456)] - parser = 
PSBaseParser(BytesIO(b"+.013")) - assert list(parser) == [(0, 0.013)] - parser = PSBaseParser(BytesIO(b"123")) - assert list(parser) == [(0, 123)] - parser = PSBaseParser(BytesIO(b"true false")) - assert list(parser) == [(0, True), (5, False)] - parser = PSBaseParser(BytesIO(b"(foobie bletch)")) - assert list(parser) == [(0, b"foobie bletch")] - parser = PSBaseParser(BytesIO(b"(foo")) # Invalid string - assert list(parser) == [] + list_parsers(b"123.456", [(0, 123.456)]) + list_parsers(b"+.013", [(0, 0.013)]) + list_parsers(b"123", [(0, 123)]) + list_parsers(b"true false", [(0, True), (5, False)]) + list_parsers(b"(foobie bletch)", [(0, b"foobie bletch")]) + list_parsers(b"(foo", []) def test_new_parser_names(): # Examples from PDF 1.7 page 17 - parser = PSBaseParser(BytesIO(PAGE17)) - tokens = list(parser) - assert tokens == [ - (5, LIT("A;Name_With-Various***Characters?")), - (44, LIT("lime Green")), - (62, LIT("paired()parentheses")), - ] + list_parsers( + PAGE17, + [ + (5, LIT("A;Name_With-Various***Characters?")), + (44, LIT("lime Green")), + (62, LIT("paired()parentheses")), + ], + ) def test_new_parser_strings(): - parser = PSBaseParser( - BytesIO( - rb"( Strings may contain balanced parentheses ( ) and " - rb"special characters ( * ! & } ^ % and so on ) . )" - ) + list_parsers( + rb"( Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . )", + [ + ( + 0, + rb" Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . ", + ) + ], ) - assert list(parser) == [ - ( - 0, - rb" Strings may contain balanced parentheses ( ) and " - rb"special characters ( * ! & } ^ % and so on ) . ", - ) - ] - parser = PSBaseParser(BytesIO(b"()")) - assert list(parser) == [(0, b"")] - parser = PSBaseParser( - BytesIO( - rb"""( These \ + list_parsers(b"()", [(0, b"")]) + list_parsers( + rb"""( These \ two strings \ are the same . ) - """ - ) + """, + [(0, b" These two strings are the same . ")], + ) + list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")]) + list_parsers(b"(foo\r)", [(0, b"foo\n")]) + list_parsers(b"(foo\r\nbaz)", [(0, b"foo\nbaz")]) + list_parsers(b"(foo\n)", [(0, b"foo\n")]) + list_parsers( + rb"( This string contains \245two octal characters\307 . )", + [(0, b" This string contains \245two octal characters\307 . ")], ) - assert list(parser) == [(0, b" These two strings are the same . ")] - parser = PSBaseParser(BytesIO(b"(foo\rbar)")) - assert list(parser) == [(0, b"foo\nbar")] - parser = PSBaseParser(BytesIO(b"(foo\r)")) - assert list(parser) == [(0, b"foo\n")] - parser = PSBaseParser(BytesIO(b"(foo\r\nbaz)")) - assert list(parser) == [(0, b"foo\nbaz")] - parser = PSBaseParser(BytesIO(b"(foo\n)")) - assert list(parser) == [(0, b"foo\n")] - parser = PSBaseParser( - BytesIO(rb"( This string contains \245two octal characters\307 . )") + list_parsers(rb"(\0053 \053 \53)", [(0, b"\0053 \053 +")]) + list_parsers( + rb"< 4E6F762073686D6F7A206B6120706F702E >", [(0, b"Nov shmoz ka pop.")] ) - assert list(parser) == [ - (0, b" This string contains \245two octal characters\307 . 
") - ] - parser = PSBaseParser(BytesIO(rb"(\0053 \053 \53)")) - assert list(parser) == [(0, b"\0053 \053 +")] - parser = PSBaseParser(BytesIO(rb"< 4E6F762073686D6F7A206B6120706F702E >")) - assert list(parser) == [(0, b"Nov shmoz ka pop.")] - parser = PSBaseParser(BytesIO(rb"<73 686 D6F7A2>")) - assert list(parser) == [(0, b"shmoz ")] - parser = PSBaseParser(BytesIO(rb"(\400)")) - assert list(parser) == [(0, b"")] + list_parsers(rb"<73 686 D6F7A2>", [(0, b"shmoz ")]) + list_parsers(rb"(\400)", [(0, b"")]) def test_invalid_strings_eof(): - parser = PSBaseParser(BytesIO(rb"(\00")) - assert list(parser) == [] - parser = PSBaseParser(BytesIO(rb"(abracadab")) - assert list(parser) == [] - parser = PSBaseParser(BytesIO(rb"<73686")) - assert list(parser) == [] + list_parsers(rb"(\00", []) + list_parsers(rb"(abracadab", []) + + +def inline_parsers( + data: bytes, expected: tuple, target=b"EI", nexttoken=None, blocksize=16 +): + bp = PSInMemoryParser(data) + assert bp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert bp.nexttoken() == nexttoken + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + assert fp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert fp.nexttoken() == nexttoken def test_get_inline_data(): - fp = BytesIO(b"""0123456789EI""") - assert get_inline_data(fp) == (10, b"0123456789EI") - fp = BytesIO(b"""0123456789EIEIO""") - assert get_inline_data(fp) == (10, b"0123456789EI") - assert fp.read(3) == b"EIO" - fp = BytesIO(b"""012EIEIO""") - assert get_inline_data(fp, blocksize=4) == (3, b"012EI") - assert fp.read(3) == b"EIO" - fp = BytesIO(b"""0123012EIEIO""") - assert get_inline_data(fp, blocksize=4) == (7, b"0123012EI") - assert fp.read(3) == b"EIO" + kwd_eio = KWD(b"EIO") + kwd_omg = KWD(b"OMG") + inline_parsers(b"""0123456789""", (-1, b"")) + inline_parsers(b"""0123456789EI""", (10, b"0123456789EI")) + inline_parsers( + b"""0123456789EIEIO""", (10, b"0123456789EI"), nexttoken=(12, kwd_eio) + ) + inline_parsers(b"""012EIEIO""", (3, b"012EI"), nexttoken=(5, kwd_eio), blocksize=4) + inline_parsers( + b"""0123012EIEIO""", (7, b"0123012EI"), nexttoken=(9, kwd_eio), blocksize=4 + ) for blocksize in range(1, 8): - fp = BytesIO(b"""012EIEIOOMG""") - assert get_inline_data(fp, blocksize=blocksize, target=b"EIEIO") == ( - 3, - b"012EIEIO", + inline_parsers( + b"""012EIEIOOMG""", + ( + 3, + b"012EIEIO", + ), + target=b"EIEIO", + nexttoken=(8, kwd_omg), + blocksize=blocksize, ) - assert fp.read(3) == b"OMG"