diff --git a/playa/pdfparser.py b/playa/pdfparser.py index d5057238..efd2a024 100644 --- a/playa/pdfparser.py +++ b/playa/pdfparser.py @@ -80,16 +80,17 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: except KeyError: if settings.STRICT: raise PDFSyntaxError("/Length is undefined: %r" % dic) + # back up and read the entire line including 'stream' as + # the data starts after the trailing newline self.seek(pos) try: - (_, line) = self.nextline() # 'stream' + (_, line) = self.nextline() # 'stream\n' except PSEOF: if settings.STRICT: raise PDFSyntaxError("Unexpected EOF") return - pos += len(line) - data = bytearray(self.read(pos, objlen)) - self.seek(pos + objlen) + pos = self.tell() + data = self.read(objlen) while True: try: (linepos, line) = self.nextline() diff --git a/playa/psparser.py b/playa/psparser.py index 4a38e680..b1f142eb 100755 --- a/playa/psparser.py +++ b/playa/psparser.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import io import logging +import mmap import re from binascii import unhexlify from collections import deque @@ -188,10 +189,9 @@ def tell(self) -> int: """Get the current position in the file.""" return self.fp.tell() - def read(self, pos: int, objlen: int) -> bytes: + def read(self, objlen: int) -> bytes: """Read data from a specified position, moving the current position to the end of this data.""" - self.fp.seek(pos) return self.fp.read(objlen) def nextline(self) -> Tuple[int, bytes]: @@ -610,6 +610,7 @@ def _parse_hexstring(self) -> bytes: | (?P \\.) | (?P \() | (?P \)) + | (?P \r\n?|\n) | (?P .) )""", re.VERBOSE, @@ -624,7 +625,7 @@ class PSInMemoryParser: Parser for in-memory data streams. """ - def __init__(self, data: bytes) -> None: + def __init__(self, data: Union[bytes, mmap.mmap]) -> None: self.data = data self.pos = 0 self.end = len(data) @@ -646,10 +647,11 @@ def tell(self) -> int: """Get the current position in the buffer.""" return self.pos - def read(self, pos: int, objlen: int) -> bytes: - """Read data from a specified position, moving the current - position to the end of this data.""" - self.pos = max(pos + objlen, len(self.data)) + def read(self, objlen: int) -> bytes: + """Read data from current position, advancing to the end of + this data.""" + pos = self.pos + self.pos = min(pos + objlen, len(self.data)) return self.data[pos : self.pos] def nextline(self) -> Tuple[int, bytes]: @@ -673,8 +675,8 @@ def revreadlines(self) -> Iterator[bytes]: """ endline = pos = self.end while True: - nidx = self.data.rfind(ord(b"\n"), 0, pos) - ridx = self.data.rfind(ord(b"\r"), 0, pos) + nidx = self.data.rfind(b"\n", 0, pos) + ridx = self.data.rfind(b"\r", 0, pos) best = max(nidx, ridx) if best == -1: yield self.data[:endline] @@ -767,7 +769,8 @@ def __next__(self) -> Tuple[int, PSBaseParserToken]: def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: """Parse the remainder of a string.""" - parts = [start] + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts = [EOLR.sub(b"\n", start)] paren = 1 for m in STRLEXER.finditer(self.data, pos): self.pos = m.end() @@ -795,6 +798,9 @@ def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken] log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) else: parts.append(bytes((chrcode,))) + elif m.lastgroup == "newline": # type: ignore + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts.append(b"\n") elif m.lastgroup == "linebreak": # type: ignore pass else: @@ -802,7 +808,7 @@ def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken] if paren != 0: log.warning("Unterminated string at %d", pos) raise StopIteration - return (self._curtokenpos, b"".join(EOLR.sub(b"\n", part) for part in parts)) + return (self._curtokenpos, b"".join(parts)) # Stack slots may by occupied by any of: @@ -830,7 +836,14 @@ def reinit(self, reader: Union[BinaryIO, bytes]) -> None: reader ) else: - self._parser = PSFileParser(reader) + try: + self._mmap = mmap.mmap(reader.fileno(), 0, access=mmap.ACCESS_READ) + self._parser = PSInMemoryParser(self._mmap) + except io.UnsupportedOperation: + log.warning( + "mmap not supported on %r, falling back to file parser", reader + ) + self._parser = PSFileParser(reader) self.reset() def reset(self) -> None: @@ -845,6 +858,10 @@ def seek(self, pos: int) -> None: self._parser.seek(pos) self.reset() + def tell(self) -> int: + """Get the current position in the file.""" + return self._parser.tell() + def push(self, *objs: PSStackEntry[ExtraT]) -> None: """Push some objects onto the stack.""" self.curstack.extend(objs) @@ -985,10 +1002,10 @@ def revreadlines(self) -> Iterator[bytes]: """ return self._parser.revreadlines() - def read(self, pos: int, objlen: int) -> bytes: + def read(self, objlen: int) -> bytes: """Read data from a specified position, moving the current position to the end of this data.""" - return self._parser.read(pos, objlen) + return self._parser.read(objlen) def nexttoken(self) -> Tuple[int, PSBaseParserToken]: """Get the next token in iteration, raising PSEOF when done.""" diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index cb84a20f..acbbf999 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -283,6 +283,26 @@ def bench_bytes(): ) +def bench_mmap(): + import mmap + + from playa.psparser import PSInMemoryParser + + with tempfile.NamedTemporaryFile() as tf: + runs = 100 + with open(tf.name, "wb") as outfh: + outfh.write(DATA * runs) + with open(tf.name, "rb") as infh: + start = time.time() + mapping = mmap.mmap(infh.fileno(), 0, access=mmap.ACCESS_READ) + parser = PSInMemoryParser(mapping) + _ = list(parser) + print( + "PLAYA Parser (mmap): %fms / run" + % ((time.time() - start) / runs * 1000), + ) + + def bench_bytesio(): from pdfminer.psparser import PSEOF, PSBaseParser @@ -305,7 +325,7 @@ def bench_playa(): from playa.pdfdocument import PDFDocument from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager from playa.pdfpage import PDFPage - from playa.psparser import PSFileParser, PSInMemoryParser + from playa.psparser import PSFileParser runs = 100 start = time.time() @@ -314,12 +334,6 @@ def bench_playa(): print( "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), ) - start = time.time() - parser = PSInMemoryParser(DATA * runs) - _ = list(parser) - print( - "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), - ) with tempfile.NamedTemporaryFile() as tf: runs = 100 with open(tf.name, "wb") as outfh: @@ -332,6 +346,8 @@ def bench_playa(): "PLAYA Parser (BinaryIO): %fms / run" % ((time.time() - start) / runs * 1000), ) + bench_bytes() + bench_mmap() runs = 20 start = time.time() @@ -405,3 +421,9 @@ def bench_pdfminer(): bench_pdfminer() if len(sys.argv) < 2 or sys.argv[1] == "playa": bench_playa() + if len(sys.argv) > 1 and sys.argv[1] == "bytes": + bench_bytes() + if len(sys.argv) > 1 and sys.argv[1] == "bytesio": + bench_bytesio() + if len(sys.argv) > 1 and sys.argv[1] == "mmap": + bench_mmap() diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py index b5786792..8d25112b 100644 --- a/tests/test_pdfminer_psparser.py +++ b/tests/test_pdfminer_psparser.py @@ -330,8 +330,10 @@ def test_new_parser_strings() -> None: ) list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")]) list_parsers(b"(foo\r)", [(0, b"foo\n")]) - list_parsers(b"(foo\r\nbaz)", [(0, b"foo\nbaz")]) + list_parsers(b"(foo\r\nbar\r\nbaz)", [(0, b"foo\nbar\nbaz")]) list_parsers(b"(foo\n)", [(0, b"foo\n")]) + list_parsers(br"(foo\r\nbaz)", [(0, b"foo\r\nbaz")]) + list_parsers(br"(foo\r\nbar\r\nbaz)", [(0, b"foo\r\nbar\r\nbaz")]) list_parsers( rb"( This string contains \245two octal characters\307 . )", [(0, b" This string contains \245two octal characters\307 . ")],