Skip to content

Commit

Permalink
Use mmap when possible to parse and fix various small bugs (#7)
Browse files Browse the repository at this point in the history
* fix: embarrassing confusion of min/max

* feat: use mmap + PSInMemoryParser when possible

* fix: correct (nonsense) CRLF handling in in-memory parser
  • Loading branch information
dhdaines authored Sep 19, 2024
1 parent 9c7768f commit 025fa8a
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 26 deletions.
9 changes: 5 additions & 4 deletions playa/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,17 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
except KeyError:
if settings.STRICT:
raise PDFSyntaxError("/Length is undefined: %r" % dic)
# back up and read the entire line including 'stream' as
# the data starts after the trailing newline
self.seek(pos)
try:
(_, line) = self.nextline() # 'stream'
(_, line) = self.nextline() # 'stream\n'
except PSEOF:
if settings.STRICT:
raise PDFSyntaxError("Unexpected EOF")
return
pos += len(line)
data = bytearray(self.read(pos, objlen))
self.seek(pos + objlen)
pos = self.tell()
data = self.read(objlen)
while True:
try:
(linepos, line) = self.nextline()
Expand Down
45 changes: 31 additions & 14 deletions playa/psparser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import io
import logging
import mmap
import re
from binascii import unhexlify
from collections import deque
Expand Down Expand Up @@ -188,10 +189,9 @@ def tell(self) -> int:
"""Get the current position in the file."""
return self.fp.tell()

def read(self, pos: int, objlen: int) -> bytes:
def read(self, objlen: int) -> bytes:
"""Read data from the current position, moving the current
position to the end of this data."""
self.fp.seek(pos)
return self.fp.read(objlen)

def nextline(self) -> Tuple[int, bytes]:
Expand Down Expand Up @@ -610,6 +610,7 @@ def _parse_hexstring(self) -> bytes:
| (?P<escape> \\.)
| (?P<parenleft> \()
| (?P<parenright> \))
| (?P<newline> \r\n?|\n)
| (?P<other> .)
)""",
re.VERBOSE,
Expand All @@ -624,7 +625,7 @@ class PSInMemoryParser:
Parser for in-memory data streams.
"""

def __init__(self, data: bytes) -> None:
def __init__(self, data: Union[bytes, mmap.mmap]) -> None:
self.data = data
self.pos = 0
self.end = len(data)
Expand All @@ -646,10 +647,11 @@ def tell(self) -> int:
"""Get the current position in the buffer."""
return self.pos

def read(self, pos: int, objlen: int) -> bytes:
"""Read data from a specified position, moving the current
position to the end of this data."""
self.pos = max(pos + objlen, len(self.data))
def read(self, objlen: int) -> bytes:
"""Read data from current position, advancing to the end of
this data."""
pos = self.pos
self.pos = min(pos + objlen, len(self.data))
return self.data[pos : self.pos]

def nextline(self) -> Tuple[int, bytes]:
Expand All @@ -673,8 +675,8 @@ def revreadlines(self) -> Iterator[bytes]:
"""
endline = pos = self.end
while True:
nidx = self.data.rfind(ord(b"\n"), 0, pos)
ridx = self.data.rfind(ord(b"\r"), 0, pos)
nidx = self.data.rfind(b"\n", 0, pos)
ridx = self.data.rfind(b"\r", 0, pos)
best = max(nidx, ridx)
if best == -1:
yield self.data[:endline]
Expand Down Expand Up @@ -767,7 +769,8 @@ def __next__(self) -> Tuple[int, PSBaseParserToken]:

def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]:
"""Parse the remainder of a string."""
parts = [start]
# Handle nonsense CRLF conversion in strings (PDF 1.7, p.15)
parts = [EOLR.sub(b"\n", start)]
paren = 1
for m in STRLEXER.finditer(self.data, pos):
self.pos = m.end()
Expand Down Expand Up @@ -795,14 +798,17 @@ def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]
log.warning("Invalid octal %r (%d)", m[0][1:], chrcode)
else:
parts.append(bytes((chrcode,)))
elif m.lastgroup == "newline": # type: ignore
# Handle nonsense CRLF conversion in strings (PDF 1.7, p.15)
parts.append(b"\n")
elif m.lastgroup == "linebreak": # type: ignore
pass
else:
parts.append(m[0])
if paren != 0:
log.warning("Unterminated string at %d", pos)
raise StopIteration
return (self._curtokenpos, b"".join(EOLR.sub(b"\n", part) for part in parts))
return (self._curtokenpos, b"".join(parts))


# Stack slots may be occupied by any of:
Expand Down Expand Up @@ -830,7 +836,14 @@ def reinit(self, reader: Union[BinaryIO, bytes]) -> None:
reader
)
else:
self._parser = PSFileParser(reader)
try:
self._mmap = mmap.mmap(reader.fileno(), 0, access=mmap.ACCESS_READ)
self._parser = PSInMemoryParser(self._mmap)
except io.UnsupportedOperation:
log.warning(
"mmap not supported on %r, falling back to file parser", reader
)
self._parser = PSFileParser(reader)
self.reset()

def reset(self) -> None:
Expand All @@ -845,6 +858,10 @@ def seek(self, pos: int) -> None:
self._parser.seek(pos)
self.reset()

def tell(self) -> int:
"""Get the current position in the file."""
return self._parser.tell()

def push(self, *objs: PSStackEntry[ExtraT]) -> None:
"""Push some objects onto the stack."""
self.curstack.extend(objs)
Expand Down Expand Up @@ -985,10 +1002,10 @@ def revreadlines(self) -> Iterator[bytes]:
"""
return self._parser.revreadlines()

def read(self, pos: int, objlen: int) -> bytes:
def read(self, objlen: int) -> bytes:
"""Read data from the current position, moving the current
position to the end of this data."""
return self._parser.read(pos, objlen)
return self._parser.read(objlen)

def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
"""Get the next token in iteration, raising PSEOF when done."""
Expand Down
36 changes: 29 additions & 7 deletions tests/benchmark_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,26 @@ def bench_bytes():
)


def bench_mmap():
import mmap

from playa.psparser import PSInMemoryParser

with tempfile.NamedTemporaryFile() as tf:
runs = 100
with open(tf.name, "wb") as outfh:
outfh.write(DATA * runs)
with open(tf.name, "rb") as infh:
start = time.time()
mapping = mmap.mmap(infh.fileno(), 0, access=mmap.ACCESS_READ)
parser = PSInMemoryParser(mapping)
_ = list(parser)
print(
"PLAYA Parser (mmap): %fms / run"
% ((time.time() - start) / runs * 1000),
)


def bench_bytesio():
from pdfminer.psparser import PSEOF, PSBaseParser

Expand All @@ -305,7 +325,7 @@ def bench_playa():
from playa.pdfdocument import PDFDocument
from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager
from playa.pdfpage import PDFPage
from playa.psparser import PSFileParser, PSInMemoryParser
from playa.psparser import PSFileParser

runs = 100
start = time.time()
Expand All @@ -314,12 +334,6 @@ def bench_playa():
print(
"PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000),
)
start = time.time()
parser = PSInMemoryParser(DATA * runs)
_ = list(parser)
print(
"PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000),
)
with tempfile.NamedTemporaryFile() as tf:
runs = 100
with open(tf.name, "wb") as outfh:
Expand All @@ -332,6 +346,8 @@ def bench_playa():
"PLAYA Parser (BinaryIO): %fms / run"
% ((time.time() - start) / runs * 1000),
)
bench_bytes()
bench_mmap()

runs = 20
start = time.time()
Expand Down Expand Up @@ -405,3 +421,9 @@ def bench_pdfminer():
bench_pdfminer()
if len(sys.argv) < 2 or sys.argv[1] == "playa":
bench_playa()
if len(sys.argv) > 1 and sys.argv[1] == "bytes":
bench_bytes()
if len(sys.argv) > 1 and sys.argv[1] == "bytesio":
bench_bytesio()
if len(sys.argv) > 1 and sys.argv[1] == "mmap":
bench_mmap()
4 changes: 3 additions & 1 deletion tests/test_pdfminer_psparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,8 +330,10 @@ def test_new_parser_strings() -> None:
)
list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")])
list_parsers(b"(foo\r)", [(0, b"foo\n")])
list_parsers(b"(foo\r\nbaz)", [(0, b"foo\nbaz")])
list_parsers(b"(foo\r\nbar\r\nbaz)", [(0, b"foo\nbar\nbaz")])
list_parsers(b"(foo\n)", [(0, b"foo\n")])
list_parsers(br"(foo\r\nbaz)", [(0, b"foo\r\nbaz")])
list_parsers(br"(foo\r\nbar\r\nbaz)", [(0, b"foo\r\nbar\r\nbaz")])
list_parsers(
rb"( This string contains \245two octal characters\307 . )",
[(0, b" This string contains \245two octal characters\307 . ")],
Expand Down

0 comments on commit 025fa8a

Please sign in to comment.