From 37f3b2e183091ebaddceb4dd051bbc98c022c70d Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 18 Sep 2024 22:30:25 -0400 Subject: [PATCH] Implement in-memory parsing (#6) * refactor: put get_inline_data where it can be overridden * fix: handle case where eos not found * feat: in-memory lexer which is Very Fast * test: fully test both parsers * fix: make it work again * fix: eliminate inheritance * feat: accept bytes directly * chore: format * fix: simplify and fix PDFContentParser * feat: implement switch between file/memory parsers --- playa/cmapdb.py | 5 +- playa/pdffont.py | 10 +- playa/pdfinterp.py | 122 +++++--------- playa/pdfparser.py | 10 +- playa/psparser.py | 334 ++++++++++++++++++++++++++++++++++++-- tests/benchmark_parser.py | 41 ++++- tests/test_open.py | 12 +- tests/test_pdfparser.py | 246 ++++++++++++++++------------ 8 files changed, 569 insertions(+), 211 deletions(-) diff --git a/playa/cmapdb.py b/playa/cmapdb.py index 43e7708f..bde0dde3 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -18,7 +18,6 @@ import sys from typing import ( Any, - BinaryIO, Dict, Iterable, Iterator, @@ -277,8 +276,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, cmap: CMapBase, data: bytes) -> None: + super().__init__(data) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. self._in_cmap = True diff --git a/playa/pdffont.py b/playa/pdffont.py index 2399e129..5ad3b329 100644 --- a/playa/pdffont.py +++ b/playa/pdffont.py @@ -114,8 +114,8 @@ class Type1FontHeaderParser(PSStackParser[int]): KEYWORD_READONLY = KWD(b"readonly") KEYWORD_FOR = KWD(b"for") - def __init__(self, data: BinaryIO) -> None: - PSStackParser.__init__(self, data) + def __init__(self, data: bytes) -> None: + super().__init__(data) self._cid2unicode: Dict[int, str] = {} def get_encoding(self) -> Dict[int, str]: @@ -968,7 +968,7 @@ def __init__( if "ToUnicode" in spec: strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() PDFFont.__init__(self, descriptor, widths) def to_unichr(self, cid: int) -> str: @@ -1008,7 +1008,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No self.fontfile = stream_value(descriptor.get("FontFile")) length1 = int_value(self.fontfile["Length1"]) data = self.fontfile.get_data()[:length1] - parser = Type1FontHeaderParser(BytesIO(data)) + parser = Type1FontHeaderParser(data) self.cid2unicode = parser.get_encoding() def __repr__(self) -> str: @@ -1079,7 +1079,7 @@ def __init__( if isinstance(spec["ToUnicode"], PDFStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() else: cmap_name = literal_name(spec["ToUnicode"]) encoding = literal_name(spec["Encoding"]) diff --git a/playa/pdfinterp.py b/playa/pdfinterp.py index 471092b7..52b19932 100644 --- a/playa/pdfinterp.py +++ b/playa/pdfinterp.py @@ -1,12 +1,16 @@ -import io import logging -from io import BytesIO -from typing import BinaryIO, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast +from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast from playa import settings from playa.casting import 
safe_float from playa.cmapdb import CMap, CMapBase, CMapDB -from playa.exceptions import PSEOF, PDFException, PDFValueError, PSTypeError +from playa.exceptions import ( + PSEOF, + PDFException, + PDFSyntaxError, + PDFValueError, + PSTypeError, +) from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace from playa.pdfdevice import PDFDevice, PDFTextSeq from playa.pdffont import ( @@ -30,6 +34,7 @@ from playa.psparser import ( KWD, LIT, + PSBaseParserToken, PSKeyword, PSLiteral, PSStackParser, @@ -252,85 +257,38 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: KEYWORD_EI = KWD(b"EI") -def get_inline_data( - fp: BinaryIO, target: bytes = b"EI", blocksize: int = 4096 -) -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker. - - Returns a tuple of the position of the target in the data and the - data *including* the end of stream marker. Advances the file - pointer to a position after the end of the stream. - - The caller is responsible for removing the end-of-stream if - necessary (this depends on the filter being used) and parsing - the end-of-stream token (likewise) if necessary. +class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): + """Parse the concatenation of multiple content streams, as + described in the spec (PDF 1.7, p.86): + + ...the effect shall be as if all of the streams in the array were + concatenated, in order, to form a single stream. Conforming + writers can create image objects and other resources as they + occur, even though they interrupt the content stream. The division + between streams may occur only at the boundaries between lexical + tokens (see 7.2, "Lexical Conventions") but shall be unrelated to + the page’s logical content or organization. """ - # PDF 1.7, p. 216: The bytes between the ID and EI operators - # shall be treated the same as a stream object’s data (see - # 7.3.8, "Stream Objects"), even though they do not follow the - # standard stream syntax. - data = [] # list of blocks - partial = b"" # partially seen target - pos = 0 - while True: - # Did we see part of the target at the end of the last - # block? Then scan ahead and try to find the rest (we - # assume the stream is buffered) - if partial: - extra_len = len(target) - len(partial) - extra = fp.read(extra_len) - if partial + extra == target: - pos -= len(partial) - data.append(extra) - break - # Put it back (assume buffering!) - fp.seek(-extra_len, io.SEEK_CUR) - partial = b"" - # Fall through (the target could be at the beginning) - buf = fp.read(blocksize) - tpos = buf.find(target) - if tpos != -1: - data.append(buf[: tpos + len(target)]) - # Put the extra back (assume buffering!) - fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) - pos += tpos - break - else: - pos += len(buf) - # look for the longest partial match at the end - plen = len(target) - 1 - while plen > 0: - ppos = len(buf) - plen - if buf[ppos:] == target[:plen]: - partial = buf[ppos:] - break - plen -= 1 - data.append(buf) - return (pos, b"".join(data)) - -class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): def __init__(self, streams: Sequence[object]) -> None: - self.streams = streams - self.istream = 0 - # PSStackParser.__init__(fp=None) is safe only because we've overloaded - # all the methods that would attempt to access self.fp without first - # calling self.fillfp(). 
- PSStackParser.__init__(self, None) # type: ignore[arg-type] - - def fillfp(self) -> None: - if not self.fp: - if self.istream < len(self.streams): - strm = stream_value(self.streams[self.istream]) - self.istream += 1 - else: - raise PSEOF("Unexpected EOF, file truncated?") - self.fp = BytesIO(strm.get_data()) + self.streamiter = iter(streams) + try: + stream = stream_value(next(self.streamiter)) + except StopIteration: + raise PSEOF + log.debug("PDFContentParser starting stream %r", stream) + super().__init__(stream.get_data()) - def seek(self, pos: int) -> None: - self.fillfp() - super().seek(pos) + def __next__(self) -> Tuple[int, PSBaseParserToken]: + while True: + try: + return super().__next__() + except StopIteration: + # Will also raise StopIteration if there are no more, + # which is exactly what we want + stream = stream_value(next(self.streamiter)) + log.debug("PDFContentParser starting stream %r", stream) + self.reinit(stream.get_data()) def flush(self) -> None: self.add_results(*self.popall()) @@ -360,7 +318,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: # interpreted as the first byte of image data. if eos == b"EI": self.seek(pos + len(token.name) + 1) - (pos, data) = get_inline_data(self.fp, target=eos) + (pos, data) = self.get_inline_data(target=eos) # FIXME: it is totally unspecified what to do with # a newline between the end of the data and "EI", # since there is no explicit stream length. (PDF @@ -371,7 +329,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: data = data[: -len(eos)] else: self.seek(pos + len(token.name)) - (pos, data) = get_inline_data(self.fp, target=eos) + (pos, data) = self.get_inline_data(target=eos) + if pos == -1: + raise PDFSyntaxError("End of inline stream %r not found" % eos) obj = PDFStream(d, data) self.push((pos, obj)) # This was included in the data but we need to "parse" it diff --git a/playa/pdfparser.py b/playa/pdfparser.py index 3603c79f..d5057238 100644 --- a/playa/pdfparser.py +++ b/playa/pdfparser.py @@ -1,5 +1,4 @@ import logging -from io import BytesIO from typing import TYPE_CHECKING, BinaryIO, Optional, Union from playa import settings @@ -40,8 +39,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """ - def __init__(self, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, data: Union[BinaryIO, bytes]) -> None: + super().__init__(data) self.doc: Optional[PDFDocument] = None self.fallback = False @@ -89,8 +88,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: raise PDFSyntaxError("Unexpected EOF") return pos += len(line) - self.fp.seek(pos) - data = bytearray(self.fp.read(objlen)) + data = bytearray(self.read(pos, objlen)) self.seek(pos + objlen) while True: try: @@ -135,7 +133,7 @@ class PDFStreamParser(PDFParser): """ def __init__(self, data: bytes) -> None: - PDFParser.__init__(self, BytesIO(data)) + super().__init__(data) def flush(self) -> None: self.add_results(*self.popall()) diff --git a/playa/psparser.py b/playa/psparser.py index 1eb6d990..1c4e9431 100755 --- a/playa/psparser.py +++ b/playa/psparser.py @@ -139,7 +139,6 @@ def keyword_name(x: Any) -> Any: EOL = b"\r\n" -SPC = re.compile(rb"\s") WHITESPACE = b" \t\n\r\f\v" NUMBER = b"0123456789" HEX = NUMBER + b"abcdef" + b"ABCDEF" @@ -162,14 +161,19 @@ def keyword_name(x: Any) -> Any: PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] -class PSBaseParser: +class PSFileParser: + """ + Parser (actually a lexer) for PDF data from a buffered file object. 
+ """ + def __init__(self, fp: BinaryIO): self.fp = fp self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() self.seek(0) - def flush(self) -> None: - pass + def reinit(self, fp: BinaryIO): + self.fp = fp + self.seek(0) def seek(self, pos: int) -> None: self.fp.seek(pos) @@ -181,6 +185,10 @@ def seek(self, pos: int) -> None: def tell(self) -> int: return self.fp.tell() + def read(self, pos: int, objlen: int) -> bytes: + self.fp.seek(pos) + return self.fp.read(objlen) + def nextline(self) -> Tuple[int, bytes]: r"""Fetches a next line that ends either with \r, \n, or \r\n.""" linepos = self.fp.tell() @@ -227,6 +235,65 @@ def revreadlines(self) -> Iterator[bytes]: buf = c + buf yield buf + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = 4096 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + # PDF 1.7, p. 216: The bytes between the ID and EI operators + # shall be treated the same as a stream object’s data (see + # 7.3.8, "Stream Objects"), even though they do not follow the + # standard stream syntax. + data = [] # list of blocks + partial = b"" # partially seen target + pos = 0 + while True: + # Did we see part of the target at the end of the last + # block? Then scan ahead and try to find the rest (we + # assume the stream is buffered) + if partial: + extra_len = len(target) - len(partial) + extra = self.fp.read(extra_len) + if partial + extra == target: + pos -= len(partial) + data.append(extra) + break + # Put it back (assume buffering!) + self.fp.seek(-extra_len, io.SEEK_CUR) + partial = b"" + # Fall through (the target could be at the beginning) + buf = self.fp.read(blocksize) + if not buf: + return (-1, b"") + tpos = buf.find(target) + if tpos != -1: + data.append(buf[: tpos + len(target)]) + # Put the extra back (assume buffering!) + self.fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) + pos += tpos + break + else: + pos += len(buf) + # look for the longest partial match at the end + plen = len(target) - 1 + while plen > 0: + ppos = len(buf) - plen + if buf[ppos:] == target[:plen]: + partial = buf[ppos:] + break + plen -= 1 + data.append(buf) + return (pos, b"".join(data)) + def __iter__(self): return self @@ -458,7 +525,7 @@ def _parse_string_octal(self): chrcode = int(self.oct, 8) if chrcode >= 256: # PDF1.7 p.16: "high-order overflow shall be ignored." - log.warning("Invalid octal %s (%d)", repr(self.oct), chrcode) + log.warning("Invalid octal %r (%d)", self.oct, chrcode) else: self._curtoken += bytes((chrcode,)) # Back to normal string parsing @@ -509,6 +576,218 @@ def _parse_hexstring(self): return c +LEXER = re.compile( + rb"""(?: + (?P \s+) + | (?P %[^\r\n]*[\r\n]) + | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) + | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) + | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) + | (?P \([^()\\]*) + | (?P <[A-Fa-f\d\s]+>) + | (?P <<) + | (?P >>) + | (?P .) +) +""", + re.VERBOSE, +) +STRLEXER = re.compile( + rb"""(?: + (?P \\\d{1,3}) + | (?P \\(?:\r\n?|\n)) + | (?P \\.) + | (?P \() + | (?P \)) + | (?P .) 
+)""", + re.VERBOSE, +) +HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") +EOLR = re.compile(rb"\r\n?|\n") +SPC = re.compile(rb"\s") + + +class PSInMemoryParser: + """ + Parser for in-memory data streams. + """ + + def __init__(self, data: bytes): + self.data = data + self.pos = 0 + self.end = len(data) + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() + + def reinit(self, data: bytes): + self.data = data + self.seek(0) + + def seek(self, pos: int) -> None: + self.pos = pos + self._curtoken = b"" + self._curtokenpos = 0 + self._tokens.clear() + + def tell(self) -> int: + return self.pos + + def read(self, pos: int, objlen: int) -> bytes: + self.pos = max(pos + objlen, len(self.data)) + return self.data[pos : self.pos] + + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or \r\n.""" + if self.pos == self.end: + raise PSEOF + linepos = self.pos + m = EOLR.search(self.data, self.pos) + if m is None: + self.pos = self.end + else: + self.pos = m.end() + return (linepos, self.data[linepos : self.pos]) + + def revreadlines(self) -> Iterator[bytes]: + """Fetches a next line backwards. + + This is used to locate the trailers at the end of a file. So, + it isn't actually used in PSInMemoryParser, but is here for + completeness. + """ + endline = pos = self.end + while True: + nidx = self.data.rfind(ord(b"\n"), 0, pos) + ridx = self.data.rfind(ord(b"\r"), 0, pos) + best = max(nidx, ridx) + if best == -1: + yield self.data[:endline] + break + yield self.data[best + 1 : endline] + endline = best + 1 + pos = best + if pos > 0 and self.data[pos - 1 : pos + 1] == b"\r\n": + pos -= 1 + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = -1 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + tpos = self.data.find(target, self.pos) + if tpos != -1: + nextpos = tpos + len(target) + result = (tpos, self.data[self.pos : nextpos]) + self.pos = nextpos + return result + return (-1, b"") + + def __iter__(self): + return self + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Lexer (most of the work is done in regular expressions, but + PDF syntax is not entirely regular due to the use of balanced + parentheses in strings).""" + while True: + m = LEXER.match(self.data, self.pos) + if m is None: # can only happen at EOS + raise StopIteration + self._curtokenpos = m.start() + self.pos = m.end() + if m.lastgroup not in ("whitespace", "comment"): + # Okay, we got a token or something + break + self._curtoken = m[0] + if m.lastgroup == "name": + self._curtoken = m[0][1:] + self._curtoken = HEXDIGIT.sub( + lambda x: bytes((int(x[1], 16),)), self._curtoken + ) + try: + tok = LIT(self._curtoken.decode("utf-8")) + except UnicodeDecodeError: + tok = LIT(self._curtoken) + return (self._curtokenpos, tok) + if m.lastgroup == "number": + if b"." 
 # Stack slots may be occupied by any of:
 #  * the name of a literal
 #  * the PSBaseParserToken types
@@ -520,9 +799,17 @@
 PSStackEntry = Tuple[int, PSStackType[ExtraT]]
 
 
-class PSStackParser(PSBaseParser, Generic[ExtraT]):
-    def __init__(self, reader: BinaryIO) -> None:
-        PSBaseParser.__init__(self, reader)
+class PSStackParser(Generic[ExtraT]):
+    def __init__(self, reader: Union[BinaryIO, bytes]) -> None:
+        self.reinit(reader)
+
+    def reinit(self, reader: Union[BinaryIO, bytes]) -> None:
+        if isinstance(reader, bytes):
+            self._parser: Union[PSInMemoryParser, PSFileParser] = PSInMemoryParser(
+                reader
+            )
+        else:
+            self._parser = PSFileParser(reader)
         self.reset()
 
     def reset(self) -> None:
@@ -532,7 +819,7 @@ def reset(self) -> None:
         self.results: List[PSStackEntry[ExtraT]] = []
 
     def seek(self, pos: int) -> None:
-        PSBaseParser.seek(self, pos)
+        self._parser.seek(pos)
         self.reset()
 
     def push(self, *objs: PSStackEntry[ExtraT]) -> None:
@@ -571,6 +858,9 @@ def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
         pass
 
+    def flush(self) -> None:
+        pass
+
     def nextobject(self) -> PSStackEntry[ExtraT]:
         """Yields a list of objects.
 
         :return: keywords, literals, strings, numbers, arrays and dictionaries.
""" while not self.results: - (pos, token) = self.nexttoken() + (pos, token) = self._parser.nexttoken() if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): # normal token self.push((pos, token)) @@ -643,10 +933,32 @@ def nextobject(self) -> PSStackEntry[ExtraT]: if self.context: continue else: - self.flush() # FIXME: what does it do? + self.flush() # Does nothing here, but in subclasses... (ugh) obj = self.results.pop(0) try: log.debug("nextobject: %r", obj) except Exception: log.debug("nextobject: (unprintable object)") return obj + + # Delegation follows + def nextline(self) -> Tuple[int, bytes]: + return self._parser.nextline() + + def revreadlines(self) -> Iterator[bytes]: + return self._parser.revreadlines() + + def read(self, pos: int, objlen: int) -> bytes: + return self._parser.read(pos, objlen) + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + return self._parser.nexttoken() + + def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: + return self._parser.get_inline_data(target) + + def __iter__(self): + return self + + def __next__(self): + return next(self._parser) diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index a4ba03c0..cb84a20f 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -271,27 +271,62 @@ """ +def bench_bytes(): + from playa.psparser import PSInMemoryParser + + runs = 100 + start = time.time() + parser = PSInMemoryParser(DATA * runs) + _ = list(parser) + print( + "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), + ) + + +def bench_bytesio(): + from pdfminer.psparser import PSEOF, PSBaseParser + + runs = 100 + start = time.time() + parser = PSBaseParser(BytesIO(DATA * runs)) + while True: + try: + _ = parser.nexttoken() + except PSEOF: + break + print( + "pdfminer.six Parser (BytesIO): %fms / run" + % ((time.time() - start) / runs * 1000), + ) + + def bench_playa(): from playa.converter import PDFPageAggregator from playa.pdfdocument import PDFDocument from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager from playa.pdfpage import PDFPage - from playa.psparser import PSBaseParser + from playa.psparser import PSFileParser, PSInMemoryParser runs = 100 start = time.time() - parser = PSBaseParser(BytesIO(DATA * runs)) + parser = PSFileParser(BytesIO(DATA * runs)) _ = list(parser) print( "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), ) + start = time.time() + parser = PSInMemoryParser(DATA * runs) + _ = list(parser) + print( + "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), + ) with tempfile.NamedTemporaryFile() as tf: runs = 100 with open(tf.name, "wb") as outfh: outfh.write(DATA * runs) with open(tf.name, "rb") as infh: start = time.time() - parser = PSBaseParser(infh) + parser = PSFileParser(infh) _ = list(parser) print( "PLAYA Parser (BinaryIO): %fms / run" diff --git a/tests/test_open.py b/tests/test_open.py index df57513f..163e0c93 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -34,7 +34,6 @@ def test_open(path: Path): for password in passwords: with playa.open(TESTDIR / path, password=password) as pdf: pass - assert pdf.parser.fp.closed assert pdf.parser.doc is None @@ -51,5 +50,16 @@ def test_inline_data(): interp.process_page(page) +def test_multiple_contents(): + # See above... 
+ with playa.open(TESTDIR / "jo.pdf") as doc: + rsrc = PDFResourceManager() + agg = PDFPageAggregator(rsrc, pageno=1) + interp = PDFPageInterpreter(rsrc, agg) + page = next(PDFPage.create_pages(doc)) + assert len(page.contents) > 1 + interp.process_page(page) + + if __name__ == "__main__": test_open(TESTDIR / "simple5.pdf") diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index b4d9df97..109b5536 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -2,16 +2,16 @@ Test the PDF parser """ -from io import BytesIO +import tempfile from playa.exceptions import PSEOF -from playa.pdfinterp import get_inline_data from playa.psparser import ( KEYWORD_DICT_BEGIN, KEYWORD_DICT_END, KWD, LIT, - PSBaseParser, + PSFileParser, + PSInMemoryParser, ) TESTDATA = b""" @@ -31,24 +31,54 @@ ] -def test_nextline(): - """Verify that we replicate the old nextline method.""" - parser = PSBaseParser(BytesIO(TESTDATA)) - lines = [] +def run_parsers(data: bytes, expected: list, makefunc): + """Test stuff on both BytesIO and BinaryIO.""" + bp = PSInMemoryParser(data) + output = [] + func = makefunc(bp) while True: try: - linepos, line = parser.nextline() + output.append(func()) except PSEOF: break - lines.append((linepos, line)) - assert lines == EXPECTED + assert output == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + func = makefunc(fp) + output = [] + while True: + try: + output.append(func()) + except PSEOF: + break + assert output == expected + + +def test_nextline(): + """Verify that we replicate the old nextline method.""" + run_parsers(TESTDATA, EXPECTED, lambda foo: foo.nextline) def test_revreadlines(): """Verify that we replicate the old revreadlines method.""" - parser = PSBaseParser(BytesIO(TESTDATA)) - lines = list(parser.revreadlines()) - assert lines == list(reversed([line for pos, line in EXPECTED])) + expected = list(reversed([line for pos, line in EXPECTED])) + + def make_next(parser): + itor = parser.revreadlines() + + def nextor(): + try: + line = next(itor) + except StopIteration: + raise PSEOF + return line + + return nextor + + run_parsers(TESTDATA, expected, make_next) SIMPLE1 = b"""1 0 obj @@ -79,18 +109,33 @@ def test_revreadlines(): ] +def list_parsers(data: bytes, expected: list, discard_pos=False): + bp = PSInMemoryParser(data) + if discard_pos: + tokens = [tok for pos, tok in list(bp)] + else: + tokens = list(bp) + assert tokens == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + if discard_pos: + tokens = [tok for pos, tok in list(fp)] + else: + tokens = list(fp) + assert tokens == expected + + def test_new_parser(): # Do a lot of them to make sure buffering works correctly - parser = PSBaseParser(BytesIO(SIMPLE1 * 100)) - tokens = [tok for pos, tok in list(parser)] - assert tokens == SIMPLETOK * 100 + list_parsers(SIMPLE1 * 100, SIMPLETOK * 100, discard_pos=True) def test_new_parser_eof(): # Make sure we get a keyword at eof - parser = PSBaseParser(BytesIO(SIMPLE1[:-1])) - tokens = [tok for pos, tok in list(parser)] - assert tokens == SIMPLETOK + list_parsers(SIMPLE1[:-1], SIMPLETOK, discard_pos=True) PAGE17 = b""" @@ -101,105 +146,104 @@ def test_new_parser_eof(): def test_new_parser1(): - parser = PSBaseParser(BytesIO(b"123.456")) - assert list(parser) == [(0, 123.456)] - parser = 
PSBaseParser(BytesIO(b"+.013")) - assert list(parser) == [(0, 0.013)] - parser = PSBaseParser(BytesIO(b"123")) - assert list(parser) == [(0, 123)] - parser = PSBaseParser(BytesIO(b"true false")) - assert list(parser) == [(0, True), (5, False)] - parser = PSBaseParser(BytesIO(b"(foobie bletch)")) - assert list(parser) == [(0, b"foobie bletch")] - parser = PSBaseParser(BytesIO(b"(foo")) # Invalid string - assert list(parser) == [] + list_parsers(b"123.456", [(0, 123.456)]) + list_parsers(b"+.013", [(0, 0.013)]) + list_parsers(b"123", [(0, 123)]) + list_parsers(b"true false", [(0, True), (5, False)]) + list_parsers(b"(foobie bletch)", [(0, b"foobie bletch")]) + list_parsers(b"(foo", []) def test_new_parser_names(): # Examples from PDF 1.7 page 17 - parser = PSBaseParser(BytesIO(PAGE17)) - tokens = list(parser) - assert tokens == [ - (5, LIT("A;Name_With-Various***Characters?")), - (44, LIT("lime Green")), - (62, LIT("paired()parentheses")), - ] + list_parsers( + PAGE17, + [ + (5, LIT("A;Name_With-Various***Characters?")), + (44, LIT("lime Green")), + (62, LIT("paired()parentheses")), + ], + ) def test_new_parser_strings(): - parser = PSBaseParser( - BytesIO( - rb"( Strings may contain balanced parentheses ( ) and " - rb"special characters ( * ! & } ^ % and so on ) . )" - ) + list_parsers( + rb"( Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . )", + [ + ( + 0, + rb" Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . ", + ) + ], ) - assert list(parser) == [ - ( - 0, - rb" Strings may contain balanced parentheses ( ) and " - rb"special characters ( * ! & } ^ % and so on ) . ", - ) - ] - parser = PSBaseParser(BytesIO(b"()")) - assert list(parser) == [(0, b"")] - parser = PSBaseParser( - BytesIO( - rb"""( These \ + list_parsers(b"()", [(0, b"")]) + list_parsers( + rb"""( These \ two strings \ are the same . ) - """ - ) + """, + [(0, b" These two strings are the same . ")], + ) + list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")]) + list_parsers(b"(foo\r)", [(0, b"foo\n")]) + list_parsers(b"(foo\r\nbaz)", [(0, b"foo\nbaz")]) + list_parsers(b"(foo\n)", [(0, b"foo\n")]) + list_parsers( + rb"( This string contains \245two octal characters\307 . )", + [(0, b" This string contains \245two octal characters\307 . ")], ) - assert list(parser) == [(0, b" These two strings are the same . ")] - parser = PSBaseParser(BytesIO(b"(foo\rbar)")) - assert list(parser) == [(0, b"foo\nbar")] - parser = PSBaseParser(BytesIO(b"(foo\r)")) - assert list(parser) == [(0, b"foo\n")] - parser = PSBaseParser(BytesIO(b"(foo\r\nbaz)")) - assert list(parser) == [(0, b"foo\nbaz")] - parser = PSBaseParser(BytesIO(b"(foo\n)")) - assert list(parser) == [(0, b"foo\n")] - parser = PSBaseParser( - BytesIO(rb"( This string contains \245two octal characters\307 . )") + list_parsers(rb"(\0053 \053 \53)", [(0, b"\0053 \053 +")]) + list_parsers( + rb"< 4E6F762073686D6F7A206B6120706F702E >", [(0, b"Nov shmoz ka pop.")] ) - assert list(parser) == [ - (0, b" This string contains \245two octal characters\307 . 
") - ] - parser = PSBaseParser(BytesIO(rb"(\0053 \053 \53)")) - assert list(parser) == [(0, b"\0053 \053 +")] - parser = PSBaseParser(BytesIO(rb"< 4E6F762073686D6F7A206B6120706F702E >")) - assert list(parser) == [(0, b"Nov shmoz ka pop.")] - parser = PSBaseParser(BytesIO(rb"<73 686 D6F7A2>")) - assert list(parser) == [(0, b"shmoz ")] - parser = PSBaseParser(BytesIO(rb"(\400)")) - assert list(parser) == [(0, b"")] + list_parsers(rb"<73 686 D6F7A2>", [(0, b"shmoz ")]) + list_parsers(rb"(\400)", [(0, b"")]) def test_invalid_strings_eof(): - parser = PSBaseParser(BytesIO(rb"(\00")) - assert list(parser) == [] - parser = PSBaseParser(BytesIO(rb"(abracadab")) - assert list(parser) == [] - parser = PSBaseParser(BytesIO(rb"<73686")) - assert list(parser) == [] + list_parsers(rb"(\00", []) + list_parsers(rb"(abracadab", []) + + +def inline_parsers( + data: bytes, expected: tuple, target=b"EI", nexttoken=None, blocksize=16 +): + bp = PSInMemoryParser(data) + assert bp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert bp.nexttoken() == nexttoken + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + assert fp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert fp.nexttoken() == nexttoken def test_get_inline_data(): - fp = BytesIO(b"""0123456789EI""") - assert get_inline_data(fp) == (10, b"0123456789EI") - fp = BytesIO(b"""0123456789EIEIO""") - assert get_inline_data(fp) == (10, b"0123456789EI") - assert fp.read(3) == b"EIO" - fp = BytesIO(b"""012EIEIO""") - assert get_inline_data(fp, blocksize=4) == (3, b"012EI") - assert fp.read(3) == b"EIO" - fp = BytesIO(b"""0123012EIEIO""") - assert get_inline_data(fp, blocksize=4) == (7, b"0123012EI") - assert fp.read(3) == b"EIO" + kwd_eio = KWD(b"EIO") + kwd_omg = KWD(b"OMG") + inline_parsers(b"""0123456789""", (-1, b"")) + inline_parsers(b"""0123456789EI""", (10, b"0123456789EI")) + inline_parsers( + b"""0123456789EIEIO""", (10, b"0123456789EI"), nexttoken=(12, kwd_eio) + ) + inline_parsers(b"""012EIEIO""", (3, b"012EI"), nexttoken=(5, kwd_eio), blocksize=4) + inline_parsers( + b"""0123012EIEIO""", (7, b"0123012EI"), nexttoken=(9, kwd_eio), blocksize=4 + ) for blocksize in range(1, 8): - fp = BytesIO(b"""012EIEIOOMG""") - assert get_inline_data(fp, blocksize=blocksize, target=b"EIEIO") == ( - 3, - b"012EIEIO", + inline_parsers( + b"""012EIEIOOMG""", + ( + 3, + b"012EIEIO", + ), + target=b"EIEIO", + nexttoken=(8, kwd_omg), + blocksize=blocksize, ) - assert fp.read(3) == b"OMG"