Implement in-memory parsing (#6)
* refactor: put get_inline_data where it can be overridden

* fix: handle case where eos not found

* feat: in-memory lexer which is Very Fast

* test: fully test both parsers

* fix: make it work again

* fix: eliminate inheritance

* feat: accept bytes directly

* chore: format

* fix: simplify and fix PDFContentParser

* feat: implement switch between file/memory parsers
dhdaines authored Sep 19, 2024
1 parent 1394c50 commit 37f3b2e
Showing 8 changed files with 569 additions and 211 deletions.
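For orientation, here is a minimal sketch of the in-memory pattern this commit adopts: the parser owns its input as a bytes buffer plus an integer offset, so seeking and reading become slicing and finding the inline-image end marker becomes a bytes.find(). The method names (reinit, seek, read, get_inline_data) come from the diffs below; the class name and implementation here are illustrative assumptions, not the library's actual code.

```python
from typing import Tuple


class InMemoryParserSketch:
    """Illustrative only: a lexer that owns its input as bytes."""

    def __init__(self, data: bytes) -> None:
        self.reinit(data)

    def reinit(self, data: bytes) -> None:
        # Point the lexer at a new buffer (used when chaining streams).
        self.data = data
        self.pos = 0

    def seek(self, pos: int) -> None:
        self.pos = pos

    def read(self, pos: int, objlen: int) -> bytes:
        # Read objlen bytes from an absolute offset; just a slice.
        self.pos = pos + objlen
        return self.data[pos : self.pos]

    def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]:
        # Find the end-of-stream marker; return (-1, b"") when it is
        # missing, matching the "eos not found" fix in this commit.
        tpos = self.data.find(target, self.pos)
        if tpos == -1:
            return -1, b""
        end = tpos + len(target)
        data, self.pos = self.data[self.pos : end], end
        return tpos, data


parser = InMemoryParserSketch(b"BI /W 4 /H 4 ID \x00\x01\x02\x03 EI Q")
print(parser.get_inline_data(b"EI"))
```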
5 changes: 2 additions & 3 deletions playa/cmapdb.py
@@ -18,7 +18,6 @@
import sys
from typing import (
Any,
BinaryIO,
Dict,
Iterable,
Iterator,
@@ -277,8 +276,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:


class CMapParser(PSStackParser[PSKeyword]):
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
def __init__(self, cmap: CMapBase, data: bytes) -> None:
super().__init__(data)
self.cmap = cmap
# some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True
10 changes: 5 additions & 5 deletions playa/pdffont.py
@@ -114,8 +114,8 @@ class Type1FontHeaderParser(PSStackParser[int]):
KEYWORD_READONLY = KWD(b"readonly")
KEYWORD_FOR = KWD(b"for")

def __init__(self, data: BinaryIO) -> None:
PSStackParser.__init__(self, data)
def __init__(self, data: bytes) -> None:
super().__init__(data)
self._cid2unicode: Dict[int, str] = {}

def get_encoding(self) -> Dict[int, str]:
@@ -968,7 +968,7 @@ def __init__(
if "ToUnicode" in spec:
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
CMapParser(self.unicode_map, strm.get_data()).run()
PDFFont.__init__(self, descriptor, widths)

def to_unichr(self, cid: int) -> str:
@@ -1008,7 +1008,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
self.fontfile = stream_value(descriptor.get("FontFile"))
length1 = int_value(self.fontfile["Length1"])
data = self.fontfile.get_data()[:length1]
parser = Type1FontHeaderParser(BytesIO(data))
parser = Type1FontHeaderParser(data)
self.cid2unicode = parser.get_encoding()

def __repr__(self) -> str:
@@ -1079,7 +1079,7 @@ def __init__(
if isinstance(spec["ToUnicode"], PDFStream):
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
CMapParser(self.unicode_map, strm.get_data()).run()
else:
cmap_name = literal_name(spec["ToUnicode"])
encoding = literal_name(spec["Encoding"])
122 changes: 41 additions & 81 deletions playa/pdfinterp.py
@@ -1,12 +1,16 @@
import io
import logging
from io import BytesIO
from typing import BinaryIO, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast

from playa import settings
from playa.casting import safe_float
from playa.cmapdb import CMap, CMapBase, CMapDB
from playa.exceptions import PSEOF, PDFException, PDFValueError, PSTypeError
from playa.exceptions import (
PSEOF,
PDFException,
PDFSyntaxError,
PDFValueError,
PSTypeError,
)
from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
from playa.pdfdevice import PDFDevice, PDFTextSeq
from playa.pdffont import (
@@ -30,6 +34,7 @@
from playa.psparser import (
KWD,
LIT,
PSBaseParserToken,
PSKeyword,
PSLiteral,
PSStackParser,
@@ -252,85 +257,38 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
KEYWORD_EI = KWD(b"EI")


def get_inline_data(
fp: BinaryIO, target: bytes = b"EI", blocksize: int = 4096
) -> Tuple[int, bytes]:
"""Get the data for an inline image up to the target
end-of-stream marker.
Returns a tuple of the position of the target in the data and the
data *including* the end of stream marker. Advances the file
pointer to a position after the end of the stream.
The caller is responsible for removing the end-of-stream if
necessary (this depends on the filter being used) and parsing
the end-of-stream token (likewise) if necessary.
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
"""Parse the concatenation of multiple content streams, as
described in the spec (PDF 1.7, p.86):
...the effect shall be as if all of the streams in the array were
concatenated, in order, to form a single stream. Conforming
writers can create image objects and other resources as they
occur, even though they interrupt the content stream. The division
between streams may occur only at the boundaries between lexical
tokens (see 7.2, "Lexical Conventions") but shall be unrelated to
the page’s logical content or organization.
"""
# PDF 1.7, p. 216: The bytes between the ID and EI operators
# shall be treated the same as a stream object’s data (see
# 7.3.8, "Stream Objects"), even though they do not follow the
# standard stream syntax.
data = [] # list of blocks
partial = b"" # partially seen target
pos = 0
while True:
# Did we see part of the target at the end of the last
# block? Then scan ahead and try to find the rest (we
# assume the stream is buffered)
if partial:
extra_len = len(target) - len(partial)
extra = fp.read(extra_len)
if partial + extra == target:
pos -= len(partial)
data.append(extra)
break
# Put it back (assume buffering!)
fp.seek(-extra_len, io.SEEK_CUR)
partial = b""
# Fall through (the target could be at the beginning)
buf = fp.read(blocksize)
tpos = buf.find(target)
if tpos != -1:
data.append(buf[: tpos + len(target)])
# Put the extra back (assume buffering!)
fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR)
pos += tpos
break
else:
pos += len(buf)
# look for the longest partial match at the end
plen = len(target) - 1
while plen > 0:
ppos = len(buf) - plen
if buf[ppos:] == target[:plen]:
partial = buf[ppos:]
break
plen -= 1
data.append(buf)
return (pos, b"".join(data))


class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
def __init__(self, streams: Sequence[object]) -> None:
self.streams = streams
self.istream = 0
# PSStackParser.__init__(fp=None) is safe only because we've overloaded
# all the methods that would attempt to access self.fp without first
# calling self.fillfp().
PSStackParser.__init__(self, None) # type: ignore[arg-type]

def fillfp(self) -> None:
if not self.fp:
if self.istream < len(self.streams):
strm = stream_value(self.streams[self.istream])
self.istream += 1
else:
raise PSEOF("Unexpected EOF, file truncated?")
self.fp = BytesIO(strm.get_data())
self.streamiter = iter(streams)
try:
stream = stream_value(next(self.streamiter))
except StopIteration:
raise PSEOF
log.debug("PDFContentParser starting stream %r", stream)
super().__init__(stream.get_data())

def seek(self, pos: int) -> None:
self.fillfp()
super().seek(pos)
def __next__(self) -> Tuple[int, PSBaseParserToken]:
while True:
try:
return super().__next__()
except StopIteration:
# Will also raise StopIteration if there are no more,
# which is exactly what we want
stream = stream_value(next(self.streamiter))
log.debug("PDFContentParser starting stream %r", stream)
self.reinit(stream.get_data())

def flush(self) -> None:
self.add_results(*self.popall())
@@ -360,7 +318,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
# interpreted as the first byte of image data.
if eos == b"EI":
self.seek(pos + len(token.name) + 1)
(pos, data) = get_inline_data(self.fp, target=eos)
(pos, data) = self.get_inline_data(target=eos)
# FIXME: it is totally unspecified what to do with
# a newline between the end of the data and "EI",
# since there is no explicit stream length. (PDF
@@ -371,7 +329,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
data = data[: -len(eos)]
else:
self.seek(pos + len(token.name))
(pos, data) = get_inline_data(self.fp, target=eos)
(pos, data) = self.get_inline_data(target=eos)
if pos == -1:
raise PDFSyntaxError("End of inline stream %r not found" % eos)
obj = PDFStream(d, data)
self.push((pos, obj))
# This was included in the data but we need to "parse" it
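The rewritten PDFContentParser above chains content streams lazily: when the current buffer runs out of tokens, __next__ fetches the next stream's decoded bytes and re-initializes the lexer, and the final StopIteration simply ends iteration for the caller. A rough, self-contained sketch of that control flow, assuming a trivial whitespace tokenizer in place of the real lexer (ChainedTokens and _reinit are illustrative names, not the library's API):

```python
from typing import Iterator, Sequence


class ChainedTokens:
    """Illustration of the exhaust-then-reinit loop used to concatenate streams."""

    def __init__(self, buffers: Sequence[bytes]) -> None:
        self._buffers: Iterator[bytes] = iter(buffers)
        self._tokens: Iterator[bytes] = iter(())

    def _reinit(self, data: bytes) -> None:
        # Stand-in for re-pointing the lexer at a new byte buffer.
        self._tokens = iter(data.split())

    def __iter__(self) -> "ChainedTokens":
        return self

    def __next__(self) -> bytes:
        while True:
            try:
                return next(self._tokens)
            except StopIteration:
                # When no buffers remain, next() raises StopIteration
                # again, which ends iteration for the caller as well.
                self._reinit(next(self._buffers))


print(list(ChainedTokens([b"q 1 0 0 1 72 72 cm", b"BT /F1 12 Tf ET", b"Q"])))
```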
10 changes: 4 additions & 6 deletions playa/pdfparser.py
@@ -1,5 +1,4 @@
import logging
from io import BytesIO
from typing import TYPE_CHECKING, BinaryIO, Optional, Union

from playa import settings
@@ -40,8 +39,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
"""

def __init__(self, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
def __init__(self, data: Union[BinaryIO, bytes]) -> None:
super().__init__(data)
self.doc: Optional[PDFDocument] = None
self.fallback = False

@@ -89,8 +88,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
raise PDFSyntaxError("Unexpected EOF")
return
pos += len(line)
self.fp.seek(pos)
data = bytearray(self.fp.read(objlen))
data = bytearray(self.read(pos, objlen))
self.seek(pos + objlen)
while True:
try:
@@ -135,7 +133,7 @@ class PDFStreamParser(PDFParser):
"""

def __init__(self, data: bytes) -> None:
PDFParser.__init__(self, BytesIO(data))
super().__init__(data)

def flush(self) -> None:
self.add_results(*self.popall())
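PDFParser's constructor now takes Union[BinaryIO, bytes], which is where the commit message's "switch between file/memory parsers" surfaces; the actual dispatch lives in playa/psparser.py, whose diff is not shown on this page. A hypothetical sketch of such a type-based switch (InMemoryLexer, FileLexer, and make_lexer are invented names):

```python
from io import BytesIO
from typing import BinaryIO, Union


class InMemoryLexer:
    """Stand-in for the new bytes-based lexer."""

    def __init__(self, data: bytes) -> None:
        self.data = data


class FileLexer:
    """Stand-in for the original file-pointer-based lexer."""

    def __init__(self, fp: BinaryIO) -> None:
        self.fp = fp


def make_lexer(data: Union[BinaryIO, bytes]) -> Union[InMemoryLexer, FileLexer]:
    # Choose a lexer based on the input type, mirroring the new
    # Union[BinaryIO, bytes] signature of PDFParser.__init__.
    if isinstance(data, (bytes, bytearray)):
        return InMemoryLexer(bytes(data))
    return FileLexer(data)


print(type(make_lexer(b"%PDF-1.7")).__name__)           # InMemoryLexer
print(type(make_lexer(BytesIO(b"%PDF-1.7"))).__name__)  # FileLexer
```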