Skip to content

Commit

Permalink
Simpler, character-state-machine based "parser" (#4)
Browse files Browse the repository at this point in the history
* feat: rewrite the parser to not do its own buffering

* test: fix tests

* fix: miscellaneous

* fix: go back to using fp/BinaryIO

* test: fix tests

* fix(test): BufferedReader not necessary

* fix: read_header broke PDFStreamParser

* fix: address mypy issues

* chore: format

* ci: add basic ci

* fix: clean up and correct get_inline_data

* fix: better benchmark

* fix: better better benchmark (we are slower still)

* feat: comparisons in benchmark

* feat: distinguish BinaryIO and BytesIO in benchmark
  • Loading branch information
dhdaines authored Sep 18, 2024
1 parent 4da932a commit 1394c50
Show file tree
Hide file tree
Showing 16 changed files with 1,057 additions and 419 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
name: Run all tests
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Hatch
uses: pypa/hatch@install
- name: Run tests
run: hatch test
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ Pipfile.lock
.vscode/
poetry.lock
.eggs
*~
1 change: 0 additions & 1 deletion playa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from typing import Iterator

from playa.pdfdocument import PDFDocument
from playa.pdfparser import PDFParser

__version__ = "0.0.1"

Expand Down
4 changes: 1 addition & 3 deletions playa/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def _parse(self) -> List[Tuple[int, Any]]:

return items

values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy

@property # type: ignore[no-redef,misc]
@property
def values(self) -> List[Tuple[int, Any]]:
values = self._parse()

Expand Down
31 changes: 28 additions & 3 deletions playa/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
PDFSyntaxError,
PDFTypeError,
)
from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser, read_header
from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
from playa.pdftypes import (
DecipherCallable,
PDFStream,
Expand Down Expand Up @@ -629,6 +629,27 @@ def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
}


def read_header(fp: BinaryIO) -> str:
    """Return the version string announced by the PDF header in `fp`.

    Reads 8 bytes from the current position of `fp`, which are
    expected to look like ``%PDF-X.Y``, and returns the ``X.Y`` part.
    This is only the initial version: the document catalog may
    override it.

    Raises PDFSyntaxError if the bytes cannot be read, do not start
    with the ``%PDF-`` marker, are not ASCII, or do not begin with a
    digit, a dot and a digit.
    """
    try:
        header = fp.read(8)
    except IOError as err:
        raise PDFSyntaxError("Failed to read PDF header") from err
    # The first five bytes are the magic marker, the rest is the version.
    magic, raw_version = header[:5], header[5:]
    if magic != b"%PDF-":
        raise PDFSyntaxError("Expected b'%%PDF-', got %r, is this a PDF?" % header)
    try:
        version = raw_version.decode("ascii")
    except UnicodeDecodeError as err:
        raise PDFSyntaxError(
            "Version number in %r contains non-ASCII characters" % header
        ) from err
    if re.match(r"\d\.\d", version) is None:
        raise PDFSyntaxError("Version number in %r is invalid" % header)
    return version


class PDFDocument:
"""Representation of a PDF document on disk.
Expand Down Expand Up @@ -670,6 +691,7 @@ def __init__(
self.decipher: Optional[DecipherCallable] = None
self._cached_objs: Dict[int, Tuple[object, int]] = {}
self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
self.pdf_version = read_header(fp)
self.parser = PDFParser(fp)
self.parser.set_document(self) # FIXME: annoying circular reference
self.is_printable = self.is_modifiable = self.is_extractable = True
Expand Down Expand Up @@ -818,6 +840,7 @@ def getobj(self, objid: int) -> object:
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
obj = None
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
Expand All @@ -837,7 +860,7 @@ def getobj(self, objid: int) -> object:
break
except (PSEOF, PDFSyntaxError):
continue
else:
if obj is None:
raise PDFObjectNotFound(objid)
log.debug("register: objid=%r: %r", objid, obj)
self._cached_objs[objid] = (obj, genno)
Expand Down Expand Up @@ -871,7 +894,9 @@ def get_page_labels(self) -> Iterator[str]:
If the document includes page labels, generates strings, one per page.
If not, raises PDFNoPageLabels.
The resulting iteration is unbounded.
The resulting iterator is unbounded, so it is recommended to
zip it with the iterator over actual pages returned by `get_pages`.
"""
assert self.catalog is not None

Expand Down
149 changes: 90 additions & 59 deletions playa/pdfinterp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
import logging
import re
from io import BytesIO
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
from typing import BinaryIO, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast

from playa import settings
from playa.casting import safe_float
Expand Down Expand Up @@ -247,6 +247,69 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
return font


KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")


def get_inline_data(
    fp: BinaryIO, target: bytes = b"EI", blocksize: int = 4096
) -> Tuple[int, bytes]:
    """Get the data for an inline image up to the target
    end-of-stream marker.

    Reads `fp` in blocks of `blocksize` bytes, starting at its current
    position, until `target` is seen.  `fp` must be seekable, since
    any bytes read past the target are put back with a relative seek.

    Returns a tuple of the position of the target in the data and the
    data *including* the end of stream marker.  Advances the file
    pointer to a position after the end of the stream.

    If the end of the file is reached without finding the target,
    returns ``(-1, b"")`` (mirroring `bytes.find`) and leaves the file
    pointer at the end of the file, instead of looping forever.

    The caller is responsible for removing the end-of-stream if
    necessary (this depends on the filter being used) and parsing
    the end-of-stream token (likewise) if necessary.
    """
    # PDF 1.7, p. 216: The bytes between the ID and EI operators
    # shall be treated the same as a stream object's data (see
    # 7.3.8, "Stream Objects"), even though they do not follow the
    # standard stream syntax.
    data = []  # list of blocks already accepted as image data
    tail = b""  # last len(target) - 1 bytes of the accepted data
    pos = 0  # total number of bytes accepted so far
    keep = len(target) - 1
    while True:
        buf = fp.read(blocksize)
        # Search the new block together with the end of the previous
        # one, so that a target straddling a block boundary is found
        # regardless of how its prefix overlaps with itself.
        window = tail + buf
        tpos = window.find(target)
        if tpos != -1:
            # Number of bytes of `buf` up to and including the target.
            # The tail is shorter than the target, so this always
            # reaches into `buf` (for a non-empty target).
            used = tpos + len(target) - len(tail)
            data.append(buf[:used])
            if used != len(buf):
                # Put the extra back (assume buffering!)
                fp.seek(used - len(buf), io.SEEK_CUR)
            return (pos - len(tail) + tpos, b"".join(data))
        if not buf:
            # End of file: the target is not there, stop reading.
            return (-1, b"")
        data.append(buf)
        pos += len(buf)
        tail = window[-keep:] if keep > 0 else b""


class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
def __init__(self, streams: Sequence[object]) -> None:
self.streams = streams
Expand All @@ -267,65 +330,16 @@ def fillfp(self) -> None:

def seek(self, pos: int) -> None:
self.fillfp()
PSStackParser.seek(self, pos)

def fillbuf(self) -> None:
if self.charpos < len(self.buf):
return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf:
break
self.fp = None # type: ignore[assignment]
self.charpos = 0

def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
self.seek(pos)
i = 0
data = b""
while i <= len(target):
self.fillbuf()
if i:
ci = self.buf[self.charpos]
c = bytes((ci,))
data += c
self.charpos += 1
if (
len(target) <= i
and c.isspace()
or i < len(target)
and c == (bytes((target[i],)))
):
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
data += self.buf[self.charpos : j + 1]
self.charpos = j + 1
i = 1
except ValueError:
data += self.buf[self.charpos :]
self.charpos = len(self.buf)
data = data[: -(len(target) + 1)] # strip the last part
data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
return (pos, data)
super().seek(pos)

def flush(self) -> None:
self.add_results(*self.popall())

KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")

def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BI:
if token is KEYWORD_BI:
# inline image within a content stream
self.start_type(pos, "inline")
elif token is self.KEYWORD_ID:
elif token is KEYWORD_ID:
try:
(_, objs) = self.end_type("inline")
if len(objs) % 2 != 0:
Expand All @@ -339,13 +353,30 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
filter = [filter]
if filter[0] in LITERALS_ASCII85_DECODE:
eos = b"~>"
(pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
if eos != b"EI": # it may be necessary for decoding
data += eos
# PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode
# or ASCII85Decode as one of its filters, the ID
# operator shall be followed by a single white-space
# character, and the next character shall be
# interpreted as the first byte of image data.
if eos == b"EI":
self.seek(pos + len(token.name) + 1)
(pos, data) = get_inline_data(self.fp, target=eos)
# FIXME: it is totally unspecified what to do with
# a newline between the end of the data and "EI",
# since there is no explicit stream length. (PDF
# 1.7 p. 756: There should be an end-of-line
# marker after the data and before endstream; this
# marker shall not be included in the stream
# length.)
data = data[: -len(eos)]
else:
self.seek(pos + len(token.name))
(pos, data) = get_inline_data(self.fp, target=eos)
obj = PDFStream(d, data)
self.push((pos, obj))
if eos == b"EI": # otherwise it is still in the stream
self.push((pos, self.KEYWORD_EI))
# This was included in the data but we need to "parse" it
if eos == b"EI":
self.push((pos, KEYWORD_EI))
except PSTypeError:
if settings.STRICT:
raise
Expand Down
14 changes: 7 additions & 7 deletions playa/pdfpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple

from playa import settings
from playa.exceptions import PDFObjectNotFound, PDFValueError
from playa.pdfdocument import (
PDFDocument,
from playa.exceptions import (
PDFNoPageLabels,
PDFObjectNotFound,
PDFTextExtractionNotAllowed,
PDFValueError,
)
from playa.pdfdocument import (
PDFDocument,
)
from playa.pdfparser import PDFParser
from playa.pdftypes import dict_value, int_value, list_value, resolve1
from playa.psparser import LIT
from playa.utils import parse_rect
Expand Down Expand Up @@ -173,10 +175,8 @@ def get_pages(
caching: bool = True,
check_extractable: bool = False,
) -> Iterator["PDFPage"]:
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, password=password, caching=caching)
doc = PDFDocument(fp, password=password)
# Check if the document allows text extraction.
# If not, warn the user and proceed.
if not doc.is_extractable:
Expand Down
25 changes: 1 addition & 24 deletions playa/pdfparser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import re
from io import BytesIO
from typing import TYPE_CHECKING, BinaryIO, Optional, Union

Expand All @@ -24,27 +23,6 @@
KEYWORD_OBJ = KWD(b"obj")


def read_header(fp: BinaryIO) -> str:
    """Return the version string announced by the PDF header in `fp`.

    Reads 8 bytes from the current position of `fp`, which are
    expected to look like ``%PDF-X.Y``, and returns the ``X.Y`` part.
    This is only the initial version: the document catalog may
    override it.

    Raises PDFSyntaxError if the bytes cannot be read, do not start
    with the ``%PDF-`` marker, are not ASCII, or do not begin with a
    digit, a dot and a digit.
    """
    try:
        header = fp.read(8)
    except IOError as err:
        raise PDFSyntaxError("Failed to read PDF header") from err
    # The first five bytes are the magic marker, the rest is the version.
    magic, raw_version = header[:5], header[5:]
    if magic != b"%PDF-":
        raise PDFSyntaxError("Expected b'%%PDF-', got %r, is this a PDF?" % header)
    try:
        version = raw_version.decode("ascii")
    except UnicodeDecodeError as err:
        raise PDFSyntaxError(
            "Version number in %r contains non-ASCII characters" % header
        ) from err
    if re.match(r"\d\.\d", version) is None:
        raise PDFSyntaxError("Version number in %r is invalid" % header)
    return version


# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
"""PDFParser fetch PDF objects from a file stream.
Expand All @@ -65,7 +43,6 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
def __init__(self, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
self.doc: Optional[PDFDocument] = None
self.pdf_version = read_header(fp)
self.fallback = False

def set_document(self, doc: Union["PDFDocument", None]) -> None:
Expand Down Expand Up @@ -115,7 +92,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
self.fp.seek(pos)
data = bytearray(self.fp.read(objlen))
self.seek(pos + objlen)
while 1:
while True:
try:
(linepos, line) = self.nextline()
except PSEOF:
Expand Down
Loading

0 comments on commit 1394c50

Please sign in to comment.