Skip to content

Commit

Permalink
fix: require a newline before EI to fix various inline images (#25)
Browse files Browse the repository at this point in the history
It might be good to check what other PDF readers do, as the spec
is very unclear on this, and unfortunately some PDF creators do
produce inline images with "EI" in the data and no filter.
  • Loading branch information
dhdaines authored Nov 28, 2024
1 parent 87a7307 commit fc7eea9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 12 deletions.
28 changes: 17 additions & 11 deletions playa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def nextline(self) -> Tuple[int, bytes]:
return (linepos, self.data[linepos : self.pos])

def get_inline_data(
self, target: bytes = b"EI", blocksize: int = -1
self, target: bytes = b"\nEI", blocksize: int = -1
) -> Tuple[int, bytes]:
"""Get the data for an inline image up to the target
end-of-stream marker.
Expand Down Expand Up @@ -349,6 +349,7 @@ def __next__(self) -> StackEntry:
if self.stack and top is None:
return self.stack.pop()
(pos, token) = self.nexttoken()
log.debug("token at %d: %r", pos, token)
if token is KEYWORD_ARRAY_BEGIN:
if top is None:
top = pos
Expand Down Expand Up @@ -429,7 +430,7 @@ def __next__(self) -> StackEntry:
dic = {
literal_name(k): v for (k, v) in choplist(2, objs) if v is not None
}
eos = b"EI"
eos = b"\nEI"
filter = dic.get("F")
if filter is not None:
if not isinstance(filter, list):
Expand All @@ -441,16 +442,19 @@ def __next__(self) -> StackEntry:
# operator shall be followed by a single white-space
# character, and the next character shall be
# interpreted as the first byte of image data.
if eos == b"EI":
if eos == b"\nEI":
self.seek(idpos + len(KEYWORD_ID.name) + 1)
(eipos, data) = self.get_inline_data(target=eos)
# FIXME: it is totally unspecified what to do with
# a newline between the end of the data and "EI",
# since there is no explicit stream length. (PDF
# 1.7 p. 756: There should be an end-of-line
# marker after the data and before endstream; this
# marker shall not be included in the stream
# length.) We will include it, which might be wrong.
log.debug("data at %d: %r", eipos, data)
# It is totally unspecified what to do with a
# newline between the end of the data and "EI",
# since there is no explicit stream length. In
# practice there is always a newline before "EI",
# and we require one because "EI" may occur in the
# stream data even when it is not ASCII85, so we
# will not include that newline in the data. (PDF 1.7
# p. 756: There *should* be an end-of-line marker
# after the data and before endstream; this marker
# shall not be included in the stream length.)
data = data[: -len(eos)]
else:
# Note absence of + 1 here (the "Unless" above)
Expand All @@ -468,7 +472,9 @@ def __next__(self) -> StackEntry:
log.debug("InlineImage @ %d: %r", pos, obj)
# Inline images must occur at the top level, otherwise
# something is wrong (probably a corrupt file)
assert pos == top, f"Inline image {obj} not at top level of stream"
assert (
pos == top
), f"Inline image {obj} not at top level of stream ({pos} != {top}, {self.stack})"
top = None
return pos, obj
else:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_object_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def test_inline_images():
pos, img = next(parser)
assert isinstance(img, InlineImage)
assert img.attrs["Foo"] == b"bar"
assert img.rawdata == b"VARIOUS UTTER NONSENSE\n"
assert img.rawdata == b"VARIOUS UTTER NONSENSE"
pos, img = next(parser)
assert isinstance(img, InlineImage)
assert img.buffer == b"VARIOUS UTTER NONSENSE"

0 comments on commit fc7eea9

Please sign in to comment.