Skip to content

Commit

Permalink
Add various things desired by pdfplumber (#9)
Browse files Browse the repository at this point in the history
* feat: track MCIDs and tags in layout analysis

* fix: wrong type

* fix: expose ncs/scs everywhere

* fix: fix types

* fix: handle parsed_pages for pdfplumber

* feat: improved NumberTree

* feat: do page labels without flattening the entire tree

* docs: update PDF 1.7 section reference
  • Loading branch information
dhdaines authored Oct 1, 2024
1 parent 336c69a commit a0783c7
Show file tree
Hide file tree
Showing 10 changed files with 228 additions and 110 deletions.
51 changes: 41 additions & 10 deletions playa/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from playa.layout import (
LAParams,
LTComponent,
LTChar,
LTCurve,
LTFigure,
Expand All @@ -21,7 +22,8 @@
from playa.pdfcolor import PDFColorSpace
from playa.pdfdevice import PDFTextDevice
from playa.pdffont import PDFFont, PDFUnicodeNotDefined
from playa.pdfinterp import PDFGraphicState, PDFResourceManager
from playa.pdfinterp import PDFGraphicState, PDFResourceManager, PDFStackT
from playa.psparser import PSLiteral
from playa.pdfpage import PDFPage
from playa.pdftypes import PDFStream
from playa.utils import (
Expand All @@ -30,6 +32,7 @@
Point,
Rect,
apply_matrix_pt,
decode_text,
mult_matrix,
)

Expand All @@ -39,6 +42,8 @@
class PDFLayoutAnalyzer(PDFTextDevice):
cur_item: LTLayoutContainer
ctm: Matrix
cur_mcid: Optional[int] = None
cur_tag: Optional[str] = None

def __init__(
self,
Expand Down Expand Up @@ -76,14 +81,32 @@ def end_figure(self, _: str) -> None:
self.cur_item = self._stack.pop()
self.cur_item.add(fig)

def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
    """Record the tag that opens a marked content section.

    The decoded tag name is stored in ``cur_tag``.  ``cur_mcid`` is taken
    from the ``"MCID"`` entry of ``props`` when ``props`` is a dict that
    contains one; in every other case it is reset to ``None``.
    """
    self.cur_tag = decode_text(tag.name)
    carries_mcid = isinstance(props, dict) and "MCID" in props
    self.cur_mcid = props["MCID"] if carries_mcid else None

def end_tag(self) -> None:
    """Handle end of tag, clearing the current tag and MCID."""
    self.cur_tag = None
    self.cur_mcid = None

def add_item(self, item: LTComponent) -> None:
    """Stamp ``item`` with the current MCID and tag, then add it to ``cur_item``."""
    item.mcid, item.tag = self.cur_mcid, self.cur_tag
    self.cur_item.add(item)

def render_image(self, name: str, stream: PDFStream) -> None:
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
item = LTImage(
name,
stream,
(self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
)
self.cur_item.add(item)
self.add_item(item)

def paint_path(
self,
Expand All @@ -92,6 +115,8 @@ def paint_path(
fill: bool,
evenodd: bool,
path: Sequence[PathSegment],
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
"""Paint paths described in section 4.4 of the PDF reference manual"""
shape = "".join(x[0] for x in path)
Expand All @@ -109,7 +134,7 @@ def paint_path(
# recurse if there are multiple m's in this shape
for m in re.finditer(r"m[^m]+", shape):
subpath = path[m.start(0) : m.end(0)]
self.paint_path(gstate, stroke, fill, evenodd, subpath)
self.paint_path(gstate, stroke, fill, evenodd, subpath, ncs, scs)

else:
# Although the 'h' command does not literally provide a
Expand Down Expand Up @@ -153,8 +178,9 @@ def paint_path(
gstate.ncolor,
original_path=transformed_path,
dashing_style=gstate.dash,
ncs=ncs, scs=scs
)
self.cur_item.add(line)
self.add_item(line)

elif shape in {"mlllh", "mllll"}:
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
Expand All @@ -174,8 +200,9 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
)
self.cur_item.add(rect)
self.add_item(rect)
else:
curve = LTCurve(
gstate.linewidth,
Expand All @@ -187,8 +214,9 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
)
self.cur_item.add(curve)
self.add_item(curve)
else:
curve = LTCurve(
gstate.linewidth,
Expand All @@ -200,8 +228,9 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
)
self.cur_item.add(curve)
self.add_item(curve)

def render_char(
self,
Expand All @@ -211,8 +240,9 @@ def render_char(
scaling: float,
rise: float,
cid: int,
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> float:
try:
text = font.to_unichr(cid)
Expand All @@ -230,10 +260,11 @@ def render_char(
text,
textwidth,
textdisp,
ncs,
graphicstate,
ncs,
scs,
)
self.cur_item.add(item)
self.add_item(item)
return item.adv

def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
Expand Down
50 changes: 27 additions & 23 deletions playa/data_structures.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,50 @@
from typing import Any, Iterable, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Tuple

from playa import settings
from playa.pdfparser import PDFSyntaxError
from playa.pdftypes import dict_value, int_value, list_value
from playa.utils import choplist


def walk_number_tree(tree: Dict[str, Any]) -> Iterator[Tuple[int, Any]]:
    """Lazily yield ``(number, value)`` pairs from a number tree.

    The tree is walked iteratively with an explicit stack instead of
    recursion.  For each node, the pairs in its ``"Nums"`` array (if any)
    are yielded first, then its ``"Kids"`` (if any) are scheduled so that
    they are visited in their listed order.
    """
    pending = [tree]
    while pending:
        node = dict_value(pending.pop())
        if "Nums" in node:
            # "Nums" is a flat [key, value, key, value, ...] array.
            pairs = choplist(2, list_value(node["Nums"]))
            yield from ((int_value(key), value) for key, value in pairs)
        if "Kids" in node:
            # Push in reverse so the first kid is popped (visited) first.
            for kid in reversed(list_value(node["Kids"])):
                pending.append(kid)


class NumberTree:
"""A PDF number tree.
See Section 3.8.6 of the PDF Reference.
See Section 7.9.7 of the PDF 1.7 Reference.
"""

def __init__(self, obj: Any):
self._obj = dict_value(obj)
self.nums: Optional[Iterable[Any]] = None
self.kids: Optional[Iterable[Any]] = None
self.limits: Optional[Iterable[Any]] = None

if "Nums" in self._obj:
self.nums = list_value(self._obj["Nums"])
if "Kids" in self._obj:
self.kids = list_value(self._obj["Kids"])
if "Limits" in self._obj:
self.limits = list_value(self._obj["Limits"])
def __iter__(self) -> Iterator[Tuple[int, Any]]:
    """Iterate over the tree's ``(number, value)`` pairs via ``walk_number_tree``."""
    entries = walk_number_tree(self._obj)
    return entries

def _parse(self) -> List[Tuple[int, Any]]:
items = []
if self.nums: # Leaf node
for k, v in choplist(2, self.nums):
items.append((int_value(k), v))
def __contains__(self, num) -> bool:
for idx, val in self:
if idx == num:
return True
return False

if self.kids: # Root or intermediate node
for child_ref in self.kids:
items += NumberTree(child_ref)._parse()

return items
def __getitem__(self, num) -> Any:
for idx, val in self:
if idx == num:
return val
raise IndexError(f"Number {num} not in tree")

@property
def values(self) -> List[Tuple[int, Any]]:
values = self._parse()

values = list(self)
# NOTE: They are supposed to be sorted! (but, I suppose, often aren't)
if settings.STRICT:
if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
raise PDFSyntaxError("Number tree elements are out of order")
Expand Down
19 changes: 18 additions & 1 deletion playa/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def __repr__(self) -> str:
class LTItem:
"""Interface for things that can be analyzed"""

# Any item could be in a marked content section
mcid: Optional[int] = None
# Which could have a tag
tag: Optional[str] = None

def analyze(self, laparams: LAParams) -> None:
"""Perform the layout analysis."""

Expand Down Expand Up @@ -234,9 +239,13 @@ def __init__(
non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
LTComponent.__init__(self, get_bound(pts))
self.pts = pts
self.ncs = ncs
self.scs = scs
self.linewidth = linewidth
self.stroke = stroke
self.fill = fill
Expand Down Expand Up @@ -268,6 +277,8 @@ def __init__(
non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
LTCurve.__init__(
self,
Expand All @@ -280,6 +291,7 @@ def __init__(
non_stroking_color,
original_path,
dashing_style,
ncs, scs,
)


Expand All @@ -300,6 +312,8 @@ def __init__(
non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
(x0, y0, x1, y1) = bbox
LTCurve.__init__(
Expand All @@ -313,6 +327,7 @@ def __init__(
non_stroking_color,
original_path,
dashing_style,
ncs, scs,
)


Expand Down Expand Up @@ -365,14 +380,16 @@ def __init__(
text: str,
textwidth: float,
textdisp: Union[float, Tuple[Optional[float], float]],
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
LTText.__init__(self)
self._text = text
self.matrix = matrix
self.fontname = font.fontname
self.ncs = ncs
self.scs = scs
self.graphicstate = graphicstate
self.adv = textwidth * fontsize * scaling
# compute the boundary rectangle.
Expand Down
Loading

0 comments on commit a0783c7

Please sign in to comment.