diff --git a/playa/converter.py b/playa/converter.py index bd35ddf2..b0e22948 100644 --- a/playa/converter.py +++ b/playa/converter.py @@ -9,6 +9,7 @@ from playa.layout import ( LAParams, + LTComponent, LTChar, LTCurve, LTFigure, @@ -21,7 +22,8 @@ from playa.pdfcolor import PDFColorSpace from playa.pdfdevice import PDFTextDevice from playa.pdffont import PDFFont, PDFUnicodeNotDefined -from playa.pdfinterp import PDFGraphicState, PDFResourceManager +from playa.pdfinterp import PDFGraphicState, PDFResourceManager, PDFStackT +from playa.psparser import PSLiteral from playa.pdfpage import PDFPage from playa.pdftypes import PDFStream from playa.utils import ( @@ -30,6 +32,7 @@ Point, Rect, apply_matrix_pt, + decode_text, mult_matrix, ) @@ -39,6 +42,8 @@ class PDFLayoutAnalyzer(PDFTextDevice): cur_item: LTLayoutContainer ctm: Matrix + cur_mcid: Optional[int] = None + cur_tag: Optional[str] = None def __init__( self, @@ -76,6 +81,24 @@ def end_figure(self, _: str) -> None: self.cur_item = self._stack.pop() self.cur_item.add(fig) + def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None: + """Handle beginning of tag, setting current MCID if any.""" + self.cur_tag = decode_text(tag.name) + if isinstance(props, dict) and "MCID" in props: + self.cur_mcid = props["MCID"] + else: + self.cur_mcid = None + + def end_tag(self) -> None: + """Handle end of tag, clearing current tag and MCID.""" + self.cur_tag = None + self.cur_mcid = None + + def add_item(self, item: LTComponent) -> None: + item.mcid = self.cur_mcid + item.tag = self.cur_tag + self.cur_item.add(item) + def render_image(self, name: str, stream: PDFStream) -> None: assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) item = LTImage( @@ -83,7 +106,7 @@ def render_image(self, name: str, stream: PDFStream) -> None: stream, (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1), ) - self.cur_item.add(item) + self.add_item(item) def paint_path( self, @@ -92,6 +115,8 @@ 
def paint_path( fill: bool, evenodd: bool, path: Sequence[PathSegment], + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: """Paint paths described in section 4.4 of the PDF reference manual""" shape = "".join(x[0] for x in path) @@ -109,7 +134,7 @@ def paint_path( # recurse if there are multiple m's in this shape for m in re.finditer(r"m[^m]+", shape): subpath = path[m.start(0) : m.end(0)] - self.paint_path(gstate, stroke, fill, evenodd, subpath) + self.paint_path(gstate, stroke, fill, evenodd, subpath, ncs, scs) else: # Although the 'h' command does not not literally provide a @@ -153,8 +178,9 @@ def paint_path( gstate.ncolor, original_path=transformed_path, dashing_style=gstate.dash, + ncs=ncs, scs=scs ) - self.cur_item.add(line) + self.add_item(line) elif shape in {"mlllh", "mllll"}: (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts @@ -174,8 +200,9 @@ def paint_path( gstate.ncolor, transformed_path, gstate.dash, + ncs, scs ) - self.cur_item.add(rect) + self.add_item(rect) else: curve = LTCurve( gstate.linewidth, @@ -187,8 +214,9 @@ def paint_path( gstate.ncolor, transformed_path, gstate.dash, + ncs, scs ) - self.cur_item.add(curve) + self.add_item(curve) else: curve = LTCurve( gstate.linewidth, @@ -200,8 +228,9 @@ def paint_path( gstate.ncolor, transformed_path, gstate.dash, + ncs, scs ) - self.cur_item.add(curve) + self.add_item(curve) def render_char( self, @@ -211,8 +240,9 @@ def render_char( scaling: float, rise: float, cid: int, - ncs: PDFColorSpace, graphicstate: PDFGraphicState, + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> float: try: text = font.to_unichr(cid) @@ -230,10 +260,11 @@ def render_char( text, textwidth, textdisp, - ncs, graphicstate, + ncs, + scs, ) - self.cur_item.add(item) + self.add_item(item) return item.adv def handle_undefined_char(self, font: PDFFont, cid: int) -> str: diff --git a/playa/data_structures.py b/playa/data_structures.py index fab26c84..a43705b6 
100644 --- a/playa/data_structures.py +++ b/playa/data_structures.py @@ -1,4 +1,4 @@ -from typing import Any, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Tuple from playa import settings from playa.pdfparser import PDFSyntaxError @@ -6,41 +6,45 @@ from playa.utils import choplist +def walk_number_tree(tree: Dict[str, Any]) -> Iterator[Tuple[int, Any]]: + stack = [tree] + while stack: + item = dict_value(stack.pop()) + if "Nums" in item: + for k, v in choplist(2, list_value(item["Nums"])): + yield int_value(k), v + if "Kids" in item: + stack.extend(reversed(list_value(item["Kids"]))) + + class NumberTree: """A PDF number tree. - See Section 3.8.6 of the PDF Reference. + See Section 7.9.7 of the PDF 1.7 Reference. """ def __init__(self, obj: Any): self._obj = dict_value(obj) - self.nums: Optional[Iterable[Any]] = None - self.kids: Optional[Iterable[Any]] = None - self.limits: Optional[Iterable[Any]] = None - if "Nums" in self._obj: - self.nums = list_value(self._obj["Nums"]) - if "Kids" in self._obj: - self.kids = list_value(self._obj["Kids"]) - if "Limits" in self._obj: - self.limits = list_value(self._obj["Limits"]) + def __iter__(self) -> Iterator[Tuple[int, Any]]: + return walk_number_tree(self._obj) - def _parse(self) -> List[Tuple[int, Any]]: - items = [] - if self.nums: # Leaf node - for k, v in choplist(2, self.nums): - items.append((int_value(k), v)) + def __contains__(self, num) -> bool: + for idx, val in self: + if idx == num: + return True + return False - if self.kids: # Root or intermediate node - for child_ref in self.kids: - items += NumberTree(child_ref)._parse() - - return items + def __getitem__(self, num) -> Any: + for idx, val in self: + if idx == num: + return val + raise IndexError(f"Number {num} not in tree") @property def values(self) -> List[Tuple[int, Any]]: - values = self._parse() - + values = list(self) + # NOTE: They are supposed to be sorted! 
(but, I suppose, often aren't) if settings.STRICT: if not all(a[0] <= b[0] for a, b in zip(values, values[1:])): raise PDFSyntaxError("Number tree elements are out of order") diff --git a/playa/layout.py b/playa/layout.py index 72de7e64..a3d93cf2 100644 --- a/playa/layout.py +++ b/playa/layout.py @@ -126,6 +126,11 @@ def __repr__(self) -> str: class LTItem: """Interface for things that can be analyzed""" + # Any item could be in a marked content section + mcid: Optional[int] = None + # Which could have a tag + tag: Optional[str] = None + def analyze(self, laparams: LAParams) -> None: """Perform the layout analysis.""" @@ -234,9 +239,13 @@ def __init__( non_stroking_color: Optional[Color] = None, original_path: Optional[List[PathSegment]] = None, dashing_style: Optional[Tuple[object, object]] = None, + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: LTComponent.__init__(self, get_bound(pts)) self.pts = pts + self.ncs = ncs + self.scs = scs self.linewidth = linewidth self.stroke = stroke self.fill = fill @@ -268,6 +277,8 @@ def __init__( non_stroking_color: Optional[Color] = None, original_path: Optional[List[PathSegment]] = None, dashing_style: Optional[Tuple[object, object]] = None, + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: LTCurve.__init__( self, @@ -280,6 +291,7 @@ def __init__( non_stroking_color, original_path, dashing_style, + ncs, scs, ) @@ -300,6 +312,8 @@ def __init__( non_stroking_color: Optional[Color] = None, original_path: Optional[List[PathSegment]] = None, dashing_style: Optional[Tuple[object, object]] = None, + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: (x0, y0, x1, y1) = bbox LTCurve.__init__( @@ -313,6 +327,7 @@ def __init__( non_stroking_color, original_path, dashing_style, + ncs, scs, ) @@ -365,14 +380,16 @@ def __init__( text: str, textwidth: float, textdisp: Union[float, Tuple[Optional[float], float]], - ncs: 
PDFColorSpace, graphicstate: PDFGraphicState, + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: LTText.__init__(self) self._text = text self.matrix = matrix self.fontname = font.fontname self.ncs = ncs + self.scs = scs self.graphicstate = graphicstate self.adv = textwidth * fontsize * scaling # compute the boundary rectangle. diff --git a/playa/pdfdevice.py b/playa/pdfdevice.py index becb6658..def0c828 100644 --- a/playa/pdfdevice.py +++ b/playa/pdfdevice.py @@ -79,6 +79,8 @@ def paint_path( fill: bool, evenodd: bool, path: Sequence[PathSegment], + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: pass @@ -89,8 +91,9 @@ def render_string( self, textstate: "PDFTextState", seq: PDFTextSeq, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState", + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: pass @@ -100,8 +103,9 @@ def render_string( self, textstate: "PDFTextState", seq: PDFTextSeq, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState", + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: assert self.ctm is not None matrix = utils.mult_matrix(textstate.matrix, self.ctm) @@ -127,8 +131,9 @@ def render_string( wordspace, rise, dxscale, - ncs, graphicstate, + ncs, + scs, ) else: textstate.linematrix = self.render_string_horizontal( @@ -142,8 +147,9 @@ def render_string( wordspace, rise, dxscale, - ncs, graphicstate, + ncs, + scs, ) def render_string_horizontal( @@ -158,8 +164,9 @@ def render_string_horizontal( wordspace: float, rise: float, dxscale: float, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState", + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> Point: (x, y) = pos needcharspace = False @@ -182,8 +189,9 @@ def render_string_horizontal( scaling, rise, cid, - ncs, graphicstate, + ncs, + scs, ) if cid == 32 and wordspace: x += wordspace @@ -202,8 +210,9 @@ def 
render_string_vertical( wordspace: float, rise: float, dxscale: float, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState", + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> Point: (x, y) = pos needcharspace = False @@ -226,8 +235,9 @@ def render_string_vertical( scaling, rise, cid, - ncs, graphicstate, + ncs, + scs, ) if cid == 32 and wordspace: y += wordspace @@ -242,8 +252,9 @@ def render_char( scaling: float, rise: float, cid: int, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState", + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> float: return 0 @@ -265,8 +276,9 @@ def render_string( self, textstate: "PDFTextState", seq: PDFTextSeq, - ncs: PDFColorSpace, graphicstate: "PDFGraphicState", + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, ) -> None: font = textstate.font assert font is not None diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py index 17e83566..5f592286 100644 --- a/playa/pdfdocument.py +++ b/playa/pdfdocument.py @@ -1116,35 +1116,43 @@ def read_xref_from( class PageLabels(NumberTree): """PageLabels from the document catalog. - See Section 8.3.1 in the PDF Reference. + See Section 12.4.2 in the PDF 1.7 Reference. 
""" @property def labels(self) -> Iterator[str]: - ranges = self.values - - # The tree must begin with page index 0 - if len(ranges) == 0 or ranges[0][0] != 0: + itor = iter(self) + try: + start, label_dict_unchecked = next(itor) + # The tree must begin with page index 0 + if start != 0: + if settings.STRICT: + raise PDFSyntaxError("PageLabels is missing page index 0") + else: + # Try to cope, by assuming empty labels for the initial pages + start = 0 + except StopIteration: if settings.STRICT: - raise PDFSyntaxError("PageLabels is missing page index 0") - else: - # Try to cope, by assuming empty labels for the initial pages - ranges.insert(0, (0, {})) + raise PDFSyntaxError("PageLabels is empty") + start = 0 + label_dict_unchecked = {} - for next, (start, label_dict_unchecked) in enumerate(ranges, 1): + while True: # forever! label_dict = dict_value(label_dict_unchecked) style = label_dict.get("S") prefix = decode_text(str_value(label_dict.get("P", b""))) first_value = int_value(label_dict.get("St", 1)) - if next == len(ranges): + try: + next_start, label_dict_unchecked = next(itor) + except StopIteration: # This is the last specified range. It continues until the end # of the document. 
values: Iterable[int] = itertools.count(first_value) else: - end, _ = ranges[next] - range_length = end - start + range_length = next_start - start values = range(first_value, first_value + range_length) + start = next_start for value in values: label = self._format_page_label(value, style) diff --git a/playa/pdfinterp.py b/playa/pdfinterp.py index 52b19932..78371869 100644 --- a/playa/pdfinterp.py +++ b/playa/pdfinterp.py @@ -553,7 +553,7 @@ def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None: def do_S(self) -> None: """Stroke path""" - self.device.paint_path(self.graphicstate, True, False, False, self.curpath) + self.device.paint_path(self.graphicstate, True, False, False, self.curpath, self.ncs, self.scs) self.curpath = [] def do_s(self) -> None: @@ -563,7 +563,7 @@ def do_s(self) -> None: def do_f(self) -> None: """Fill path using nonzero winding number rule""" - self.device.paint_path(self.graphicstate, False, True, False, self.curpath) + self.device.paint_path(self.graphicstate, False, True, False, self.curpath, self.ncs, self.scs) self.curpath = [] def do_F(self) -> None: @@ -571,17 +571,17 @@ def do_F(self) -> None: def do_f_a(self) -> None: """Fill path using even-odd rule""" - self.device.paint_path(self.graphicstate, False, True, True, self.curpath) + self.device.paint_path(self.graphicstate, False, True, True, self.curpath, self.ncs, self.scs) self.curpath = [] def do_B(self) -> None: """Fill and stroke path using nonzero winding number rule""" - self.device.paint_path(self.graphicstate, True, True, False, self.curpath) + self.device.paint_path(self.graphicstate, True, True, False, self.curpath, self.ncs, self.scs) self.curpath = [] def do_B_a(self) -> None: """Fill and stroke path using even-odd rule""" - self.device.paint_path(self.graphicstate, True, True, True, self.curpath) + self.device.paint_path(self.graphicstate, True, True, True, self.curpath, self.ncs, self.scs) self.curpath = [] def do_b(self) -> None: @@ -865,12 
+865,15 @@ def do_TJ(self, seq: PDFStackT) -> None: if settings.STRICT: raise PDFInterpreterError("No font specified!") return + # FIXME: Are we sure? assert self.ncs is not None + assert self.scs is not None self.device.render_string( self.textstate, cast(PDFTextSeq, seq), - self.ncs, self.graphicstate.copy(), + self.ncs, + self.scs, ) def do_Tj(self, s: PDFStackT) -> None: diff --git a/playa/pdfstructtree.py b/playa/pdfstructtree.py index 88352e24..7d3c32bd 100644 --- a/playa/pdfstructtree.py +++ b/playa/pdfstructtree.py @@ -144,60 +144,68 @@ class PDFStructTree(Findable): """Parse the structure tree of a PDF. This class creates a representation of the portion of the - structure tree that reaches marked content sections, either for a - single page, or for the whole document. Note that this is slightly - different from the behaviour of other PDF libraries which will - also include structure elements with no content. + structure tree that reaches marked content sections for a document + or a subset of its pages. Note that this is slightly different + from the behaviour of other PDF libraries which will also include + structure elements with no content. If the PDF has no structure, the constructor will raise `PDFNoStructTree`. + Args: + doc: Document from which to extract structure tree + pages: List of (number, page) pairs - numbers will be used to + identify pages in the tree through the `page_number` + attribute of `PDFStructElement`. 
""" page: Union[PDFPage, None] - def __init__(self, doc: "PDFDocument", page: Union[PDFPage, None] = None): + def __init__( + self, + doc: "PDFDocument", + pages: Union[Iterable[Tuple[Union[int, None], PDFPage]], None] = None, + ): if "StructTreeRoot" not in doc.catalog: raise PDFNoStructTree("Catalog has no 'StructTreeRoot' entry") self.root = resolve1(doc.catalog["StructTreeRoot"]) self.role_map = resolve1(self.root.get("RoleMap", {})) self.class_map = resolve1(self.root.get("ClassMap", {})) self.children: List[PDFStructElement] = [] + self.page_dict: Dict[Any, Union[int, None]] - # If we have a specific page then we will work backwards from - # its ParentTree - this is because structure elements could - # span multiple pages, and the "Pg" attribute is *optional*, - # so this is the approved way to get a page's structure... - if page is not None: - self.page = page - self.page_dict = None - # ...EXCEPT that the ParentTree is sometimes missing, in which - # case we fall back to the non-approved way. + if pages is None: + self.page_dict = { + page.pageid: idx + 1 for idx, page in enumerate(doc.get_pages()) + } + self._parse_struct_tree() + else: + pagelist = list(pages) + self.page_dict = { + page.pageid: page_number for page_number, page in pagelist + } parent_tree_obj = self.root.get("ParentTree") - if parent_tree_obj is None: - self._parse_struct_tree() - else: + # If we have a single page then we will work backwards from + # its ParentTree - this is because structure elements could + # span multiple pages, and the "Pg" attribute is *optional*, + # so this is the approved way to get a page's structure... + if len(pagelist) == 1 and parent_tree_obj is not None: + _, page = pagelist[0] parent_tree = NumberTree(parent_tree_obj) # If there is no marked content in the structure tree for # this page (which can happen even when there is a # structure tree) then there is no `StructParents`. 
# Note however that if there are XObjects in a page, # *they* may have `StructParent` (not `StructParents`) - if "StructParents" not in self.page.attrs: + if "StructParents" not in page.attrs: return - parent_id = self.page.attrs["StructParents"] - # NumberTree should have a `get` method like it does in pdf.js... - parent_array = resolve1( - next(array for num, array in parent_tree.values if num == parent_id) - ) + parent_id = page.attrs["StructParents"] + parent_array = resolve1(parent_tree[parent_id]) self._parse_parent_tree(parent_array) - else: - self.page = None - # Overhead of creating pages shouldn't be too bad we hope! - self.page_dict = { - page.pageid: idx + 1 for idx, page in enumerate(doc.get_pages()) - } - self._parse_struct_tree() + else: + # ...EXCEPT that the ParentTree is sometimes missing, in which + # case we fall back to the non-approved way. + self._parse_struct_tree() def _make_attributes( self, obj: Dict[str, Any], revision: Union[int, None] @@ -327,13 +335,7 @@ def on_parsed_page(self, obj: Dict[str, Any]) -> bool: if "Pg" not in obj: return True page_objid = obj["Pg"].objid - if self.page_dict is not None: - return page_objid in self.page_dict - if self.page is not None: - # We have to do this to satisfy mypy - if page_objid != self.page.pageid: - return False - return True + return page_objid in self.page_dict def _parse_struct_tree(self) -> None: """Populate the structure tree starting from the root, skipping diff --git a/playa/utils.py b/playa/utils.py index a35b58cf..ab536066 100644 --- a/playa/utils.py +++ b/playa/utils.py @@ -630,13 +630,15 @@ def nunpack(s: bytes, default: int = 0) -> int: ) -def decode_text(s: bytes) -> str: +def decode_text(s: Union[str, bytes]) -> str: """Decodes a PDFDocEncoding string to Unicode.""" if isinstance(s, bytes) and s.startswith(b"\xfe\xff"): return str(s[2:], "utf-16be", "ignore") try: - ords = (ord(c) if isinstance(c, str) else c for c in s) - return "".join(PDFDocEncoding[o] for o in ords) + if 
isinstance(s, str): + return "".join(PDFDocEncoding[ord(c)] for c in s) + else: + return "".join(PDFDocEncoding[c] for c in s) except IndexError: return str(s) diff --git a/tests/test_pdfstructtree.py b/tests/test_pdfstructtree.py index 968427fd..1f3ff650 100644 --- a/tests/test_pdfstructtree.py +++ b/tests/test_pdfstructtree.py @@ -13,7 +13,7 @@ class TestClass(unittest.TestCase): def test_structure_tree_class(self): with playa.open(TESTDIR / "image_structure.pdf") as pdf: - stree = PDFStructTree(pdf, next(pdf.get_pages())) + stree = PDFStructTree(pdf, [(1, next(pdf.get_pages()))]) doc_elem = next(iter(stree)) assert [k.type for k in doc_elem] == ["P", "P", "Figure"] @@ -22,7 +22,7 @@ def test_find_all_tree(self): Test find_all() and find() on trees """ with playa.open(TESTDIR / "image_structure.pdf") as pdf: - stree = PDFStructTree(pdf, next(pdf.get_pages())) + stree = PDFStructTree(pdf, [(1, next(pdf.get_pages()))]) figs = list(stree.find_all("Figure")) assert len(figs) == 1 fig = stree.find("Figure") @@ -68,13 +68,11 @@ def test_all_mcids(self): assert 1 in pages assert 2 in pages - page = list(pdf.get_pages())[1] - stree = PDFStructTree(pdf, page) + pages = list(pdf.get_pages()) + stree = PDFStructTree(pdf, [(2, pages[1])]) sect = next(stree.find_all("Sect")) mcids = list(sect.all_mcids()) pages = set(page for page, mcid in mcids) - assert None in pages - assert 1 not in pages - assert 2 not in pages + assert pages == {2} for p in sect.find_all("P"): assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids) diff --git a/tests/test_pdftypes.py b/tests/test_pdftypes.py new file mode 100644 index 00000000..9a4393bb --- /dev/null +++ b/tests/test_pdftypes.py @@ -0,0 +1,41 @@ +""" +Test PDF types and data structures. 
+""" + +from playa.data_structures import NumberTree + + +NUMTREE1 = { + "Kids": [ + {"Nums": [1, "a", 3, "b", 7, "c"], "Limits": [1, 7]}, + { + "Kids": [ + {"Nums": [8, 123, 9, {"x": "y"}, 10, "forty-two"], "Limits": [8, 10]}, + {"Nums": [11, "zzz", 12, "xxx", 15, "yyy"], "Limits": [11, 15]}, + ], + "Limits": [8, 15], + }, + {"Nums": [20, 456], "Limits": [20, 20]}, + ] +} + + +def test_number_tree(): + """Test NumberTrees.""" + nt = NumberTree(NUMTREE1) + assert 15 in nt + assert 20 in nt + assert nt[20] == 456 + assert nt[9] == {"x": "y"} + assert list(nt) == [ + (1, "a"), + (3, "b"), + (7, "c"), + (8, 123), + (9, {"x": "y"}), + (10, "forty-two"), + (11, "zzz"), + (12, "xxx"), + (15, "yyy"), + (20, 456), + ]