From 3ef10b1ea817033aef3cf0e1aad21561b15d2475 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 27 Nov 2024 11:28:41 -0500 Subject: [PATCH] Verify that we don't have pdfminer.six#1059 (and warn about it) (#23) * test: verify that we do not have pdfminer.six#1059 (and warn about it) * fix: warn rather than throwing TypeError on bogus text objects --- playa/cli.py | 7 +++++++ playa/page.py | 4 +++- samples/contrib/issue-1059-cmap-decode.pdf | Bin 0 -> 3785 bytes 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 samples/contrib/issue-1059-cmap-decode.pdf diff --git a/playa/cli.py b/playa/cli.py index d5ba69ca..21da3ad9 100644 --- a/playa/cli.py +++ b/playa/cli.py @@ -3,6 +3,7 @@ """ import argparse +import logging import csv from pathlib import Path @@ -26,12 +27,18 @@ def make_argparse() -> argparse.ArgumentParser: choices=["screen", "page", "user"], default="screen", ) + parser.add_argument( + "--debug", + help="Very verbose debugging output", + action="store_true", + ) return parser def main() -> None: parser = make_argparse() args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.debug else logging.WARNING) writer = csv.DictWriter(args.outfile, fieldnames=playa.fieldnames) writer.writeheader() for path in args.pdfs: diff --git a/playa/page.py b/playa/page.py index 5a0473a7..61b71525 100644 --- a/playa/page.py +++ b/playa/page.py @@ -1640,6 +1640,7 @@ def render_string( if isinstance(obj, str): obj = make_compat_bytes(obj) if not isinstance(obj, bytes): + log.warning("Found non-string %r in text object", obj) continue for cid in self.textstate.font.decode(obj): if needcharspace: @@ -2333,7 +2334,8 @@ def do_TJ(self, strings: PDFObject) -> None: positioning""" args = list_value(strings) if not all(isinstance(s, (int, float, bytes)) for s in args): - raise TypeError("TJ takes only strings and numbers, not %r" % args) + log.warning("Found non-string in text object %r", args) + return
self.textobj.append(make_txt("TJ", *args)) def do_Tj(self, s: PDFObject) -> None: diff --git a/samples/contrib/issue-1059-cmap-decode.pdf b/samples/contrib/issue-1059-cmap-decode.pdf new file mode 100644 index 0000000000000000000000000000000000000000..452973df63a8c203031ad9106e24c70172eba900 GIT binary patch literal 3785 zcmb7H*>c-96n*Dc;KdU;9gzTcW;~j)Eh$l3*N!AN4<0|z5+t#eNQH!E;{I3r0r|4t zi;c9L#pc))4|or7a5s`CH^XtCxruY~_1~}mISwO~R1beSXJ_6`zUB?_VPNXbANSn5 z%tgMf)`Nk=%Ox5)cvD+nR)_lhN_>2UC#<&=V#q9IJJa#>gVi|WfA z1(O)YlFYyNuK2RZpN-z+H88chDQ~Jnz2FU~9#^HnA4V8XGabO;O)HTLPH3yBBPi5@ zdXptgS%}9|yT8cXI*0x8c0AVOcm0UQr0>Tm!HX>N?=3gcHgLm^rH02SjX)rrkOvp~ zq)~!HiMf{|QY*kkhA7kbV2_N#N%0tN8crJdhL?{YGnl!YMPOGOUYnBmV5;hUFG{f@*O9hr1edl z`oL)|FhwCBkinFgMYZJ3K40)UFV~!$Q8)(VYz)VMKnfq-&lKxnwb z{0T-Qg?xoz3cMFcG{$)Wg$hXm4I+gkfd;Wcl0bt*AxWS?s*ohmAd~8}4%CV6v`$zY zYD|6UbLkq~a#3;{mVLFy-N}Eihx&ryiLs)E+D z|G0Z@kUd4T&ZF0{R5O;+#r|Nz>ih1o~I8I4{>*>;*GO%|8j2sqSyM z-&O#~oE;v7Y<$W)m|?zZA<}7|2-Y)nn0s$jLNY-f+uig(ST@q2^HjHGff+}z=sGKQ zIuVqv?TzOLv8n3bWOUo``qaCGf)qtn4xx|rhVOjpQ}~N$<|mZ(Y5IoJH>ciJwfwOG z)ba8F1<|`I7IoEBEAfupKmiIeuLwK}PQ7^{w!C*yRe!V7uCz|8*R6u3K5Ki$XXmry zCKzo>`sxDOH(Bf)=ErUnlO%BclmHAu7rL2(TLUxdrYQlaj1Am4pWZs9gOgHHubqpz zBU^ju#+lE^d?g=Pup)Fm#Y|(IXufmqyo>Aix3haP|J51IosZwwmql*S>I2u}3`U=Z zMi5iR2EIn0Vz&BJPx-k%KMq~i(U&k6o;OyXf<}-wHt-etH1nyx@!;qBUJ)EdQ#Vx6 zf!#yb?1wCYbbyB)VumKm|`e8 zc!@|Adn-~Wl@)2Q6{+nFevL>?eJV11fk>SItw@zwtw@cHC}VH%D?}RVQ<2dNM6yn$ zvLL~hVk$wyjJZ?>8Z6EA#_?(B8q*!1xZ8s?G}g-2pp6s5G9u3Ifg)?J zj4j->eRlQs2MyDD0UW7jlpZnH#v{sC8i5xjR$fIS>NGyq$ajO*wV6hii3CeT0Paw; zXy@n^G%`HKd^ZjyQ|(@1Bx@`?hmELeBwNSwL6hAmlL8IX248ONyMFxWeB(9jOU+l{ zZx%TCTB;5bM-e2$ir6ve`|=hk+p!oP#dvha7=yR2qcJ~%7meew^!a=Nz7QVS1>raB z;^$-0bFTaxIhv1!@T@)_i;~WKj;Qlu%WM4ZoE86Y@QWuu8{{hyyiQgXd|bji0K9F% U*GXOr>AxThGw0-FbUk+d2iZM|8UO$Q literal 0 HcmV?d00001