diff --git a/playa/cli.py b/playa/cli.py index d5ba69ca..21da3ad9 100644 --- a/playa/cli.py +++ b/playa/cli.py @@ -3,6 +3,7 @@ """ import argparse +import logging import csv from pathlib import Path @@ -26,12 +27,18 @@ def make_argparse() -> argparse.ArgumentParser: choices=["screen", "page", "user"], default="screen", ) + parser.add_argument( + "--debug", + help="Very verbose debugging output", + action="store_true", + ) return parser def main() -> None: parser = make_argparse() args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.debug else logging.WARNING) writer = csv.DictWriter(args.outfile, fieldnames=playa.fieldnames) writer.writeheader() for path in args.pdfs: diff --git a/playa/page.py b/playa/page.py index 5a0473a7..61b71525 100644 --- a/playa/page.py +++ b/playa/page.py @@ -1640,6 +1640,7 @@ def render_string( if isinstance(obj, str): obj = make_compat_bytes(obj) if not isinstance(obj, bytes): + log.warning("Found non-string %r in text object", obj) continue for cid in self.textstate.font.decode(obj): if needcharspace: @@ -2333,7 +2334,8 @@ def do_TJ(self, strings: PDFObject) -> None: positioning""" args = list_value(strings) if not all(isinstance(s, (int, float, bytes)) for s in args): - raise TypeError("TJ takes only strings and numbers, not %r" % args) + log.warning("Found non-string in text object %r", args) + return self.textobj.append(make_txt("TJ", *args)) def do_Tj(self, s: PDFObject) -> None: diff --git a/samples/contrib/issue-1059-cmap-decode.pdf b/samples/contrib/issue-1059-cmap-decode.pdf new file mode 100644 index 00000000..452973df Binary files /dev/null and b/samples/contrib/issue-1059-cmap-decode.pdf differ