From 2c468dc8a2f8a13f34b7a02067fcd1d46dba6cb1 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 10:02:08 -0400 Subject: [PATCH] fix: handle some invalid PDF cases --- playa/pdfparser.py | 7 ++++++- playa/utils.py | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/playa/pdfparser.py b/playa/pdfparser.py index efd2a024..41b327f4 100644 --- a/playa/pdfparser.py +++ b/playa/pdfparser.py @@ -142,7 +142,12 @@ def flush(self) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is KEYWORD_R: # reference to indirect object - (_, _object_id), _ = self.pop(2) + try: + (_, _object_id), _ = self.pop(2) + except ValueError: + raise PDFSyntaxError( + "Expected generation and object id in indirect object reference" + ) object_id = safe_int(_object_id) if object_id is not None: obj = PDFObjRef(self.doc, object_id) diff --git a/playa/utils.py b/playa/utils.py index 418887eb..cc0f5a28 100644 --- a/playa/utils.py +++ b/playa/utils.py @@ -23,7 +23,7 @@ cast, ) -from playa.exceptions import PDFTypeError, PDFValueError +from playa.exceptions import PDFSyntaxError, PDFTypeError, PDFValueError if TYPE_CHECKING: from playa.layout import LTComponent @@ -246,6 +246,8 @@ def parse_rect(o: Any) -> Rect: return float(x0), float(y0), float(x1), float(y1) except ValueError: raise PDFValueError("Could not parse rectangle") + except TypeError: + raise PDFSyntaxError("Rectangle contains non-numeric values") def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix: