diff --git a/playa/cmapdb.py b/playa/cmapdb.py index 89f37360..71dcf88c 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -273,6 +273,7 @@ def add_cid2lit(self, cid: int, name: PSLiteral) -> None: def add_cid2unichr(self, cid: int, unichr: str) -> None: # A0 = non-breaking space, some weird fonts can have a collision on a cid here. + assert isinstance(unichr, str) if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ": return self.cid2unichr[cid] = unichr @@ -311,7 +312,8 @@ def add_bf_range( "offsets does not match the code length.", ) for cid, unicode_value in zip(range(start, end + 1), code): - cmap.add_cid2unichr(cid, unicode_value) + assert isinstance(unicode_value, bytes) + cmap.add_cid2bytes(cid, unicode_value) else: assert isinstance(code, bytes) var = code[-4:] @@ -380,18 +382,18 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap: for start_byte, end_byte, cid in choplist(3, stack): if not isinstance(start_byte, bytes): log.warning("The start object of begincidrange is not a byte.") - return + return cmap if not isinstance(end_byte, bytes): log.warning("The end object of begincidrange is not a byte.") - return + return cmap if not isinstance(cid, int): log.warning("The cid object of begincidrange is not a byte.") - return + return cmap if len(start_byte) != len(end_byte): log.warning( "The start and end byte of begincidrange have different lengths.", ) - return + return cmap add_cid_range(cmap, start_byte, end_byte, cid) del stack[:] elif obj is KEYWORD_BEGINCIDCHAR: diff --git a/playa/document.py b/playa/document.py index 293da107..bf730c19 100644 --- a/playa/document.py +++ b/playa/document.py @@ -1054,7 +1054,7 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font: subtype = "Type1" if subtype in ("Type1", "MMType1"): # Type1 Font - font = Type1Font(spec) + font: Font = Type1Font(spec) elif subtype == "TrueType": # TrueType Font font = PDFTrueTypeFont(spec) diff --git a/tests/test_cmapdb.py b/tests/test_cmapdb.py index ff624504..d5165a88 100644 --- a/tests/test_cmapdb.py +++ b/tests/test_cmapdb.py @@ -19,6 +19,9 @@ 1 begincodespacerange <00> endcodespacerange +1 beginbfrange +<006F> <0072> [<00E7> <00E9> <00E8> <00EA>] +endbfrange 3 beginbfchar <01> <0078> <02> <030C> @@ -33,7 +36,15 @@ def test_cmap_parser(): cmap = parse_tounicode(STREAMDATA) - assert cmap.cid2unichr == {1: "x", 2: "̌", 3: "u"} + assert cmap.cid2unichr == { + 1: "x", + 2: "̌", + 3: "u", + 111: "ç", + 112: "é", + 113: "è", + 114: "ê", + } # Basically the sort of stuff we try to find in a Type 1 font