fix: correct handling of bfrange (oops)

For whatever reason, mypy did not catch this!
dhdaines · Nov 28, 2024 · f1fa25b
1 parent fc7eea9
commit f1fa25b
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 7 deletions.
diff --git a/playa/cmapdb.py b/playa/cmapdb.py
@@ -273,6 +273,7 @@ def add_cid2lit(self, cid: int, name: PSLiteral) -> None:
 
     def add_cid2unichr(self, cid: int, unichr: str) -> None:
         # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
+        assert isinstance(unichr, str)
         if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
             return
         self.cid2unichr[cid] = unichr
@@ -311,7 +312,8 @@ def add_bf_range(
                 "offsets does not match the code length.",
             )
         for cid, unicode_value in zip(range(start, end + 1), code):
-            cmap.add_cid2unichr(cid, unicode_value)
+            assert isinstance(unicode_value, bytes)
+            cmap.add_cid2bytes(cid, unicode_value)
     else:
         assert isinstance(code, bytes)
         var = code[-4:]
@@ -380,18 +382,18 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
             for start_byte, end_byte, cid in choplist(3, stack):
                 if not isinstance(start_byte, bytes):
                     log.warning("The start object of begincidrange is not a byte.")
-                    return
+                    return cmap
                 if not isinstance(end_byte, bytes):
                     log.warning("The end object of begincidrange is not a byte.")
-                    return
+                    return cmap
                 if not isinstance(cid, int):
                     log.warning("The cid object of begincidrange is not a byte.")
-                    return
+                    return cmap
                 if len(start_byte) != len(end_byte):
                     log.warning(
                         "The start and end byte of begincidrange have different lengths.",
                     )
-                    return
+                    return cmap
                 add_cid_range(cmap, start_byte, end_byte, cid)
             del stack[:]
         elif obj is KEYWORD_BEGINCIDCHAR:

diff --git a/playa/document.py b/playa/document.py
@@ -1054,7 +1054,7 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
             subtype = "Type1"
         if subtype in ("Type1", "MMType1"):
             # Type1 Font
-            font = Type1Font(spec)
+            font: Font = Type1Font(spec)
         elif subtype == "TrueType":
             # TrueType Font
             font = PDFTrueTypeFont(spec)

diff --git a/tests/test_cmapdb.py b/tests/test_cmapdb.py
@@ -19,6 +19,9 @@
 1 begincodespacerange
 <00> <FF>
 endcodespacerange
+1 beginbfrange
+<006F> <0072> [<00E7> <00E9> <00E8> <00EA>]
+endbfrange
 3 beginbfchar
 <01> <0078>
 <02> <030C>
@@ -33,7 +36,15 @@
 
 def test_cmap_parser():
     cmap = parse_tounicode(STREAMDATA)
-    assert cmap.cid2unichr == {1: "x", 2: "̌", 3: "u"}
+    assert cmap.cid2unichr == {
+        1: "x",
+        2: "̌",
+        3: "u",
+        111: "ç",
+        112: "é",
+        113: "è",
+        114: "ê",
+    }
 
 
 # Basically the sort of stuff we try to find in a Type 1 font