Skip to content

Commit

Permalink
fix: correct handling of bfrange (oops)
Browse files (browse the repository at this point in the history)
For whatever reason, mypy did not catch this!
  • Loading branch information
dhdaines committed Nov 28, 2024
1 parent fc7eea9 commit f1fa25b
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 7 deletions.
12 changes: 7 additions & 5 deletions playa/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def add_cid2lit(self, cid: int, name: PSLiteral) -> None:

def add_cid2unichr(self, cid: int, unichr: str) -> None:
# A0 = non-breaking space, some weird fonts can have a collision on a cid here.
assert isinstance(unichr, str)
if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
return
self.cid2unichr[cid] = unichr
Expand Down Expand Up @@ -311,7 +312,8 @@ def add_bf_range(
"offsets does not match the code length.",
)
for cid, unicode_value in zip(range(start, end + 1), code):
cmap.add_cid2unichr(cid, unicode_value)
assert isinstance(unicode_value, bytes)
cmap.add_cid2bytes(cid, unicode_value)
else:
assert isinstance(code, bytes)
var = code[-4:]
Expand Down Expand Up @@ -380,18 +382,18 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
for start_byte, end_byte, cid in choplist(3, stack):
if not isinstance(start_byte, bytes):
log.warning("The start object of begincidrange is not a byte.")
return
return cmap
if not isinstance(end_byte, bytes):
log.warning("The end object of begincidrange is not a byte.")
return
return cmap
if not isinstance(cid, int):
log.warning("The cid object of begincidrange is not a byte.")
return
return cmap
if len(start_byte) != len(end_byte):
log.warning(
"The start and end byte of begincidrange have different lengths.",
)
return
return cmap
add_cid_range(cmap, start_byte, end_byte, cid)
del stack[:]
elif obj is KEYWORD_BEGINCIDCHAR:
Expand Down
2 changes: 1 addition & 1 deletion playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1054,7 +1054,7 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
subtype = "Type1"
if subtype in ("Type1", "MMType1"):
# Type1 Font
font = Type1Font(spec)
font: Font = Type1Font(spec)
elif subtype == "TrueType":
# TrueType Font
font = PDFTrueTypeFont(spec)
Expand Down
13 changes: 12 additions & 1 deletion tests/test_cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
1 begincodespacerange
<00> <FF>
endcodespacerange
1 beginbfrange
<006F> <0072> [<00E7> <00E9> <00E8> <00EA>]
endbfrange
3 beginbfchar
<01> <0078>
<02> <030C>
Expand All @@ -33,7 +36,15 @@

def test_cmap_parser():
cmap = parse_tounicode(STREAMDATA)
assert cmap.cid2unichr == {1: "x", 2: "̌", 3: "u"}
assert cmap.cid2unichr == {
1: "x",
2: "̌",
3: "u",
111: "ç",
112: "é",
113: "è",
114: "ê",
}


# Basically the sort of stuff we try to find in a Type 1 font
Expand Down

0 comments on commit f1fa25b

Please sign in to comment.