Skip to content

Commit

Permalink
feat: Remove excessive debug logging for significant speedup
Browse files — browse the repository at this point in the history
  • Loading branch information
dhdaines committed Dec 31, 2024
1 parent 5b20c3f commit 029cb0b
Show file tree
Hide file tree
Showing 5 changed files with 3 additions and 63 deletions.
12 changes: 2 additions & 10 deletions playa/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
copy(self.code2cid, cmap.code2cid)

def decode(self, code: bytes) -> Iterator[int]:
log.debug("decode: %r, %r", self, code)
d = self.code2cid
for i in iter(code):
if i in d:
Expand Down Expand Up @@ -148,7 +147,6 @@ def __repr__(self) -> str:
return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

def get_unichr(self, cid: int) -> str:
log.debug("get_unichr: %r, %r", self, cid)
return self.cid2unichr[cid]

def dump(self, out: TextIO = sys.stdout) -> None:
Expand All @@ -159,7 +157,6 @@ def dump(self, out: TextIO = sys.stdout) -> None:
class IdentityUnicodeMap(UnicodeMap):
def get_unichr(self, cid: int) -> str:
"""Interpret character id as unicode codepoint"""
log.debug("get_unichr: %r, %r", self, cid)
return chr(cid)


Expand Down Expand Up @@ -189,7 +186,6 @@ class CMapDB:
def _load_data(cls, name: str) -> Any:
name = name.replace("\0", "")
filename = "%s.pickle.gz" % name
log.debug("loading: %r", name)
cmap_paths = (
os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
os.path.join(os.path.dirname(__file__), "cmap"),
Expand Down Expand Up @@ -310,11 +306,10 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
while True:
try:
pos, obj = next(parser)
except PDFSyntaxError as e:
except PDFSyntaxError:
# CMap syntax is apparently not PDF syntax (e.g. "def"
# seems to occur within dictionaries, for no apparent
# reason, perhaps a PostScript thing?)
log.debug("Ignoring syntax error: %s", e)
parser.reset()
continue
except StopIteration:
Expand All @@ -323,7 +318,6 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
if not isinstance(obj, PSKeyword):
stack.append(obj)
continue
log.debug("keyword: %r (%r)", obj, stack)
# Ignore everything outside begincmap / endcmap
if obj is KEYWORD_BEGINCMAP:
in_cmap = True
Expand Down Expand Up @@ -487,8 +481,7 @@ def parse_encoding(data: bytes) -> EncodingCMap:
while True:
try:
pos, obj = next(parser)
except PDFSyntaxError as e:
log.debug("Ignoring syntax error: %s", e)
except PDFSyntaxError:
parser.reset()
continue
except StopIteration:
Expand All @@ -497,7 +490,6 @@ def parse_encoding(data: bytes) -> EncodingCMap:
if not isinstance(obj, PSKeyword):
stack.append(obj)
continue
log.debug("keyword: %r (%r)", obj, stack)

if obj is KEYWORD_DEF:
try:
Expand Down
22 changes: 0 additions & 22 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ def _load(self, parser: ObjectParser) -> None:
if use_b != b"n":
continue
self.offsets[objid] = XRefPos(None, int(pos_b), int(genno_b))
log.debug("xref objects: %r", self.offsets)
self._load_trailer(parser)

def _load_trailer(self, parser: ObjectParser) -> None:
Expand All @@ -180,7 +179,6 @@ def _load_trailer(self, parser: ObjectParser) -> None:
)
(_, dic) = next(parser)
self.trailer.update(dict_value(dic))
log.debug("trailer=%r", self.trailer)

def __repr__(self) -> str:
return "<XRefTable: offsets=%r>" % (self.offsets.keys())
Expand Down Expand Up @@ -245,7 +243,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
if token is KEYWORD_TRAILER:
_, dic = s2
self.trailer.update(dict_value(dic))
log.debug("trailer=%r", self.trailer)
return
s1 = s2
# If not, then try harder
Expand All @@ -258,7 +255,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
if not isinstance(trailer, dict):
break
self.trailer.update(trailer)
log.debug("trailer=%r", self.trailer)
return
log.warning("b'trailer' not found in document or invalid")

Expand Down Expand Up @@ -303,13 +299,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
self.data = stream.buffer
self.entlen = self.fl1 + self.fl2 + self.fl3
self.trailer = stream.attrs
log.debug(
"xref stream: objid=%s, fields=%d,%d,%d",
", ".join(map(repr, self.ranges)),
self.fl1,
self.fl2,
self.fl3,
)

@property
def objids(self) -> Iterator[int]:
Expand Down Expand Up @@ -1027,7 +1016,6 @@ def _get_objects(self, stream: ContentStream) -> Tuple[List[PDFObject], int]:

def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
assert self.parser is not None
log.debug("getobj_parse: seeking to %d for objid %d", pos, objid)
self.parser.seek(pos)
try:
_, obj = next(self.parser)
Expand All @@ -1053,7 +1041,6 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
raise PDFSyntaxError(
f"Indirect object {objid!r} not found in document"
) from e
log.debug("found object (%r) seeking to %r", m.group(0), realpos)
self.parser.seek(realpos)
(_, obj) = next(self.parser)
if obj.objid != objid:
Expand All @@ -1079,14 +1066,12 @@ def __getitem__(self, objid: int) -> PDFObject:
if not self.xrefs:
raise ValueError("Document is not initialized")
if objid not in self._cached_objs:
log.debug("getobj: objid=%r", objid)
obj = None
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
except KeyError:
continue
log.debug("getobj: strmid %r index %r genno %r", strmid, index, genno)
try:
if strmid is not None:
stream = stream_value(self[strmid])
Expand All @@ -1103,14 +1088,12 @@ def __getitem__(self, objid: int) -> PDFObject:
continue
if obj is None:
raise IndexError(f"Object with ID {objid} not found")
log.debug("register: objid=%r: %r", objid, obj)
self._cached_objs[objid] = obj
return self._cached_objs[objid]

def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
if objid and objid in self._cached_fonts:
return self._cached_fonts[objid]
log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
if spec.get("Type") is not LITERAL_FONT:
log.warning("Font specification Type is not /Font: %r", spec)
# Create a Font object.
Expand Down Expand Up @@ -1258,11 +1241,9 @@ def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
log.warning("Page has no Type, trying type: %r", object_properties)
object_type = object_properties.get("type")
if object_type is LITERAL_PAGES and "Kids" in object_properties:
log.debug("Pages: Kids=%r", object_properties["Kids"])
for child in reversed(list_value(object_properties["Kids"])):
stack.append((child, object_properties))
elif object_type is LITERAL_PAGE:
log.debug("Page: %r", object_properties)
yield object_id, object_properties

@property
Expand Down Expand Up @@ -1320,9 +1301,7 @@ def _find_xref(self) -> int:
prev = b""
for pos, line in reverse_iter_lines(self.buffer):
line = line.strip()
log.debug("find_xref: %r", line)
if line == b"startxref":
log.debug("xref found: pos=%r", prev)
if not prev.isdigit():
log.warning("Invalid startxref position: %r", prev)
continue
Expand Down Expand Up @@ -1355,7 +1334,6 @@ def _read_xref_from(
(pos, token) = parser.nexttoken()
except StopIteration:
raise ValueError("Unexpected EOF at {start}")
log.debug("read_xref_from: start=%d, token=%r", start, token)
if token is KEYWORD_XREF:
parser.nextline()
xref: XRef = XRefTable(parser)
Expand Down
2 changes: 1 addition & 1 deletion playa/encodingdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,6 @@ def get_encoding(
try:
cid2unicode[cid] = name2unicode(cast(str, x.name))
except (KeyError, ValueError) as e:
log.debug(str(e))
log.debug("Failed to get char %r: %s", x, e)
cid += 1
return cid2unicode
22 changes: 0 additions & 22 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,10 +663,8 @@ def __init__(self, streams: Iterable[PDFObject]) -> None:
self.streamiter = iter(streams)
try:
stream = stream_value(next(self.streamiter))
log.debug("ContentParser starting stream %r", stream)
super().__init__(stream.buffer)
except StopIteration:
log.debug("ContentParser has no content, returning nothing")
super().__init__(b"")

def nexttoken(self) -> Tuple[int, Token]:
Expand All @@ -682,7 +680,6 @@ def nexttoken(self) -> Tuple[int, Token]:
# Will also raise StopIteration if there are no more,
# which is exactly what we want
stream = stream_value(next(self.streamiter))
log.debug("ContentParser starting new stream %r", stream)
self.newstream(stream.buffer)


Expand Down Expand Up @@ -763,7 +760,6 @@ def init_resources(self, page: Page, resources: Dict) -> None:
raise RuntimeError("Document no longer exists!")

for k, v in dict_value(self.resources).items():
log.debug("Resource: %r: %r", k, v)
if k == "Font":
for fontid, spec in dict_value(v).items():
objid = None
Expand Down Expand Up @@ -1233,12 +1229,6 @@ def __iter__(self) -> Iterator[LayoutDict]:
"PageInterpreter is deprecated and will be removed in PLAYA 0.3",
DeprecationWarning,
)
log.debug(
"PageInterpreter: resources=%r, streams=%r, ctm=%r",
self.resources,
self.contents,
self.ctm,
)
parser = ContentParser(self.contents)
for _, obj in parser:
# These are handled inside the parser as they don't obey
Expand All @@ -1250,7 +1240,6 @@ def __iter__(self) -> Iterator[LayoutDict]:
method, nargs = self._dispatch[obj]
if nargs:
args = self.pop(nargs)
log.debug("exec: %r %r", obj, args)
if len(args) == nargs:
gen = method(*args)
else:
Expand All @@ -1260,7 +1249,6 @@ def __iter__(self) -> Iterator[LayoutDict]:
obj,
)
else:
log.debug("exec: %r", obj)
gen = method()
if gen is not None:
yield from gen
Expand Down Expand Up @@ -1372,7 +1360,6 @@ def do_Do(self, xobjid_arg: PDFObject) -> Iterator[LayoutDict]:
except KeyError:
log.debug("Undefined xobject id: %r", xobjid)
return
log.debug("Processing xobj: %r", xobj)
subtype = xobj.get("Subtype")
if subtype is LITERAL_FORM and "BBox" in xobj:
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
Expand Down Expand Up @@ -2187,12 +2174,6 @@ class LazyInterpreter(BaseInterpreter):
textobj: List[TextItem] = []

def __iter__(self) -> Iterator[ContentObject]:
log.debug(
"LazyInterpreter: resources=%r, streams=%r, ctm=%r",
self.resources,
self.contents,
self.ctm,
)
parser = ContentParser(self.contents)
for _, obj in parser:
# These are handled inside the parser as they don't obey
Expand All @@ -2204,7 +2185,6 @@ def __iter__(self) -> Iterator[ContentObject]:
method, nargs = self._dispatch[obj]
if nargs:
args = self.pop(nargs)
log.debug("exec: %r %r", obj, args)
if len(args) == nargs:
gen = method(*args)
else:
Expand All @@ -2214,7 +2194,6 @@ def __iter__(self) -> Iterator[ContentObject]:
obj,
)
else:
log.debug("exec: %r", obj)
gen = method()
if gen is not None:
yield from gen
Expand Down Expand Up @@ -2518,7 +2497,6 @@ def do_Do(self, xobjid_arg: PDFObject) -> Iterator[ContentObject]:
except TypeError as e:
log.debug("Empty or invalid xobject with id %r: %s", xobjid, e)
return
log.debug("Processing xobj: %r", xobj)
subtype = xobj.get("Subtype")
if subtype is LITERAL_FORM and "BBox" in xobj:
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
Expand Down
8 changes: 0 additions & 8 deletions playa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, Token]:
# PDF 1.7 sec 7.3.4.2: If the character following
# the REVERSE SOLIDUS is not one of those shown in
# Table 3, the REVERSE SOLIDUS shall be ignored.
log.debug("Unrecognized escape %r", m[0])
parts.append(chr)
else:
parts.append(bytes((ESC_STRING[chr],)))
Expand Down Expand Up @@ -351,7 +350,6 @@ def __next__(self) -> StackEntry:
if self.stack and top is None:
return self.stack.pop()
(pos, token) = self.nexttoken()
log.debug("token at %d: %r", pos, token)
if token is KEYWORD_ARRAY_BEGIN:
if top is None:
top = pos
Expand Down Expand Up @@ -456,10 +454,8 @@ def __next__(self) -> StackEntry:
# Try again with just plain b"EI"
self.seek(idpos + len(KEYWORD_ID.name) + 1)
(eipos, data) = self.get_inline_data(target=b"EI")
log.debug("data at %d: %r", eipos, data)
data = re.sub(rb"(?:\r\n|[\r\n])$", b"", data[: -len(eos)])
else:
log.debug("data at %d: %r", eipos, data)
data = re.sub(rb"\r$", b"", data[: -len(eos)])
else:
# Note absence of + 1 here (the "Unless" above)
Expand All @@ -474,7 +470,6 @@ def __next__(self) -> StackEntry:
if eipos == -1:
raise PDFSyntaxError("End of inline stream %r not found" % eos)
obj = InlineImage(dic, data)
log.debug("InlineImage @ %d: %r", pos, obj)
# Inline images must occur at the top level, otherwise
# something is wrong (probably a corrupt file)
assert (
Expand Down Expand Up @@ -575,7 +570,6 @@ def __next__(self) -> Tuple[int, IndirectObject]:
obj: Union[PDFObject, ContentStream]
while True:
pos, obj = next(self._parser)
log.debug("pos %r obj %r stack %r", pos, obj, self.trailer)
if obj is KEYWORD_OBJ:
pass
elif isinstance(obj, PSKeyword) and obj.name.startswith(b"endobj"):
Expand Down Expand Up @@ -641,7 +635,6 @@ def __next__(self) -> Tuple[int, IndirectObject]:
# marker after the data and before endstream; this
# marker shall not be included in the stream length.
linepos, line = self._parser.nextline()
log.debug("After stream data: %r %r", linepos, line)
if self.strict:
# In reality there usually is no end-of-line
# marker. We will nonetheless warn if there's
Expand All @@ -660,7 +653,6 @@ def __next__(self) -> Tuple[int, IndirectObject]:
objlen += len(line)
data += line
linepos, line = self._parser.nextline()
log.debug("After stream data: %r %r", linepos, line)
if line == b"": # Means EOF
log.warning(
"Incorrect length for stream, no 'endstream' found"
Expand Down

0 comments on commit 029cb0b

Please sign in to comment.