Skip to content

Commit

Permalink
feat: Remove excessive debug logging for significant speedup
Browse files — browse the repository at this point in the history
  • Loading branch information
dhdaines committed Dec 31, 2024
1 parent 5b20c3f commit 029cb0b
Show file tree
Hide file tree
Showing 5 changed files with 3 additions and 63 deletions.
12 changes: 2 additions & 10 deletions playa/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
copy(self.code2cid, cmap.code2cid)

def decode(self, code: bytes) -> Iterator[int]:
log.debug("decode: %r, %r", self, code)
d = self.code2cid
for i in iter(code):
if i in d:
Expand Down Expand Up @@ -148,7 +147,6 @@ def __repr__(self) -> str:
return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

def get_unichr(self, cid: int) -> str:
log.debug("get_unichr: %r, %r", self, cid)
return self.cid2unichr[cid]

def dump(self, out: TextIO = sys.stdout) -> None:
Expand All @@ -159,7 +157,6 @@ def dump(self, out: TextIO = sys.stdout) -> None:
class IdentityUnicodeMap(UnicodeMap):
def get_unichr(self, cid: int) -> str:
"""Interpret character id as unicode codepoint"""
log.debug("get_unichr: %r, %r", self, cid)
return chr(cid)


Expand Down Expand Up @@ -189,7 +186,6 @@ class CMapDB:
def _load_data(cls, name: str) -> Any:
name = name.replace("\0", "")
filename = "%s.pickle.gz" % name
log.debug("loading: %r", name)
cmap_paths = (
os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
os.path.join(os.path.dirname(__file__), "cmap"),
Expand Down Expand Up @@ -310,11 +306,10 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
while True:
try:
pos, obj = next(parser)
except PDFSyntaxError as e:
except PDFSyntaxError:
# CMap syntax is apparently not PDF syntax (e.g. "def"
# seems to occur within dictionaries, for no apparent
# reason, perhaps a PostScript thing?)
log.debug("Ignoring syntax error: %s", e)
parser.reset()
continue
except StopIteration:
Expand All @@ -323,7 +318,6 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
if not isinstance(obj, PSKeyword):
stack.append(obj)
continue
log.debug("keyword: %r (%r)", obj, stack)
# Ignore everything outside begincmap / endcmap
if obj is KEYWORD_BEGINCMAP:
in_cmap = True
Expand Down Expand Up @@ -487,8 +481,7 @@ def parse_encoding(data: bytes) -> EncodingCMap:
while True:
try:
pos, obj = next(parser)
except PDFSyntaxError as e:
log.debug("Ignoring syntax error: %s", e)
except PDFSyntaxError:
parser.reset()
continue
except StopIteration:
Expand All @@ -497,7 +490,6 @@ def parse_encoding(data: bytes) -> EncodingCMap:
if not isinstance(obj, PSKeyword):
stack.append(obj)
continue
log.debug("keyword: %r (%r)", obj, stack)

if obj is KEYWORD_DEF:
try:
Expand Down
22 changes: 0 additions & 22 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ def _load(self, parser: ObjectParser) -> None:
if use_b != b"n":
continue
self.offsets[objid] = XRefPos(None, int(pos_b), int(genno_b))
log.debug("xref objects: %r", self.offsets)
self._load_trailer(parser)

def _load_trailer(self, parser: ObjectParser) -> None:
Expand All @@ -180,7 +179,6 @@ def _load_trailer(self, parser: ObjectParser) -> None:
)
(_, dic) = next(parser)
self.trailer.update(dict_value(dic))
log.debug("trailer=%r", self.trailer)

def __repr__(self) -> str:
return "<XRefTable: offsets=%r>" % (self.offsets.keys())
Expand Down Expand Up @@ -245,7 +243,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
if token is KEYWORD_TRAILER:
_, dic = s2
self.trailer.update(dict_value(dic))
log.debug("trailer=%r", self.trailer)
return
s1 = s2
# If not, then try harder
Expand All @@ -258,7 +255,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
if not isinstance(trailer, dict):
break
self.trailer.update(trailer)
log.debug("trailer=%r", self.trailer)
return
log.warning("b'trailer' not found in document or invalid")

Expand Down Expand Up @@ -303,13 +299,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
self.data = stream.buffer
self.entlen = self.fl1 + self.fl2 + self.fl3
self.trailer = stream.attrs
log.debug(
"xref stream: objid=%s, fields=%d,%d,%d",
", ".join(map(repr, self.ranges)),
self.fl1,
self.fl2,
self.fl3,
)

@property
def objids(self) -> Iterator[int]:
Expand Down Expand Up @@ -1027,7 +1016,6 @@ def _get_objects(self, stream: ContentStream) -> Tuple[List[PDFObject], int]:

def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
assert self.parser is not None
log.debug("getobj_parse: seeking to %d for objid %d", pos, objid)
self.parser.seek(pos)
try:
_, obj = next(self.parser)
Expand All @@ -1053,7 +1041,6 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
raise PDFSyntaxError(
f"Indirect object {objid!r} not found in document"
) from e
log.debug("found object (%r) seeking to %r", m.group(0), realpos)
self.parser.seek(realpos)
(_, obj) = next(self.parser)
if obj.objid != objid:
Expand All @@ -1079,14 +1066,12 @@ def __getitem__(self, objid: int) -> PDFObject:
if not self.xrefs:
raise ValueError("Document is not initialized")
if objid not in self._cached_objs:
log.debug("getobj: objid=%r", objid)
obj = None
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
except KeyError:
continue
log.debug("getobj: strmid %r index %r genno %r", strmid, index, genno)
try:
if strmid is not None:
stream = stream_value(self[strmid])
Expand All @@ -1103,14 +1088,12 @@ def __getitem__(self, objid: int) -> PDFObject:
continue
if obj is None:
raise IndexError(f"Object with ID {objid} not found")
log.debug("register: objid=%r: %r", objid, obj)
self._cached_objs[objid] = obj
return self._cached_objs[objid]

def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
if objid and objid in self._cached_fonts:
return self._cached_fonts[objid]
log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
if spec.get("Type") is not LITERAL_FONT:
log.warning("Font specification Type is not /Font: %r", spec)
# Create a Font object.
Expand Down Expand Up @@ -1258,11 +1241,9 @@ def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
log.warning("Page has no Type, trying type: %r", object_properties)
object_type = object_properties.get("type")
if object_type is LITERAL_PAGES and "Kids" in object_properties:
log.debug("Pages: Kids=%r", object_properties["Kids"])
for child in reversed(list_value(object_properties["Kids"])):
stack.append((child, object_properties))
elif object_type is LITERAL_PAGE:
log.debug("Page: %r", object_properties)
yield object_id, object_properties

@property
Expand Down Expand Up @@ -1320,9 +1301,7 @@ def _find_xref(self) -> int:
prev = b""
for pos, line in reverse_iter_lines(self.buffer):
line = line.strip()
log.debug("find_xref: %r", line)
if line == b"startxref":
log.debug("xref found: pos=%r", prev)
if not prev.isdigit():
log.warning("Invalid startxref position: %r", prev)
continue
Expand Down Expand Up @@ -1355,7 +1334,6 @@ def _read_xref_from(
(pos, token) = parser.nexttoken()
except StopIteration:
raise ValueError("Unexpected EOF at {start}")
log.debug("read_xref_from: start=%d, token=%r", start, token)
if token is KEYWORD_XREF:
parser.nextline()
xref: XRef = XRefTable(parser)
Expand Down
2 changes: 1 addition & 1 deletion playa/encodingdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,6 @@ def get_encoding(
try:
cid2unicode[cid] = name2unicode(cast(str, x.name))
except (KeyError, ValueError) as e:
log.debug(str(e))
log.debug("Failed to get char %r: %s", x, e)
cid += 1
return cid2unicode
22 changes: 0 additions & 22 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,10 +663,8 @@ def __init__(self, streams: Iterable[PDFObject]) -> None:
self.streamiter = iter(streams)
try:
stream = stream_value(next(self.streamiter))
log.debug("ContentParser starting stream %r", stream)
super().__init__(stream.buffer)
except StopIteration:
log.debug("ContentParser has no content, returning nothing")
super().__init__(b"")

def nexttoken(self) -> Tuple[int, Token]:
Expand All @@ -682,7 +680,6 @@ def nexttoken(self) -> Tuple[int, Token]:
# Will also raise StopIteration if there are no more,
# which is exactly what we want
stream = stream_value(next(self.streamiter))
log.debug("ContentParser starting new stream %r", stream)
self.newstream(stream.buffer)


Expand Down Expand Up @@ -763,7 +760,6 @@ def init_resources(self, page: Page, resources: Dict) -> None:
raise RuntimeError("Document no longer exists!")

for k, v in dict_value(self.resources).items():
log.debug("Resource: %r: %r", k, v)
if k == "Font":
for fontid, spec in dict_value(v).items():
objid = None
Expand Down Expand Up @@ -1233,12 +1229,6 @@ def __iter__(self) -> Iterator[LayoutDict]:
"PageInterpreter is deprecated and will be removed in PLAYA 0.3",
DeprecationWarning,
)
log.debug(
"PageInterpreter: resources=%r, streams=%r, ctm=%r",
self.resources,
self.contents,
self.ctm,
)
parser = ContentParser(self.contents)
for _, obj in parser:
# These are handled inside the parser as they don't obey
Expand All @@ -1250,7 +1240,6 @@ def __iter__(self) -> Iterator[LayoutDict]:
method, nargs = self._dispatch[obj]
if nargs:
args = self.pop(nargs)
log.debug("exec: %r %r", obj, args)
if len(args) == nargs:
gen = method(*args)
else:
Expand All @@ -1260,7 +1249,6 @@ def __iter__(self) -> Iterator[LayoutDict]:
obj,
)
else:
log.debug("exec: %r", obj)
gen = method()
if gen is not None:
yield from gen
Expand Down Expand Up @@ -1372,7 +1360,6 @@ def do_Do(self, xobjid_arg: PDFObject) -> Iterator[LayoutDict]:
except KeyError:
log.debug("Undefined xobject id: %r", xobjid)
return
log.debug("Processing xobj: %r", xobj)
subtype = xobj.get("Subtype")
if subtype is LITERAL_FORM and "BBox" in xobj:
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
Expand Down Expand Up @@ -2187,12 +2174,6 @@ class LazyInterpreter(BaseInterpreter):
textobj: List[TextItem] = []

def __iter__(self) -> Iterator[ContentObject]:
log.debug(
"LazyInterpreter: resources=%r, streams=%r, ctm=%r",
self.resources,
self.contents,
self.ctm,
)
parser = ContentParser(self.contents)
for _, obj in parser:
# These are handled inside the parser as they don't obey
Expand All @@ -2204,7 +2185,6 @@ def __iter__(self) -> Iterator[ContentObject]:
method, nargs = self._dispatch[obj]
if nargs:
args = self.pop(nargs)
log.debug("exec: %r %r", obj, args)
if len(args) == nargs:
gen = method(*args)
else:
Expand All @@ -2214,7 +2194,6 @@ def __iter__(self) -> Iterator[ContentObject]:
obj,
)
else:
log.debug("exec: %r", obj)
gen = method()
if gen is not None:
yield from gen
Expand Down Expand Up @@ -2518,7 +2497,6 @@ def do_Do(self, xobjid_arg: PDFObject) -> Iterator[ContentObject]:
except TypeError as e:
log.debug("Empty or invalid xobject with id %r: %s", xobjid, e)
return
log.debug("Processing xobj: %r", xobj)
subtype = xobj.get("Subtype")
if subtype is LITERAL_FORM and "BBox" in xobj:
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
Expand Down
8 changes: 0 additions & 8 deletions playa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, Token]:
# PDF 1.7 sec 7.3.4.2: If the character following
# the REVERSE SOLIDUS is not one of those shown in
# Table 3, the REVERSE SOLIDUS shall be ignored.
log.debug("Unrecognized escape %r", m[0])
parts.append(chr)
else:
parts.append(bytes((ESC_STRING[chr],)))
Expand Down Expand Up @@ -351,7 +350,6 @@ def __next__(self) -> StackEntry:
if self.stack and top is None:
return self.stack.pop()
(pos, token) = self.nexttoken()
log.debug("token at %d: %r", pos, token)
if token is KEYWORD_ARRAY_BEGIN:
if top is None:
top = pos
Expand Down Expand Up @@ -456,10 +454,8 @@ def __next__(self) -> StackEntry:
# Try again with just plain b"EI"
self.seek(idpos + len(KEYWORD_ID.name) + 1)
(eipos, data) = self.get_inline_data(target=b"EI")
log.debug("data at %d: %r", eipos, data)
data = re.sub(rb"(?:\r\n|[\r\n])$", b"", data[: -len(eos)])
else:
log.debug("data at %d: %r", eipos, data)
data = re.sub(rb"\r$", b"", data[: -len(eos)])
else:
# Note absence of + 1 here (the "Unless" above)
Expand All @@ -474,7 +470,6 @@ def __next__(self) -> StackEntry:
if eipos == -1:
raise PDFSyntaxError("End of inline stream %r not found" % eos)
obj = InlineImage(dic, data)
log.debug("InlineImage @ %d: %r", pos, obj)
# Inline images must occur at the top level, otherwise
# something is wrong (probably a corrupt file)
assert (
Expand Down Expand Up @@ -575,7 +570,6 @@ def __next__(self) -> Tuple[int, IndirectObject]:
obj: Union[PDFObject, ContentStream]
while True:
pos, obj = next(self._parser)
log.debug("pos %r obj %r stack %r", pos, obj, self.trailer)
if obj is KEYWORD_OBJ:
pass
elif isinstance(obj, PSKeyword) and obj.name.startswith(b"endobj"):
Expand Down Expand Up @@ -641,7 +635,6 @@ def __next__(self) -> Tuple[int, IndirectObject]:
# marker after the data and before endstream; this
# marker shall not be included in the stream length.
linepos, line = self._parser.nextline()
log.debug("After stream data: %r %r", linepos, line)
if self.strict:
# In reality there usually is no end-of-line
# marker. We will nonetheless warn if there's
Expand All @@ -660,7 +653,6 @@ def __next__(self) -> Tuple[int, IndirectObject]:
objlen += len(line)
data += line
linepos, line = self._parser.nextline()
log.debug("After stream data: %r %r", linepos, line)
if line == b"": # Means EOF
log.warning(
"Incorrect length for stream, no 'endstream' found"
Expand Down

0 comments on commit 029cb0b

Please sign in to comment.