diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a0a63c3c..8ebbde55 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,6 +10,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + submodules: true - name: Set up Python uses: actions/setup-python@v5 with: diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..b0d1051e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "tests/3rdparty/pdfplumber"] + path = samples/3rdparty/pdfplumber + url = https://github.com/jsvine/pdfplumber.git +[submodule "tests/3rdparty/pdf.js"] + path = samples/3rdparty/pdf.js + url = https://github.com/mozilla/pdf.js.git diff --git a/playa/cmapdb.py b/playa/cmapdb.py index 71dcf88c..3ce8c097 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -209,6 +209,8 @@ def _load_data(cls, name: str) -> Any: def get_cmap(cls, name: str) -> CMapBase: if name == "Identity-H": return IdentityCMap(WMode=0) + elif name == "Adobe-Identity-UCS": + return IdentityCMap(WMode=0) # FIXME: WMode??? elif name == "Identity-V": return IdentityCMap(WMode=1) elif name == "OneByteIdentityH": diff --git a/playa/color.py b/playa/color.py index aca0f951..8ceab4a2 100644 --- a/playa/color.py +++ b/playa/color.py @@ -1,8 +1,7 @@ from typing import Dict, NamedTuple, Union, Tuple -from playa.exceptions import PDFInterpreterError from playa.parser import LIT, PDFObject, PSLiteral -from playa.pdftypes import num_value, list_value, literal_name, stream_value +from playa.pdftypes import num_value, list_value, literal_name, stream_value, resolve1 LITERAL_DEVICE_GRAY = LIT("DeviceGray") LITERAL_DEVICE_RGB = LIT("DeviceRGB") @@ -37,15 +36,12 @@ class ColorSpace(NamedTuple): spec: PDFObject = None def make_color(self, *components) -> Color: - if len(components) != self.ncomponents: - raise PDFInterpreterError( - "%s requires %d components, got %d!" - % (self.name, self.ncomponents, len(components)) - ) - nc = self.ncomponents pattern = None - if isinstance(components[-1], PSLiteral): + nc = self.ncomponents + if components and isinstance(components[-1], PSLiteral): pattern = components[-1].name + components = components[:-1] + # Remove the pattern we added to ncomponents nc -= 1 cc = [] for x in components[:nc]: @@ -93,14 +89,14 @@ def get_colorspace( raise ValueError( "Underlying colour space cannot be /Pattern: %r" % (spec,) ) - underlying = get_colorspace(spec[1]) + underlying = get_colorspace(resolve1(spec[1])) if underlying is None: raise ValueError("Unrecognized underlying colour space: %r", (spec,)) # Not super important what we call it but we need to know it # has N+1 "components" (the last one being the pattern) return ColorSpace(name, underlying.ncomponents + 1, spec) else: - cs = PREDEFINED_COLORSPACE.get(literal_name(spec[0])) + cs = PREDEFINED_COLORSPACE.get(literal_name(resolve1(spec[0]))) if cs is None: return None return ColorSpace(cs.name, cs.ncomponents, spec) diff --git a/playa/document.py b/playa/document.py index b6436208..96817733 100644 --- a/playa/document.py +++ b/playa/document.py @@ -8,13 +8,11 @@ import mmap import re import struct -from collections import deque from hashlib import md5, sha256, sha384, sha512 from typing import ( Any, BinaryIO, Callable, - Deque, Dict, Iterable, Iterator, @@ -50,9 +48,9 @@ schema as page_schema, ) from playa.parser import ( - KEYWORD_OBJ, KEYWORD_TRAILER, KEYWORD_XREF, + KEYWORD_OBJ, LIT, IndirectObject, IndirectObjectParser, @@ -211,6 +209,9 @@ def __repr__(self) -> str: def _load(self, parser: IndirectObjectParser) -> None: parser.seek(0) parser.reset() + doc = None if parser.doc is None else parser.doc() + if doc is None: + raise RuntimeError("Document no longer exists!") # Get all the objects for pos, obj in parser: self.offsets[obj.objid] = XRefPos(None, pos, obj.genno) @@ -225,9 +226,6 @@ def _load(self, parser: IndirectObjectParser) -> None: except KeyError: log.warning("N is not defined in object stream: %r", stream) n = 0 - doc = None if parser.doc is None else parser.doc() - if doc is None: - raise RuntimeError("Document no longer exists!") parser1 = ObjectParser(stream.buffer, doc) objs: List = [obj for _, obj in parser1] # FIXME: This is choplist @@ -244,10 +242,17 @@ def _load(self, parser: IndirectObjectParser) -> None: _, dic = s2 self.trailer.update(dict_value(dic)) log.debug("trailer=%r", self.trailer) - break + return s1 = s2 - else: - log.warning("b'trailer' not found in document") + # If not, then try harder + for pos, line in reverse_iter_lines(parser.buffer): + line = line.strip() + if line == b"trailer": + _, trailer = next(ObjectParser(parser.buffer, doc, pos + len(b"trailer"))) + self.trailer.update(trailer) + log.debug("trailer=%r", self.trailer) + return + log.warning("b'trailer' not found in document") @property def objids(self) -> Iterable[int]: @@ -508,7 +513,8 @@ def init_params(self) -> None: raise PDFEncryptionError(error_msg) self.cfm = {} for k, v in self.cf.items(): - f = self.get_cfm(literal_name(v["CFM"])) + dictv = dict_value(v) + f = self.get_cfm(literal_name(dictv["CFM"])) if f is None: error_msg = "Unknown crypt filter method: param=%r" % self.param raise PDFEncryptionError(error_msg) @@ -833,13 +839,14 @@ def __init__( except PDFSyntaxError: log.warning("PDF header not found, will try to read the file anyway") self.pdf_version = "UNKNOWN" + # Make sure we read the whole file if we need to read the file! + fp.seek(0, 0) try: self.buffer: Union[bytes, mmap.mmap] = mmap.mmap( fp.fileno(), 0, access=mmap.ACCESS_READ ) except io.UnsupportedOperation: log.warning("mmap not supported on %r, reading document into memory", fp) - fp.seek(0, 0) self.buffer = fp.read() except ValueError: raise @@ -853,7 +860,7 @@ def __init__( try: pos = self._find_xref() self._read_xref_from(pos, self.xrefs) - except (ValueError, IndexError) as e: + except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e: log.debug("Using fallback XRef parsing: %s", e) newxref = XRefFallback(self.parser) self.xrefs.append(newxref) @@ -874,7 +881,10 @@ def __init__( self.encryption = (id_value, dict_value(trailer["Encrypt"])) self._initialize_password(password) if "Info" in trailer: - self.info.append(dict_value(trailer["Info"])) + try: + self.info.append(dict_value(trailer["Info"])) + except TypeError: + log.warning("Info is a broken reference (incorrect xref table?)") if "Root" in trailer: # Every PDF file must have exactly one /Root dictionary. try: @@ -982,30 +992,30 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject: self.parser.seek(pos) try: _, obj = next(self.parser) - except (ValueError, IndexError) as e: + if obj.objid != objid: + raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}") + except (ValueError, IndexError, PDFSyntaxError) as e: log.warning( "Indirect object %d not found at position %d: %r", objid, pos, e ) - # Hack around malformed pdf files where the offset in the + # In case of malformed pdf files where the offset in the # xref table doesn't point exactly at the object - # definition (probably more frequent than you think). - # Back up a bit, then parse forward until we find the right - # object. Fixes - # https://github.com/pdfminer/pdfminer.six/issues/56 - tokenizer = Lexer(self.buffer, max(0, pos - 16)) - q: Deque[int] = deque([], 3) - while True: - try: - (pos, token) = next(tokenizer) - except StopIteration: - raise PDFSyntaxError( - f"Indirect object {objid!r} not found at or after position {pos}" - ) - q.append(pos) - if len(q) == 3 and token is KEYWORD_OBJ: - break - log.debug("seeking to %r", q[0]) - self.parser.seek(q[0]) + # definition (probably more frequent than you think), just + # use a regular expression to find the object because we + # can do that. + realpos = -1 + lastgen = -1 + for m in re.finditer(rb"%d\s+(\d+)\s+obj" % objid, self.buffer): + genno = int(m.group(1)) + if genno > lastgen: + lastgen = genno + realpos = m.start(0) + if realpos == -1: + raise PDFSyntaxError( + f"Indirect object {objid!r} not found in document" + ) from e + log.debug("found object (%r) seeking to %r", m.group(0), realpos) + self.parser.seek(realpos) (_, obj) = next(self.parser) if obj.objid != objid: raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}") @@ -1038,7 +1048,12 @@ def __getitem__(self, objid: int) -> Any: else: obj = self._getobj_parse(index, objid) break - except (StopIteration, PDFSyntaxError): + # FIXME: We might not actually want to catch these... + except StopIteration: + log.debug("EOF when searching for object %d", objid) + continue + except PDFSyntaxError as e: + log.debug("Syntax error when searching for object %d: %s", objid, e) continue if obj is None: raise IndexError(f"Object with ID {objid} not found") @@ -1050,14 +1065,14 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font: if objid and objid in self._cached_fonts: return self._cached_fonts[objid] log.debug("get_font: create: objid=%r, spec=%r", objid, spec) - if spec["Type"] is not LITERAL_FONT: + if spec.get("Type") is not LITERAL_FONT: log.warning("Font specification Type is not /Font: %r", spec) # Create a Font object. if "Subtype" in spec: subtype = literal_name(spec["Subtype"]) else: log.warning("Font specification Subtype is not specified: %r", spec) - subtype = "Type1" + subtype = "" if subtype in ("Type1", "MMType1"): # Type1 Font font: Font = Type1Font(spec) @@ -1075,6 +1090,7 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font: dfonts = list_value(spec["DescendantFonts"]) assert dfonts subspec = dict_value(dfonts[0]).copy() + # FIXME: Bad tightly coupled with internals of CIDFont for k in ("Encoding", "ToUnicode"): if k in spec: subspec[k] = resolve1(spec[k]) @@ -1255,19 +1271,27 @@ def _find_xref(self) -> int: """Internal function used to locate the first XRef.""" # search the last xref table by scanning the file backwards. prev = b"" - # FIXME: This will scan *the whole file* looking for an xref - # table, it should maybe give up sooner? - for line in reverse_iter_lines(self.buffer): + for pos, line in reverse_iter_lines(self.buffer): line = line.strip() log.debug("find_xref: %r", line) if line == b"startxref": log.debug("xref found: pos=%r", prev) if not prev.isdigit(): - raise ValueError(f"Invalid xref position: {prev!r}") + log.warning("Invalid startxref position: %r", prev) + continue start = int(prev) if not start >= 0: - raise ValueError(f"Invalid negative xref position: {start}") + log.warning("Invalid negative startxref position: %d", start) + continue + elif start > pos: + log.warning("Invalid startxref position (> %d): %d", pos, start) + continue return start + elif line == b"xref": + return pos + elif line == b"endobj": + # Okay, we're probably not in Kansas anymore... + break if line: prev = line raise ValueError("No xref table found at end of file") @@ -1285,15 +1309,23 @@ def _read_xref_from( except StopIteration: raise ValueError("Unexpected EOF at {start}") log.debug("read_xref_from: start=%d, token=%r", start, token) - if isinstance(token, int): - # XRefStream: PDF-1.5 - self.parser.seek(pos) - self.parser.reset() - xref: XRef = XRefStream(self.parser) - else: - if token is KEYWORD_XREF: - parser.nextline() + if token is KEYWORD_XREF: + parser.nextline() xref = XRefTable(parser) + else: + # It might be an XRefStream, if this is an indirect object... + _, token2 = parser.nexttoken() + _, token3 = parser.nexttoken() + if token3 is KEYWORD_OBJ: + # XRefStream: PDF-1.5 + self.parser.seek(pos) + self.parser.reset() + xref: XRef = XRefStream(self.parser) + else: + # Well, maybe it's an XRef table without "xref" (but + # probably not) + parser.seek(pos) + xref = XRefTable(parser) xrefs.append(xref) trailer = xref.trailer # For hybrid-reference files, an additional set of xrefs as a @@ -1319,10 +1351,10 @@ def __init__(self, doc: Document): self._pages = [] self._labels: Dict[str, Page] = {} try: - itor = doc._get_page_objects() - except KeyError: - itor = doc._get_pages_from_xrefs() - for page_idx, ((objid, properties), label) in enumerate(zip(itor, page_labels)): + page_objects = list(doc._get_page_objects()) + except (KeyError, IndexError): + page_objects = list(doc._get_pages_from_xrefs()) + for page_idx, ((objid, properties), label) in enumerate(zip(page_objects, page_labels)): page = Page(doc, objid, properties, label, page_idx, doc.space) self._pages.append(page) if label is not None: diff --git a/playa/font.py b/playa/font.py index 3399b6ed..018b09a4 100644 --- a/playa/font.py +++ b/playa/font.py @@ -50,7 +50,15 @@ resolve_all, stream_value, ) -from playa.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack +from playa.utils import ( + Matrix, + Point, + Rect, + apply_matrix_norm, + choplist, + nunpack, + decode_text, +) log = logging.getLogger(__name__) @@ -1032,11 +1040,13 @@ def __init__( log.warning("Font spec is missing BaseFont: %r", spec) self.basefont = "unknown" self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {})) - cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode( - "latin1", + # These are *supposed* to be ASCII (PDF 1.7 section 9.7.3), + # but for whatever reason they are sometimes UTF-16BE + cid_registry = decode_text( + resolve1(self.cidsysteminfo.get("Registry", b"unknown")) ) - cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode( - "latin1", + cid_ordering = decode_text( + resolve1(self.cidsysteminfo.get("Ordering", b"unknown")) ) self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}" self.cmap: CMapBase = self.get_cmap_from_spec(spec) @@ -1051,11 +1061,18 @@ def __init__( self.fontfile = stream_value(descriptor.get("FontFile2")) ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.buffer)) self.unicode_map: Optional[UnicodeMap] = None + # FIXME: This is magical and means that we are not actually a + # CIDFont but really a Type0 font. if "ToUnicode" in spec: if isinstance(spec["ToUnicode"], ContentStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = parse_tounicode(strm.buffer) - else: + if isinstance(spec["Encoding"], ContentStream): + strm = stream_value(spec["Encoding"]) + # FIXME: it's not a tounicode, but it plays one on TV + # _ = parse_tounicode(strm.buffer) + + if self.unicode_map is None: cmap_name = literal_name(spec["ToUnicode"]) encoding = literal_name(spec["Encoding"]) if ( diff --git a/playa/page.py b/playa/page.py index 62f8a7c4..e2448b63 100644 --- a/playa/page.py +++ b/playa/page.py @@ -34,8 +34,8 @@ get_colorspace, ) from playa.exceptions import ( - PDFInterpreterError, PDFUnicodeNotDefined, + PDFSyntaxError, ) from playa.font import Font @@ -67,7 +67,6 @@ get_transformed_bound, make_compat_bytes, mult_matrix, - parse_rect, normalize_rect, translate_matrix, ) @@ -98,6 +97,17 @@ def Object(*args, **kwargs): ... DeviceSpace = Literal["page", "screen", "user"] +# FIXME: This should go in utils/pdftypes but there are circular imports +def parse_rect(o: PDFObject) -> Rect: + try: + (x0, y0, x1, y1) = (num_value(x) for x in list_value(o)) + return x0, y0, x1, y1 + except ValueError: + raise ValueError("Could not parse rectangle %r" % (o,)) + except TypeError: + raise PDFSyntaxError("Rectangle contains non-numeric values") + + class Page: """An object that holds the information about a page. @@ -137,38 +147,39 @@ def __init__( self.page_idx = page_idx self.space = space self.lastmod = resolve1(self.attrs.get("LastModified")) - self.resources: Dict[str, PDFObject] = dict_value( - self.attrs.get("Resources", {}) - ) - if "MediaBox" in self.attrs: - self.mediabox = normalize_rect( - parse_rect(resolve1(val) for val in resolve1(self.attrs["MediaBox"])) + try: + self.resources: Dict[str, PDFObject] = dict_value( + self.attrs.get("Resources") ) + except TypeError: + log.warning("Resources missing or invalid from Page id %d", pageid) + self.resources = {} + if "MediaBox" in self.attrs: + self.mediabox = normalize_rect(parse_rect(self.attrs["MediaBox"])) else: log.warning( - "MediaBox missing from /Page (and not inherited)," - " defaulting to US Letter (612x792)" + "MediaBox missing from Page id %d (and not inherited)," + " defaulting to US Letter (612x792)", pageid ) self.mediabox = (0, 0, 612, 792) self.cropbox = self.mediabox if "CropBox" in self.attrs: try: - self.cropbox = normalize_rect( - parse_rect(resolve1(val) for val in resolve1(self.attrs["CropBox"])) - ) + self.cropbox = normalize_rect(parse_rect(self.attrs["CropBox"])) except ValueError: log.warning("Invalid CropBox in /Page, defaulting to MediaBox") self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360 self.annots = self.attrs.get("Annots") self.beads = self.attrs.get("B") - if "Contents" in self.attrs: - self._contents: List[PDFObject] = resolve1(self.attrs["Contents"]) - assert self._contents is not None - if not isinstance(self._contents, list): - self._contents = [self._contents] - else: + contents = resolve1(self.attrs.get("Contents")) + if contents is None: self._contents = [] + else: + if isinstance(contents, list): + self._contents = contents + else: + self._contents = [contents] @property def streams(self) -> Iterator[ContentStream]: @@ -737,8 +748,12 @@ def init_resources(self, page: Page, resources: Dict) -> None: objid = None if isinstance(spec, ObjRef): objid = spec.objid - spec = dict_value(spec) - self.fontmap[fontid] = doc.get_font(objid, spec) + try: + spec = dict_value(spec) + self.fontmap[fontid] = doc.get_font(objid, spec) + except TypeError: + log.warning("Broken/missing font spec for %r", fontid) + self.fontmap[fontid] = doc.get_font(objid, {}) elif k == "ColorSpace": for csid, spec in dict_value(v).items(): colorspace = get_colorspace(resolve1(spec), csid) @@ -1208,11 +1223,11 @@ def __iter__(self) -> Iterator[LayoutDict]: if len(args) == nargs: gen = method(*args) else: - error_msg = ( - "Insufficient arguments (%d) for operator: %r" - % (len(args), obj) + log.warning( + "Insufficient arguments (%d) for operator: %r", + len(args), + obj, ) - raise PDFInterpreterError(error_msg) else: log.debug("exec: %r", obj) gen = method() @@ -2121,17 +2136,18 @@ def __iter__(self) -> Iterator[ContentObject]: if len(args) == nargs: gen = method(*args) else: - error_msg = ( - "Insufficient arguments (%d) for operator: %r" - % (len(args), obj) + log.warning( + "Insufficient arguments (%d) for operator: %r", + len(args), + obj, ) - raise PDFInterpreterError(error_msg) else: log.debug("exec: %r", obj) gen = method() if gen is not None: yield from gen else: + # TODO: This can get very verbose log.warning("Unknown operator: %r", obj) else: self.push(obj) @@ -2427,6 +2443,9 @@ def do_Do(self, xobjid_arg: PDFObject) -> Iterator[ContentObject]: except KeyError: log.debug("Undefined xobject id: %r", xobjid) return + except TypeError as e: + log.debug("Empty or invalid xobject with id %r: %s", xobjid, e) + return log.debug("Processing xobj: %r", xobj) subtype = xobj.get("Subtype") if subtype is LITERAL_FORM and "BBox" in xobj: diff --git a/playa/parser.py b/playa/parser.py index 964e2320..ce667414 100644 --- a/playa/parser.py +++ b/playa/parser.py @@ -77,7 +77,7 @@ } -def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[bytes]: +def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[Tuple[int, bytes]]: """Iterate backwards over lines starting at the current position. This is used to locate the trailers at the end of a file. @@ -87,10 +87,9 @@ def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[bytes]: nidx = buffer.rfind(b"\n", 0, pos) ridx = buffer.rfind(b"\r", 0, pos) best = max(nidx, ridx) + yield best + 1, buffer[best + 1 : endline] if best == -1: - yield buffer[:endline] break - yield buffer[best + 1 : endline] endline = best + 1 pos = best if pos > 0 and buffer[pos - 1 : pos + 1] == b"\r\n": @@ -561,6 +560,7 @@ def __init__( strict: bool = False, ) -> None: self._parser = ObjectParser(data, doc) + self.buffer = data self.trailer: List[Tuple[int, Union[PDFObject, ContentStream]]] = [] self.doc = None if doc is None else weakref.ref(doc) self.strict = strict @@ -580,8 +580,13 @@ def __next__(self) -> Tuple[int, IndirectObject]: (_, obj) = self.trailer.pop() (_, genno) = self.trailer.pop() (pos, objid) = self.trailer.pop() - objid = int_value(objid) - genno = int_value(genno) + try: + objid = int_value(objid) + genno = int_value(genno) + except TypeError as e: + raise PDFSyntaxError( + f"Object numbers must be integers, got {objid!r} {genno!r}" + ) from e # ContentStream is *special* and needs these # internally for decryption. if isinstance(obj, ContentStream): @@ -652,7 +657,7 @@ def __next__(self) -> Tuple[int, IndirectObject]: log.debug("After stream data: %r %r", linepos, line) if line == b"": # Means EOF log.warning( - "Incorrect legnth for stream, no 'endstream' found" + "Incorrect length for stream, no 'endstream' found" ) break doc = None if self.doc is None else self.doc() diff --git a/playa/utils.py b/playa/utils.py index cb3b7f41..accc7a60 100644 --- a/playa/utils.py +++ b/playa/utils.py @@ -2,7 +2,6 @@ import string from typing import ( - Any, Iterable, Iterator, List, @@ -11,8 +10,6 @@ Union, ) -from playa.exceptions import PDFSyntaxError - def make_compat_bytes(in_str: str) -> bytes: """Converts to bytes, encoding to unicode.""" @@ -180,16 +177,6 @@ def apply_png_predictor( MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0) -def parse_rect(o: Any) -> Rect: - try: - (x0, y0, x1, y1) = o - return float(x0), float(y0), float(x1), float(y1) - except ValueError: - raise ValueError("Could not parse rectangle") - except TypeError: - raise PDFSyntaxError("Rectangle contains non-numeric values") - - def normalize_rect(r: Rect) -> Rect: (x0, y0, x1, y1) = r if x1 < x0: diff --git a/pyproject.toml b/pyproject.toml index 6fc6c598..186bd3e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,13 @@ playa = "playa.cli:main" [tool.hatch.version] path = "playa/__init__.py" +[tool.hatch.build.targets.sdist] +exclude = [ + "/.github", + "/samples/contrib", + "/samples/3rdparty", +] + [tool.hatch.build.targets.wheel] packages = ["playa"] @@ -53,6 +60,9 @@ config-path = "none" # Disable hatch's unreasonable ruff defaults [tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" +[tool.ruff] +exclude = ["samples/3rdparty"] # why no leading slash? + [tool.pytest.ini_options] testpaths = [ "tests" ] diff --git a/samples/3rdparty/pdf.js b/samples/3rdparty/pdf.js new file mode 160000 index 00000000..d4489531 --- /dev/null +++ b/samples/3rdparty/pdf.js @@ -0,0 +1 @@ +Subproject commit d4489531668b172a5ffd4ac6a29c3eb856af79a6 diff --git a/samples/3rdparty/pdfplumber b/samples/3rdparty/pdfplumber new file mode 160000 index 00000000..e921ea74 --- /dev/null +++ b/samples/3rdparty/pdfplumber @@ -0,0 +1 @@ +Subproject commit e921ea748b245b9686540fb89b0da0c4125c31ec diff --git a/samples/bogus-stream-length.pdf b/samples/bogus-stream-length.pdf deleted file mode 100644 index 12a4c5dc..00000000 Binary files a/samples/bogus-stream-length.pdf and /dev/null differ diff --git "a/samples/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf" "b/samples/contrib/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf" similarity index 100% rename from "samples/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf" rename to "samples/contrib/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf" diff --git a/samples/2023-06-20-PV.pdf b/samples/contrib/2023-06-20-PV.pdf similarity index 100% rename from samples/2023-06-20-PV.pdf rename to samples/contrib/2023-06-20-PV.pdf diff --git a/samples/pdf.js/basicapi.pdf b/samples/contrib/basicapi.pdf similarity index 100% rename from samples/pdf.js/basicapi.pdf rename to samples/contrib/basicapi.pdf diff --git a/samples/pdfplumber/issue-1181.pdf b/samples/contrib/issue-1181.pdf similarity index 100% rename from samples/pdfplumber/issue-1181.pdf rename to samples/contrib/issue-1181.pdf diff --git a/samples/pdf.js/issue620f.pdf b/samples/contrib/issue620f.pdf similarity index 100% rename from samples/pdf.js/issue620f.pdf rename to samples/contrib/issue620f.pdf diff --git a/samples/contrib/test_pdf_with_tiff_predictor.pdf b/samples/test_pdf_with_tiff_predictor.pdf similarity index 100% rename from samples/contrib/test_pdf_with_tiff_predictor.pdf rename to samples/test_pdf_with_tiff_predictor.pdf diff --git a/tests/3rdparty/test_pdf_js.py b/tests/3rdparty/test_pdf_js.py deleted file mode 100644 index 57a835ba..00000000 --- a/tests/3rdparty/test_pdf_js.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Run pdf.js testsuite if present. -""" diff --git a/tests/3rdparty/test_pdfplumber.py b/tests/3rdparty/test_pdfplumber.py deleted file mode 100644 index 5fd38217..00000000 --- a/tests/3rdparty/test_pdfplumber.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Test pdfplumber (using playa branch) if present. -""" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/data.py b/tests/data.py new file mode 100644 index 00000000..405a961b --- /dev/null +++ b/tests/data.py @@ -0,0 +1,54 @@ +""" +Lists of data files and directories to be shared by various tests. +""" + +from pathlib import Path +import json + +TESTDIR = Path(__file__).parent.parent / "samples" +SUBDIRS = ["acroform", "encryption", "scancode"] +BASEPDFS = list(TESTDIR.glob("*.pdf")) +for name in SUBDIRS: + BASEPDFS.extend((TESTDIR / name).glob("*.pdf")) +CONTRIB = TESTDIR / "contrib" +if CONTRIB.exists(): + BASEPDFS.extend(CONTRIB.glob("*.pdf")) + +ALLPDFS = list(BASEPDFS) +PLUMBERS = TESTDIR / "3rdparty" / "pdfplumber" / "tests" / "pdfs" +if PLUMBERS.exists(): + ALLPDFS.extend(PLUMBERS.glob("*.pdf")) +PDFJS = TESTDIR / "3rdparty" / "pdf.js" / "test" +try: + with open(PDFJS / "test_manifest.json") as infh: + manifest = json.load(infh) + for entry in manifest: + path = PDFJS / entry["file"] + if path.exists(): + ALLPDFS.append(path) +except FileNotFoundError: + pass + +PASSWORDS = { + "base.pdf": ["foo"], + "rc4-40.pdf": ["foo"], + "rc4-128.pdf": ["foo"], + "aes-128.pdf": ["foo"], + "aes-128-m.pdf": ["foo"], + "aes-256.pdf": ["foo"], + "aes-256-m.pdf": ["foo"], + "aes-256-r6.pdf": ["usersecret", "ownersecret"], +} +XFAILS = { + "empty.pdf", + # pdf.js accepts these... maybe some day we will but they are + # really rather broken. + "issue9418.pdf", + "bug1250079.pdf", + # FIXME: These can likely be fixed by correctly parsing CMaps + # (which should also be submitted as a PR to pdfminer.six) + "issue9915_reduced.pdf", + "issue2931.pdf", + "issue9534_reduced.pdf", + "issue18117.pdf", +} diff --git a/tests/test_document.py b/tests/test_document.py index 3cae4852..7ea8576e 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -13,6 +13,7 @@ from playa.exceptions import PDFSyntaxError from playa.parser import LIT from playa.utils import decode_text +from .data import CONTRIB TESTDIR = Path(__file__).parent.parent / "samples" @@ -54,14 +55,15 @@ def test_objects(): # assert objects[0].obj is doc[1] +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_page_labels(): - with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc: + with playa.open(CONTRIB / "pagelabels.pdf") as doc: labels = [label for _, label in zip(range(10), doc.page_labels)] assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"] assert doc.pages["iii"] is doc.pages[0] assert doc.pages["iv"] is doc.pages[1] assert doc.pages["2"] is doc.pages[3] - with playa.open("samples/2023-06-20-PV.pdf") as doc: + with playa.open(CONTRIB / "2023-06-20-PV.pdf") as doc: assert doc.pages["1"] is doc.pages[0] with pytest.raises(KeyError): _ = doc.pages["3"] @@ -69,8 +71,9 @@ def test_page_labels(): _ = doc.pages[2] +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_pages(): - with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc: + with playa.open(CONTRIB / "PSC_Station.pdf") as doc: page_objects = list(doc.pages) assert len(page_objects) == 15 objects = list(page_objects[2].contents) @@ -79,8 +82,9 @@ def test_pages(): assert b"diversit\xe9 " in tokens +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_names(): - with playa.open(TESTDIR / "contrib" / "issue-625-identity-cmap.pdf") as doc: + with playa.open(CONTRIB / "issue-625-identity-cmap.pdf") as doc: ef = NameTree(doc.names["EmbeddedFiles"]) # Because yes, they can be UTF-16... (the spec says nothing # about this but it appears some authoring tools assume that @@ -92,15 +96,17 @@ def test_names(): assert names == ["382901691/01_UBL.xml", "382901691/02_EAN_UCC.xml"] +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_dests(): - with playa.open(TESTDIR / "pdf.js" / "issue620f.pdf") as doc: + with playa.open(CONTRIB / "issue620f.pdf") as doc: names = [name for name, _ in doc.dests] assert names == ["Page.1", "Page.2"] +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_outlines(): with playa.open( - "samples/2023-04-06-ODJ et Résolutions-séance xtra 6 avril 2023.pdf" + CONTRIB / "2023-04-06-ODJ et Résolutions-séance xtra 6 avril 2023.pdf" ) as doc: titles = [o.title for o in doc.outlines] assert titles == [ @@ -112,8 +118,9 @@ def test_outlines(): ] +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_xobjects() -> None: - with playa.open(TESTDIR / "pdf.js" / "basicapi.pdf") as doc: + with playa.open(CONTRIB / "basicapi.pdf") as doc: page = doc.pages[0] xobj = next(page.xobjects) assert xobj.object_type == "xobject" diff --git a/tests/test_indirect_objects.py b/tests/test_indirect_objects.py index 8adac3d1..085b5264 100644 --- a/tests/test_indirect_objects.py +++ b/tests/test_indirect_objects.py @@ -2,7 +2,7 @@ import pytest -from playa.parser import LIT, ContentStream, IndirectObjectParser +from playa.parser import LIT, ContentStream, IndirectObjectParser, PDFSyntaxError TESTDIR = Path(__file__).parent.parent / "samples" @@ -67,6 +67,6 @@ def test_streams(): assert stream.rawdata == b"150 250 m\n150 350 l\nS\nA BUNCH OF EXTRA CRAP!!!\n" parser = IndirectObjectParser(DATA2, strict=True) - with pytest.raises(TypeError) as e: + with pytest.raises(PDFSyntaxError) as e: positions, objs = zip(*list(parser)) assert "Integer" in e diff --git a/tests/test_lazy_api.py b/tests/test_lazy_api.py index ca263005..52ebc80a 100644 --- a/tests/test_lazy_api.py +++ b/tests/test_lazy_api.py @@ -10,26 +10,13 @@ from playa.color import PREDEFINED_COLORSPACE, Color from playa.exceptions import PDFEncryptionError -TESTDIR = Path(__file__).parent.parent / "samples" -ALLPDFS = TESTDIR.glob("**/*.pdf") -PASSWORDS = { - "base.pdf": ["foo"], - "rc4-40.pdf": ["foo"], - "rc4-128.pdf": ["foo"], - "aes-128.pdf": ["foo"], - "aes-128-m.pdf": ["foo"], - "aes-256.pdf": ["foo"], - "aes-256-m.pdf": ["foo"], - "aes-256-r6.pdf": ["usersecret", "ownersecret"], -} -XFAILS = { - "bogus-stream-length.pdf", -} +from .data import TESTDIR, ALLPDFS, PASSWORDS, XFAILS, CONTRIB +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_content_objects(): """Ensure that we can produce all the basic content objects.""" - with playa.open(TESTDIR / "2023-06-20-PV.pdf", space="page") as pdf: + with playa.open(CONTRIB / "2023-06-20-PV.pdf", space="page") as pdf: page = pdf.pages[0] img = next(page.images) assert img.colorspace.name == "ICCBased" @@ -100,10 +87,11 @@ def test_uncoloured_tiling() -> None: assert path.gstate.ncolor == Color((0.5, 0.2, 1.0), "P1") +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_rotated_glyphs() -> None: """Verify that we (unlike pdfminer) properly calculate the bbox for rotated text.""" - with playa.open(TESTDIR / "contrib" / "issue_495_pdfobjref.pdf") as pdf: + with playa.open(CONTRIB / "issue_495_pdfobjref.pdf") as pdf: chars = [] for text in pdf.pages[0].texts: for glyph in text: @@ -114,7 +102,3 @@ def test_rotated_glyphs() -> None: width = x1 - x0 assert width > 6 assert "".join(chars) == "R18,00" - - -if __name__ == "__main__": - test_content_objects() diff --git a/tests/test_object_parser.py b/tests/test_object_parser.py index eda85135..ff59916a 100644 --- a/tests/test_object_parser.py +++ b/tests/test_object_parser.py @@ -161,9 +161,8 @@ def test_parser_miner(): def test_revlines() -> None: """Verify that we replicate the old revreadlines method.""" - expected = list(reversed([line for pos, line in EXPECTED2])) output = list(reverse_iter_lines(TESTDATA2)) - assert output == expected + assert output == list(reversed(EXPECTED2)) SIMPLE1 = b"""1 0 obj diff --git a/tests/test_open.py b/tests/test_open.py index 8a1910b3..f737f7b3 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -14,34 +14,27 @@ pdfminer = None # type: ignore import playa from playa.exceptions import PDFEncryptionError, PDFSyntaxError +from .data import TESTDIR, BASEPDFS, PASSWORDS, XFAILS, CONTRIB -TESTDIR = Path(__file__).parent.parent / "samples" -ALLPDFS = TESTDIR.glob("**/*.pdf") -PASSWORDS = { - "base.pdf": ["foo"], - "rc4-40.pdf": ["foo"], - "rc4-128.pdf": ["foo"], - "aes-128.pdf": ["foo"], - "aes-128-m.pdf": ["foo"], - "aes-256.pdf": ["foo"], - "aes-256-m.pdf": ["foo"], - "aes-256-r6.pdf": ["usersecret", "ownersecret"], -} +# We know pdfminer.six gives different output for these and we don't +# care (generally because of PLAYA's better rectangle detection and +# correct bboxes for rotated glyphs) PDFMINER_BUGS = { "issue-449-vertical.pdf", "issue_495_pdfobjref.pdf", "issue-1008-inline-ascii85.pdf", "rotated.pdf", -} -XFAILS = { - "bogus-stream-length.pdf", + "issue-1114-dedupe-chars.pdf", + "malformed-from-issue-932.pdf", + "mcid_example.pdf", } +# Only do "base" PDFs as we know pdfminer has issues with others @pytest.mark.skipif(pdfminer is None, reason="pdfminer.six is not installed") -@pytest.mark.parametrize("path", ALLPDFS, ids=str) +@pytest.mark.parametrize("path", BASEPDFS, ids=str) def test_open(path: Path) -> None: - """Open all the documents and compare with pdfplumber""" + """Open all the documents and compare with pdfminer""" if path.name in XFAILS: pytest.xfail("Intentionally corrupt file: %s" % path.name) from pdfminer.converter import PDFPageAggregator @@ -91,15 +84,17 @@ def convert_miner(layout): assert beach == miner +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_inline_data() -> None: - with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc: + with playa.open(CONTRIB / "issue-1008-inline-ascii85.pdf") as doc: page = doc.pages[0] items = list(page.layout) assert len(items) == 456 +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_redundant_h() -> None: - with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc: + with playa.open(CONTRIB / "issue-1008-inline-ascii85.pdf") as doc: page = doc.pages[0] rects = [item for item in page.layout if item["object_type"] == "rect"] assert len(rects) == 6 @@ -113,8 +108,9 @@ def test_multiple_contents() -> None: assert len(items) == 898 +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_xobjects() -> None: - with playa.open(TESTDIR / "pdf.js" / "basicapi.pdf") as doc: + with playa.open(CONTRIB / "basicapi.pdf") as doc: objs = [obj for obj in doc.layout if obj.get("xobjid")] assert objs assert objs[0]["xobjid"] == "XT5" @@ -141,16 +137,17 @@ def test_write_csv() -> None: # print(out.getvalue()) +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") def test_spaces() -> None: """Test different coordinate spaces.""" - with playa.open(TESTDIR / "pdfplumber" / "issue-1181.pdf", space="page") as doc: + with playa.open(CONTRIB / "issue-1181.pdf", space="page") as doc: page = doc.pages[0] page_box = next(iter(page)).bbox - with playa.open(TESTDIR / "pdfplumber" / "issue-1181.pdf", space="user") as doc: + with playa.open(CONTRIB / "issue-1181.pdf", space="user") as doc: page = doc.pages[0] user_box = next(iter(page)).bbox assert page_box[1] == pytest.approx(user_box[1] - page.mediabox[1]) - with playa.open(TESTDIR / "pdfplumber" / "issue-1181.pdf", space="screen") as doc: + with playa.open(CONTRIB / "issue-1181.pdf", space="screen") as doc: page = doc.pages[0] screen_box = next(iter(page)).bbox # BBoxes are normalied, so top is 1 for screen and 3 for page @@ -185,12 +182,7 @@ def test_glyph_offsets() -> None: def test_tiff_predictor() -> None: - with playa.open(TESTDIR / "contrib" / "test_pdf_with_tiff_predictor.pdf") as doc: + with playa.open(TESTDIR / "test_pdf_with_tiff_predictor.pdf") as doc: image = next(doc.pages[0].images) # Decoded TIFF: 600 x 600 + a header assert len(image.stream.buffer) == 360600 - - -def test_bogus_stream_length() -> None: - with pytest.raises(PDFSyntaxError): - _ = playa.open(TESTDIR / "bogus-stream-length.pdf") diff --git a/tests/test_structtree.py b/tests/test_structtree.py index 3c4f2284..a9cf32bf 100644 --- a/tests/test_structtree.py +++ b/tests/test_structtree.py @@ -1,76 +1,75 @@ import re -import unittest -from pathlib import Path +import pytest import playa +from .data import CONTRIB, TESTDIR -TESTDIR = Path(__file__).parent.parent / "samples" +def test_structure_tree_class() -> None: + with playa.open(TESTDIR / "image_structure.pdf") as pdf: + stree = pdf.pages[0].structtree + doc_elem = next(iter(stree)) + assert [k.type for k in doc_elem] == ["P", "P", "Figure"] -class TestClass(unittest.TestCase): - """Test the underlying Structure tree class""" - def test_structure_tree_class(self) -> None: - with playa.open(TESTDIR / "image_structure.pdf") as pdf: - stree = pdf.pages[0].structtree - doc_elem = next(iter(stree)) - assert [k.type for k in doc_elem] == ["P", "P", "Figure"] +def test_find_all_tree() -> None: + """ + Test find_all() and find() on trees + """ + with playa.open(TESTDIR / "image_structure.pdf") as pdf: + stree = pdf.pages[0].structtree + figs = list(stree.find_all("Figure")) + assert len(figs) == 1 + fig = stree.find("Figure") + assert fig == figs[0] + assert stree.find("Fogure") is None + figs = list(stree.find_all(re.compile(r"Fig.*"))) + assert len(figs) == 1 + figs = list(stree.find_all(lambda x: x.type == "Figure")) + assert len(figs) == 1 + figs = list(stree.find_all("Foogure")) + assert len(figs) == 0 + figs = list(stree.find_all(re.compile(r"Fog.*"))) + assert len(figs) == 0 + figs = list(stree.find_all(lambda x: x.type == "Flogger")) + assert len(figs) == 0 - def test_find_all_tree(self) -> None: - """ - Test find_all() and find() on trees - """ - with playa.open(TESTDIR / "image_structure.pdf") as pdf: - stree = pdf.pages[0].structtree - figs = list(stree.find_all("Figure")) - assert len(figs) == 1 - fig = stree.find("Figure") - assert fig == figs[0] - assert stree.find("Fogure") is None - figs = list(stree.find_all(re.compile(r"Fig.*"))) - assert len(figs) == 1 - figs = list(stree.find_all(lambda x: x.type == "Figure")) - assert len(figs) == 1 - figs = list(stree.find_all("Foogure")) - assert len(figs) == 0 - figs = list(stree.find_all(re.compile(r"Fog.*"))) - assert len(figs) == 0 - figs = list(stree.find_all(lambda x: x.type == "Flogger")) - assert len(figs) == 0 - def test_find_all_element(self) -> None: - """ - Test find_all() and find() on elements - """ - with playa.open(TESTDIR / "pdf_structure.pdf") as pdf: - stree = pdf.structtree - for list_elem in stree.find_all("L"): - items = list(list_elem.find_all("LI")) - assert items - for item in items: - body = list(item.find_all("LBody")) - assert body - body1 = item.find("LBody") - assert body1 == body[0] - assert item.find("Loonie") is None +def test_find_all_element() -> None: + """ + Test find_all() and find() on elements + """ + with playa.open(TESTDIR / "pdf_structure.pdf") as pdf: + stree = pdf.structtree + for list_elem in stree.find_all("L"): + items = list(list_elem.find_all("LI")) + assert items + for item in items: + body = list(item.find_all("LBody")) + assert body + body1 = item.find("LBody") + assert body1 == body[0] + assert item.find("Loonie") is None - def test_all_mcids(self) -> None: - """ - Test all_mcids() - """ - with playa.open(TESTDIR / "2023-06-20-PV.pdf") as pdf: - # Make sure we can get them with page numbers - stree = pdf.structtree - sect = next(stree.find_all("Sect")) - mcids = list(sect.all_mcids()) - page_indices = set(page for page, mcid in mcids) - assert 0 in page_indices - assert 1 in page_indices - stree = pdf.pages[1].structtree - sect = next(stree.find_all("Sect")) - mcids = list(sect.all_mcids()) - page_indices = set(page for page, mcid in mcids) - assert page_indices == {1} - for p in sect.find_all("P"): - assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids) +@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present") +def test_all_mcids() -> None: + """ + Test all_mcids() + """ + with playa.open(CONTRIB / "2023-06-20-PV.pdf") as pdf: + # Make sure we can get them with page numbers + stree = pdf.structtree + sect = next(stree.find_all("Sect")) + mcids = list(sect.all_mcids()) + page_indices = set(page for page, mcid in mcids) + assert 0 in page_indices + assert 1 in page_indices + + stree = pdf.pages[1].structtree + sect = next(stree.find_all("Sect")) + mcids = list(sect.all_mcids()) + page_indices = set(page for page, mcid in mcids) + assert page_indices == {1} + for p in sect.find_all("P"): + assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)