diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a0a63c3c..8ebbde55 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -10,6 +10,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+        with:
+          submodules: true
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..b0d1051e
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "tests/3rdparty/pdfplumber"]
+	path = samples/3rdparty/pdfplumber
+	url = https://github.com/jsvine/pdfplumber.git
+[submodule "tests/3rdparty/pdf.js"]
+	path = samples/3rdparty/pdf.js
+	url = https://github.com/mozilla/pdf.js.git
diff --git a/playa/cmapdb.py b/playa/cmapdb.py
index 71dcf88c..3ce8c097 100644
--- a/playa/cmapdb.py
+++ b/playa/cmapdb.py
@@ -209,6 +209,8 @@ def _load_data(cls, name: str) -> Any:
     def get_cmap(cls, name: str) -> CMapBase:
         if name == "Identity-H":
             return IdentityCMap(WMode=0)
+        elif name == "Adobe-Identity-UCS":
+            return IdentityCMap(WMode=0)  # FIXME: WMode???
         elif name == "Identity-V":
             return IdentityCMap(WMode=1)
         elif name == "OneByteIdentityH":
diff --git a/playa/color.py b/playa/color.py
index aca0f951..8ceab4a2 100644
--- a/playa/color.py
+++ b/playa/color.py
@@ -1,8 +1,7 @@
 from typing import Dict, NamedTuple, Union, Tuple
 
-from playa.exceptions import PDFInterpreterError
 from playa.parser import LIT, PDFObject, PSLiteral
-from playa.pdftypes import num_value, list_value, literal_name, stream_value
+from playa.pdftypes import num_value, list_value, literal_name, stream_value, resolve1
 
 LITERAL_DEVICE_GRAY = LIT("DeviceGray")
 LITERAL_DEVICE_RGB = LIT("DeviceRGB")
@@ -37,15 +36,12 @@ class ColorSpace(NamedTuple):
     spec: PDFObject = None
 
     def make_color(self, *components) -> Color:
-        if len(components) != self.ncomponents:
-            raise PDFInterpreterError(
-                "%s requires %d components, got %d!"
-                % (self.name, self.ncomponents, len(components))
-            )
-        nc = self.ncomponents
         pattern = None
-        if isinstance(components[-1], PSLiteral):
+        nc = self.ncomponents
+        if components and isinstance(components[-1], PSLiteral):
             pattern = components[-1].name
+            components = components[:-1]
+            # Remove the pattern we added to ncomponents
             nc -= 1
         cc = []
         for x in components[:nc]:
@@ -93,14 +89,14 @@ def get_colorspace(
                 raise ValueError(
                     "Underlying colour space cannot be /Pattern: %r" % (spec,)
                 )
-            underlying = get_colorspace(spec[1])
+            underlying = get_colorspace(resolve1(spec[1]))
             if underlying is None:
                 raise ValueError("Unrecognized underlying colour space: %r", (spec,))
             # Not super important what we call it but we need to know it
             # has N+1 "components" (the last one being the pattern)
             return ColorSpace(name, underlying.ncomponents + 1, spec)
         else:
-            cs = PREDEFINED_COLORSPACE.get(literal_name(spec[0]))
+            cs = PREDEFINED_COLORSPACE.get(literal_name(resolve1(spec[0])))
             if cs is None:
                 return None
             return ColorSpace(cs.name, cs.ncomponents, spec)
diff --git a/playa/document.py b/playa/document.py
index b6436208..96817733 100644
--- a/playa/document.py
+++ b/playa/document.py
@@ -8,13 +8,11 @@
 import mmap
 import re
 import struct
-from collections import deque
 from hashlib import md5, sha256, sha384, sha512
 from typing import (
     Any,
     BinaryIO,
     Callable,
-    Deque,
     Dict,
     Iterable,
     Iterator,
@@ -50,9 +48,9 @@
     schema as page_schema,
 )
 from playa.parser import (
-    KEYWORD_OBJ,
     KEYWORD_TRAILER,
     KEYWORD_XREF,
+    KEYWORD_OBJ,
     LIT,
     IndirectObject,
     IndirectObjectParser,
@@ -211,6 +209,9 @@ def __repr__(self) -> str:
     def _load(self, parser: IndirectObjectParser) -> None:
         parser.seek(0)
         parser.reset()
+        doc = None if parser.doc is None else parser.doc()
+        if doc is None:
+            raise RuntimeError("Document no longer exists!")
         # Get all the objects
         for pos, obj in parser:
             self.offsets[obj.objid] = XRefPos(None, pos, obj.genno)
@@ -225,9 +226,6 @@ def _load(self, parser: IndirectObjectParser) -> None:
                 except KeyError:
                     log.warning("N is not defined in object stream: %r", stream)
                     n = 0
-                doc = None if parser.doc is None else parser.doc()
-                if doc is None:
-                    raise RuntimeError("Document no longer exists!")
                 parser1 = ObjectParser(stream.buffer, doc)
                 objs: List = [obj for _, obj in parser1]
                 # FIXME: This is choplist
@@ -244,10 +242,17 @@ def _load(self, parser: IndirectObjectParser) -> None:
                 _, dic = s2
                 self.trailer.update(dict_value(dic))
                 log.debug("trailer=%r", self.trailer)
-                break
+                return
             s1 = s2
-        else:
-            log.warning("b'trailer' not found in document")
+        # No trailer found above, so scan the file backwards for a bare "trailer" line
+        for pos, line in reverse_iter_lines(parser.buffer):
+            line = line.strip()
+            if line == b"trailer":
+                _, trailer = next(ObjectParser(parser.buffer, doc, pos + len(b"trailer")))
+                self.trailer.update(trailer)
+                log.debug("trailer=%r", self.trailer)
+                return
+        log.warning("b'trailer' not found in document")
 
     @property
     def objids(self) -> Iterable[int]:
@@ -508,7 +513,8 @@ def init_params(self) -> None:
             raise PDFEncryptionError(error_msg)
         self.cfm = {}
         for k, v in self.cf.items():
-            f = self.get_cfm(literal_name(v["CFM"]))
+            dictv = dict_value(v)
+            f = self.get_cfm(literal_name(dictv["CFM"]))
             if f is None:
                 error_msg = "Unknown crypt filter method: param=%r" % self.param
                 raise PDFEncryptionError(error_msg)
@@ -833,13 +839,14 @@ def __init__(
         except PDFSyntaxError:
             log.warning("PDF header not found, will try to read the file anyway")
             self.pdf_version = "UNKNOWN"
+        # Rewind first so that the fp.read() fallback below reads the whole file
+        fp.seek(0, 0)
         try:
             self.buffer: Union[bytes, mmap.mmap] = mmap.mmap(
                 fp.fileno(), 0, access=mmap.ACCESS_READ
             )
         except io.UnsupportedOperation:
             log.warning("mmap not supported on %r, reading document into memory", fp)
-            fp.seek(0, 0)
             self.buffer = fp.read()
         except ValueError:
             raise
@@ -853,7 +860,7 @@ def __init__(
         try:
             pos = self._find_xref()
             self._read_xref_from(pos, self.xrefs)
-        except (ValueError, IndexError) as e:
+        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
             log.debug("Using fallback XRef parsing: %s", e)
             newxref = XRefFallback(self.parser)
             self.xrefs.append(newxref)
@@ -874,7 +881,10 @@ def __init__(
                 self.encryption = (id_value, dict_value(trailer["Encrypt"]))
                 self._initialize_password(password)
             if "Info" in trailer:
-                self.info.append(dict_value(trailer["Info"]))
+                try:
+                    self.info.append(dict_value(trailer["Info"]))
+                except TypeError:
+                    log.warning("Info is a broken reference (incorrect xref table?)")
             if "Root" in trailer:
                 # Every PDF file must have exactly one /Root dictionary.
                 try:
@@ -982,30 +992,30 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
         self.parser.seek(pos)
         try:
             _, obj = next(self.parser)
-        except (ValueError, IndexError) as e:
+            if obj.objid != objid:
+                raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
+        except (ValueError, IndexError, PDFSyntaxError) as e:
             log.warning(
                 "Indirect object %d not found at position %d: %r", objid, pos, e
             )
-            # Hack around malformed pdf files where the offset in the
+            # In case of malformed pdf files where the offset in the
             # xref table doesn't point exactly at the object
-            # definition (probably more frequent than you think).
-            # Back up a bit, then parse forward until we find the right
-            # object. Fixes
-            # https://github.com/pdfminer/pdfminer.six/issues/56
-            tokenizer = Lexer(self.buffer, max(0, pos - 16))
-            q: Deque[int] = deque([], 3)
-            while True:
-                try:
-                    (pos, token) = next(tokenizer)
-                except StopIteration:
-                    raise PDFSyntaxError(
-                        f"Indirect object {objid!r} not found at or after position {pos}"
-                    )
-                q.append(pos)
-                if len(q) == 3 and token is KEYWORD_OBJ:
-                    break
-            log.debug("seeking to %r", q[0])
-            self.parser.seek(q[0])
+            # definition (probably more frequent than you think), just
+            # use a regular expression to find the object because we
+            # can do that.
+            realpos = -1
+            lastgen = -1
+            for m in re.finditer(rb"%d\s+(\d+)\s+obj" % objid, self.buffer):
+                genno = int(m.group(1))
+                if genno > lastgen:
+                    lastgen = genno
+                    realpos = m.start(0)
+            if realpos == -1:
+                raise PDFSyntaxError(
+                    f"Indirect object {objid!r} not found in document"
+                ) from e
+            log.debug("found object (%r) seeking to %r", m.group(0), realpos)
+            self.parser.seek(realpos)
             (_, obj) = next(self.parser)
         if obj.objid != objid:
             raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
@@ -1038,7 +1048,12 @@ def __getitem__(self, objid: int) -> Any:
                     else:
                         obj = self._getobj_parse(index, objid)
                     break
-                except (StopIteration, PDFSyntaxError):
+                # FIXME: We might not actually want to catch these...
+                except StopIteration:
+                    log.debug("EOF when searching for object %d", objid)
+                    continue
+                except PDFSyntaxError as e:
+                    log.debug("Syntax error when searching for object %d: %s", objid, e)
                     continue
             if obj is None:
                 raise IndexError(f"Object with ID {objid} not found")
@@ -1050,14 +1065,14 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
         if objid and objid in self._cached_fonts:
             return self._cached_fonts[objid]
         log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
-        if spec["Type"] is not LITERAL_FONT:
+        if spec.get("Type") is not LITERAL_FONT:
             log.warning("Font specification Type is not /Font: %r", spec)
         # Create a Font object.
         if "Subtype" in spec:
             subtype = literal_name(spec["Subtype"])
         else:
             log.warning("Font specification Subtype is not specified: %r", spec)
-            subtype = "Type1"
+            subtype = ""
         if subtype in ("Type1", "MMType1"):
             # Type1 Font
             font: Font = Type1Font(spec)
@@ -1075,6 +1090,7 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
             dfonts = list_value(spec["DescendantFonts"])
             assert dfonts
             subspec = dict_value(dfonts[0]).copy()
+            # FIXME: Too tightly coupled with the internals of CIDFont
             for k in ("Encoding", "ToUnicode"):
                 if k in spec:
                     subspec[k] = resolve1(spec[k])
@@ -1255,19 +1271,27 @@ def _find_xref(self) -> int:
         """Internal function used to locate the first XRef."""
         # search the last xref table by scanning the file backwards.
         prev = b""
-        # FIXME: This will scan *the whole file* looking for an xref
-        # table, it should maybe give up sooner?
-        for line in reverse_iter_lines(self.buffer):
+        for pos, line in reverse_iter_lines(self.buffer):
             line = line.strip()
             log.debug("find_xref: %r", line)
             if line == b"startxref":
                 log.debug("xref found: pos=%r", prev)
                 if not prev.isdigit():
-                    raise ValueError(f"Invalid xref position: {prev!r}")
+                    log.warning("Invalid startxref position: %r", prev)
+                    continue
                 start = int(prev)
                 if not start >= 0:
-                    raise ValueError(f"Invalid negative xref position: {start}")
+                    log.warning("Invalid negative startxref position: %d", start)
+                    continue
+                elif start > pos:
+                    log.warning("Invalid startxref position (> %d): %d", pos, start)
+                    continue
                 return start
+            elif line == b"xref":
+                return pos
+            elif line == b"endobj":
+                # Hit object data before any startxref/xref line, so stop looking
+                break
             if line:
                 prev = line
         raise ValueError("No xref table found at end of file")
@@ -1285,15 +1309,23 @@ def _read_xref_from(
         except StopIteration:
             raise ValueError("Unexpected EOF at {start}")
         log.debug("read_xref_from: start=%d, token=%r", start, token)
-        if isinstance(token, int):
-            # XRefStream: PDF-1.5
-            self.parser.seek(pos)
-            self.parser.reset()
-            xref: XRef = XRefStream(self.parser)
-        else:
-            if token is KEYWORD_XREF:
-                parser.nextline()
+        if token is KEYWORD_XREF:
+            parser.nextline()
             xref = XRefTable(parser)
+        else:
+            # It might be an XRefStream, if this is an indirect object...
+            _, token2 = parser.nexttoken()
+            _, token3 = parser.nexttoken()
+            if token3 is KEYWORD_OBJ:
+                # XRefStream: PDF-1.5
+                self.parser.seek(pos)
+                self.parser.reset()
+                xref: XRef = XRefStream(self.parser)
+            else:
+                # Well, maybe it's an XRef table without "xref" (but
+                # probably not)
+                parser.seek(pos)
+                xref = XRefTable(parser)
         xrefs.append(xref)
         trailer = xref.trailer
         # For hybrid-reference files, an additional set of xrefs as a
@@ -1319,10 +1351,10 @@ def __init__(self, doc: Document):
         self._pages = []
         self._labels: Dict[str, Page] = {}
         try:
-            itor = doc._get_page_objects()
-        except KeyError:
-            itor = doc._get_pages_from_xrefs()
-        for page_idx, ((objid, properties), label) in enumerate(zip(itor, page_labels)):
+            page_objects = list(doc._get_page_objects())
+        except (KeyError, IndexError):
+            page_objects = list(doc._get_pages_from_xrefs())
+        for page_idx, ((objid, properties), label) in enumerate(zip(page_objects, page_labels)):
             page = Page(doc, objid, properties, label, page_idx, doc.space)
             self._pages.append(page)
             if label is not None:
diff --git a/playa/font.py b/playa/font.py
index 3399b6ed..018b09a4 100644
--- a/playa/font.py
+++ b/playa/font.py
@@ -50,7 +50,15 @@
     resolve_all,
     stream_value,
 )
-from playa.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
+from playa.utils import (
+    Matrix,
+    Point,
+    Rect,
+    apply_matrix_norm,
+    choplist,
+    nunpack,
+    decode_text,
+)
 
 log = logging.getLogger(__name__)
 
@@ -1032,11 +1040,13 @@ def __init__(
             log.warning("Font spec is missing BaseFont: %r", spec)
             self.basefont = "unknown"
         self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
-        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
-            "latin1",
+        # These are *supposed* to be ASCII (PDF 1.7 section 9.7.3),
+        # but for whatever reason they are sometimes UTF-16BE
+        cid_registry = decode_text(
+            resolve1(self.cidsysteminfo.get("Registry", b"unknown"))
         )
-        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
-            "latin1",
+        cid_ordering = decode_text(
+            resolve1(self.cidsysteminfo.get("Ordering", b"unknown"))
         )
         self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
         self.cmap: CMapBase = self.get_cmap_from_spec(spec)
@@ -1051,11 +1061,18 @@ def __init__(
             self.fontfile = stream_value(descriptor.get("FontFile2"))
             ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.buffer))
         self.unicode_map: Optional[UnicodeMap] = None
+        # FIXME: This is magical and means that we are not actually a
+        # CIDFont but really a Type0 font.
         if "ToUnicode" in spec:
             if isinstance(spec["ToUnicode"], ContentStream):
                 strm = stream_value(spec["ToUnicode"])
                 self.unicode_map = parse_tounicode(strm.buffer)
-            else:
+            if isinstance(spec["Encoding"], ContentStream):
+                strm = stream_value(spec["Encoding"])
+                # FIXME: it's not a tounicode, but it plays one on TV
+                # _ = parse_tounicode(strm.buffer)
+
+            if self.unicode_map is None:
                 cmap_name = literal_name(spec["ToUnicode"])
                 encoding = literal_name(spec["Encoding"])
                 if (
diff --git a/playa/page.py b/playa/page.py
index 62f8a7c4..e2448b63 100644
--- a/playa/page.py
+++ b/playa/page.py
@@ -34,8 +34,8 @@
     get_colorspace,
 )
 from playa.exceptions import (
-    PDFInterpreterError,
     PDFUnicodeNotDefined,
+    PDFSyntaxError,
 )
 from playa.font import Font
 
@@ -67,7 +67,6 @@
     get_transformed_bound,
     make_compat_bytes,
     mult_matrix,
-    parse_rect,
     normalize_rect,
     translate_matrix,
 )
@@ -98,6 +97,17 @@ def Object(*args, **kwargs): ...
 DeviceSpace = Literal["page", "screen", "user"]
 
 
+# FIXME: This should go in utils/pdftypes but there are circular imports
+def parse_rect(o: PDFObject) -> Rect:
+    try:
+        (x0, y0, x1, y1) = (num_value(x) for x in list_value(o))
+        return x0, y0, x1, y1
+    except ValueError:
+        raise ValueError("Could not parse rectangle %r" % (o,))
+    except TypeError:
+        raise PDFSyntaxError("Rectangle contains non-numeric values")
+
+
 class Page:
     """An object that holds the information about a page.
 
@@ -137,38 +147,39 @@ def __init__(
         self.page_idx = page_idx
         self.space = space
         self.lastmod = resolve1(self.attrs.get("LastModified"))
-        self.resources: Dict[str, PDFObject] = dict_value(
-            self.attrs.get("Resources", {})
-        )
-        if "MediaBox" in self.attrs:
-            self.mediabox = normalize_rect(
-                parse_rect(resolve1(val) for val in resolve1(self.attrs["MediaBox"]))
+        try:
+            self.resources: Dict[str, PDFObject] = dict_value(
+                self.attrs.get("Resources")
             )
+        except TypeError:
+            log.warning("Resources missing or invalid from Page id %d", pageid)
+            self.resources = {}
+        if "MediaBox" in self.attrs:
+            self.mediabox = normalize_rect(parse_rect(self.attrs["MediaBox"]))
         else:
             log.warning(
-                "MediaBox missing from /Page (and not inherited),"
-                " defaulting to US Letter (612x792)"
+                "MediaBox missing from Page id %d (and not inherited),"
+                " defaulting to US Letter (612x792)", pageid
             )
             self.mediabox = (0, 0, 612, 792)
         self.cropbox = self.mediabox
         if "CropBox" in self.attrs:
             try:
-                self.cropbox = normalize_rect(
-                    parse_rect(resolve1(val) for val in resolve1(self.attrs["CropBox"]))
-                )
+                self.cropbox = normalize_rect(parse_rect(self.attrs["CropBox"]))
             except ValueError:
                 log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
 
         self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
         self.annots = self.attrs.get("Annots")
         self.beads = self.attrs.get("B")
-        if "Contents" in self.attrs:
-            self._contents: List[PDFObject] = resolve1(self.attrs["Contents"])
-            assert self._contents is not None
-            if not isinstance(self._contents, list):
-                self._contents = [self._contents]
-        else:
+        contents = resolve1(self.attrs.get("Contents"))
+        if contents is None:
             self._contents = []
+        else:
+            if isinstance(contents, list):
+                self._contents = contents
+            else:
+                self._contents = [contents]
 
     @property
     def streams(self) -> Iterator[ContentStream]:
@@ -737,8 +748,12 @@ def init_resources(self, page: Page, resources: Dict) -> None:
                     objid = None
                     if isinstance(spec, ObjRef):
                         objid = spec.objid
-                    spec = dict_value(spec)
-                    self.fontmap[fontid] = doc.get_font(objid, spec)
+                    try:
+                        spec = dict_value(spec)
+                        self.fontmap[fontid] = doc.get_font(objid, spec)
+                    except TypeError:
+                        log.warning("Broken/missing font spec for %r", fontid)
+                        self.fontmap[fontid] = doc.get_font(objid, {})
             elif k == "ColorSpace":
                 for csid, spec in dict_value(v).items():
                     colorspace = get_colorspace(resolve1(spec), csid)
@@ -1208,11 +1223,11 @@ def __iter__(self) -> Iterator[LayoutDict]:
                         if len(args) == nargs:
                             gen = method(*args)
                         else:
-                            error_msg = (
-                                "Insufficient arguments (%d) for operator: %r"
-                                % (len(args), obj)
+                            log.warning(
+                                "Insufficient arguments (%d) for operator: %r",
+                                len(args),
+                                obj,
                             )
-                            raise PDFInterpreterError(error_msg)
                     else:
                         log.debug("exec: %r", obj)
                         gen = method()
@@ -2121,17 +2136,18 @@ def __iter__(self) -> Iterator[ContentObject]:
                         if len(args) == nargs:
                             gen = method(*args)
                         else:
-                            error_msg = (
-                                "Insufficient arguments (%d) for operator: %r"
-                                % (len(args), obj)
+                            log.warning(
+                                "Insufficient arguments (%d) for operator: %r",
+                                len(args),
+                                obj,
                             )
-                            raise PDFInterpreterError(error_msg)
                     else:
                         log.debug("exec: %r", obj)
                         gen = method()
                     if gen is not None:
                         yield from gen
                 else:
+                    # TODO: This can get very verbose
                     log.warning("Unknown operator: %r", obj)
             else:
                 self.push(obj)
@@ -2427,6 +2443,9 @@ def do_Do(self, xobjid_arg: PDFObject) -> Iterator[ContentObject]:
         except KeyError:
             log.debug("Undefined xobject id: %r", xobjid)
             return
+        except TypeError as e:
+            log.debug("Empty or invalid xobject with id %r: %s", xobjid, e)
+            return
         log.debug("Processing xobj: %r", xobj)
         subtype = xobj.get("Subtype")
         if subtype is LITERAL_FORM and "BBox" in xobj:
diff --git a/playa/parser.py b/playa/parser.py
index 964e2320..ce667414 100644
--- a/playa/parser.py
+++ b/playa/parser.py
@@ -77,7 +77,7 @@
 }
 
 
-def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[bytes]:
+def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[Tuple[int, bytes]]:
     """Iterate backwards over lines starting at the current position.
 
     This is used to locate the trailers at the end of a file.
@@ -87,10 +87,9 @@ def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[bytes]:
         nidx = buffer.rfind(b"\n", 0, pos)
         ridx = buffer.rfind(b"\r", 0, pos)
         best = max(nidx, ridx)
+        yield best + 1, buffer[best + 1 : endline]
         if best == -1:
-            yield buffer[:endline]
             break
-        yield buffer[best + 1 : endline]
         endline = best + 1
         pos = best
         if pos > 0 and buffer[pos - 1 : pos + 1] == b"\r\n":
@@ -561,6 +560,7 @@ def __init__(
         strict: bool = False,
     ) -> None:
         self._parser = ObjectParser(data, doc)
+        self.buffer = data
         self.trailer: List[Tuple[int, Union[PDFObject, ContentStream]]] = []
         self.doc = None if doc is None else weakref.ref(doc)
         self.strict = strict
@@ -580,8 +580,13 @@ def __next__(self) -> Tuple[int, IndirectObject]:
                 (_, obj) = self.trailer.pop()
                 (_, genno) = self.trailer.pop()
                 (pos, objid) = self.trailer.pop()
-                objid = int_value(objid)
-                genno = int_value(genno)
+                try:
+                    objid = int_value(objid)
+                    genno = int_value(genno)
+                except TypeError as e:
+                    raise PDFSyntaxError(
+                        f"Object numbers must be integers, got {objid!r} {genno!r}"
+                    ) from e
                 # ContentStream is *special* and needs these
                 # internally for decryption.
                 if isinstance(obj, ContentStream):
@@ -652,7 +657,7 @@ def __next__(self) -> Tuple[int, IndirectObject]:
                         log.debug("After stream data: %r %r", linepos, line)
                         if line == b"":  # Means EOF
                             log.warning(
-                                "Incorrect legnth for stream, no 'endstream' found"
+                                "Incorrect length for stream, no 'endstream' found"
                             )
                             break
                 doc = None if self.doc is None else self.doc()
diff --git a/playa/utils.py b/playa/utils.py
index cb3b7f41..accc7a60 100644
--- a/playa/utils.py
+++ b/playa/utils.py
@@ -2,7 +2,6 @@
 
 import string
 from typing import (
-    Any,
     Iterable,
     Iterator,
     List,
@@ -11,8 +10,6 @@
     Union,
 )
 
-from playa.exceptions import PDFSyntaxError
-
 
 def make_compat_bytes(in_str: str) -> bytes:
     """Converts to bytes, encoding to unicode."""
@@ -180,16 +177,6 @@ def apply_png_predictor(
 MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
 
 
-def parse_rect(o: Any) -> Rect:
-    try:
-        (x0, y0, x1, y1) = o
-        return float(x0), float(y0), float(x1), float(y1)
-    except ValueError:
-        raise ValueError("Could not parse rectangle")
-    except TypeError:
-        raise PDFSyntaxError("Rectangle contains non-numeric values")
-
-
 def normalize_rect(r: Rect) -> Rect:
     (x0, y0, x1, y1) = r
     if x1 < x0:
diff --git a/pyproject.toml b/pyproject.toml
index 6fc6c598..186bd3e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,13 @@ playa = "playa.cli:main"
 [tool.hatch.version]
 path = "playa/__init__.py"
 
+[tool.hatch.build.targets.sdist]
+exclude = [
+  "/.github",
+  "/samples/contrib",
+  "/samples/3rdparty",
+]
+
 [tool.hatch.build.targets.wheel]
 packages = ["playa"]
 
@@ -53,6 +60,9 @@ config-path = "none"  # Disable hatch's unreasonable ruff defaults
 [tool.ruff.lint.flake8-tidy-imports]
 ban-relative-imports = "all"
 
+[tool.ruff]
+exclude = ["samples/3rdparty"] # ruff globs are relative to the project root, hence no leading slash
+
 [tool.pytest.ini_options]
 testpaths = [ "tests" ]
 
diff --git a/samples/3rdparty/pdf.js b/samples/3rdparty/pdf.js
new file mode 160000
index 00000000..d4489531
--- /dev/null
+++ b/samples/3rdparty/pdf.js
@@ -0,0 +1 @@
+Subproject commit d4489531668b172a5ffd4ac6a29c3eb856af79a6
diff --git a/samples/3rdparty/pdfplumber b/samples/3rdparty/pdfplumber
new file mode 160000
index 00000000..e921ea74
--- /dev/null
+++ b/samples/3rdparty/pdfplumber
@@ -0,0 +1 @@
+Subproject commit e921ea748b245b9686540fb89b0da0c4125c31ec
diff --git a/samples/bogus-stream-length.pdf b/samples/bogus-stream-length.pdf
deleted file mode 100644
index 12a4c5dc..00000000
Binary files a/samples/bogus-stream-length.pdf and /dev/null differ
diff --git "a/samples/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf" "b/samples/contrib/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf"
similarity index 100%
rename from "samples/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf"
rename to "samples/contrib/2023-04-06-ODJ et R\303\251solutions-s\303\251ance xtra 6 avril 2023.pdf"
diff --git a/samples/2023-06-20-PV.pdf b/samples/contrib/2023-06-20-PV.pdf
similarity index 100%
rename from samples/2023-06-20-PV.pdf
rename to samples/contrib/2023-06-20-PV.pdf
diff --git a/samples/pdf.js/basicapi.pdf b/samples/contrib/basicapi.pdf
similarity index 100%
rename from samples/pdf.js/basicapi.pdf
rename to samples/contrib/basicapi.pdf
diff --git a/samples/pdfplumber/issue-1181.pdf b/samples/contrib/issue-1181.pdf
similarity index 100%
rename from samples/pdfplumber/issue-1181.pdf
rename to samples/contrib/issue-1181.pdf
diff --git a/samples/pdf.js/issue620f.pdf b/samples/contrib/issue620f.pdf
similarity index 100%
rename from samples/pdf.js/issue620f.pdf
rename to samples/contrib/issue620f.pdf
diff --git a/samples/contrib/test_pdf_with_tiff_predictor.pdf b/samples/test_pdf_with_tiff_predictor.pdf
similarity index 100%
rename from samples/contrib/test_pdf_with_tiff_predictor.pdf
rename to samples/test_pdf_with_tiff_predictor.pdf
diff --git a/tests/3rdparty/test_pdf_js.py b/tests/3rdparty/test_pdf_js.py
deleted file mode 100644
index 57a835ba..00000000
--- a/tests/3rdparty/test_pdf_js.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""
-Run pdf.js testsuite if present.
-"""
diff --git a/tests/3rdparty/test_pdfplumber.py b/tests/3rdparty/test_pdfplumber.py
deleted file mode 100644
index 5fd38217..00000000
--- a/tests/3rdparty/test_pdfplumber.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""
-Test pdfplumber (using playa branch) if present.
-"""
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/data.py b/tests/data.py
new file mode 100644
index 00000000..405a961b
--- /dev/null
+++ b/tests/data.py
@@ -0,0 +1,69 @@
+"""
+Lists of data files and directories to be shared by various tests.
+
+Everything is computed at import time.  The ``contrib`` samples and
+the third-party checkouts under ``samples/3rdparty`` are optional:
+when they are absent the corresponding lists are simply shorter.
+"""
+
+from pathlib import Path
+import json
+
+# Root of the sample documents, next to the ``tests`` directory.
+TESTDIR = Path(__file__).parent.parent / "samples"
+SUBDIRS = ["acroform", "encryption", "scancode"]
+# PDFs directly in TESTDIR and SUBDIRS, plus the contrib samples if present.
+BASEPDFS = list(TESTDIR.glob("*.pdf"))
+for name in SUBDIRS:
+    BASEPDFS.extend((TESTDIR / name).glob("*.pdf"))
+CONTRIB = TESTDIR / "contrib"
+if CONTRIB.exists():
+    BASEPDFS.extend(CONTRIB.glob("*.pdf"))
+
+# BASEPDFS plus whatever third-party test suites are checked out.
+ALLPDFS = list(BASEPDFS)
+PLUMBERS = TESTDIR / "3rdparty" / "pdfplumber" / "tests" / "pdfs"
+if PLUMBERS.exists():
+    ALLPDFS.extend(PLUMBERS.glob("*.pdf"))
+PDFJS = TESTDIR / "3rdparty" / "pdf.js" / "test"
+try:
+    # JSON is UTF-8; do not depend on the platform's default encoding.
+    with open(PDFJS / "test_manifest.json", encoding="utf-8") as infh:
+        manifest = json.load(infh)
+except FileNotFoundError:
+    # No pdf.js checkout: nothing to add.
+    manifest = []
+# The same file may be named by more than one manifest entry; add each
+# existing path only once.
+seen = set()
+for entry in manifest:
+    path = PDFJS / entry["file"]
+    if path not in seen and path.exists():
+        seen.add(path)
+        ALLPDFS.append(path)
+
+# Candidate passwords, keyed by sample file name.
+PASSWORDS = {
+    "base.pdf": ["foo"],
+    "rc4-40.pdf": ["foo"],
+    "rc4-128.pdf": ["foo"],
+    "aes-128.pdf": ["foo"],
+    "aes-128-m.pdf": ["foo"],
+    "aes-256.pdf": ["foo"],
+    "aes-256-m.pdf": ["foo"],
+    "aes-256-r6.pdf": ["usersecret", "ownersecret"],
+}
+# File names for which tests call pytest.xfail() instead of running.
+XFAILS = {
+    "empty.pdf",
+    # pdf.js accepts these... maybe some day we will too, but they are
+    # really rather broken.
+    "issue9418.pdf",
+    "bug1250079.pdf",
+    # FIXME: These can likely be fixed by correctly parsing CMaps
+    # (which should also be submitted as a PR to pdfminer.six)
+    "issue9915_reduced.pdf",
+    "issue2931.pdf",
+    "issue9534_reduced.pdf",
+    "issue18117.pdf",
+}
diff --git a/tests/test_document.py b/tests/test_document.py
index 3cae4852..7ea8576e 100644
--- a/tests/test_document.py
+++ b/tests/test_document.py
@@ -13,6 +13,7 @@
 from playa.exceptions import PDFSyntaxError
 from playa.parser import LIT
 from playa.utils import decode_text
+from .data import CONTRIB
 
 TESTDIR = Path(__file__).parent.parent / "samples"
 
@@ -54,14 +55,15 @@ def test_objects():
         # assert objects[0].obj is doc[1]
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_page_labels():
-    with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
+    with playa.open(CONTRIB / "pagelabels.pdf") as doc:
         labels = [label for _, label in zip(range(10), doc.page_labels)]
         assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]
         assert doc.pages["iii"] is doc.pages[0]
         assert doc.pages["iv"] is doc.pages[1]
         assert doc.pages["2"] is doc.pages[3]
-    with playa.open("samples/2023-06-20-PV.pdf") as doc:
+    with playa.open(CONTRIB / "2023-06-20-PV.pdf") as doc:
         assert doc.pages["1"] is doc.pages[0]
         with pytest.raises(KeyError):
             _ = doc.pages["3"]
@@ -69,8 +71,9 @@ def test_page_labels():
             _ = doc.pages[2]
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_pages():
-    with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
+    with playa.open(CONTRIB / "PSC_Station.pdf") as doc:
         page_objects = list(doc.pages)
         assert len(page_objects) == 15
         objects = list(page_objects[2].contents)
@@ -79,8 +82,9 @@ def test_pages():
         assert b"diversit\xe9 " in tokens
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_names():
-    with playa.open(TESTDIR / "contrib" / "issue-625-identity-cmap.pdf") as doc:
+    with playa.open(CONTRIB / "issue-625-identity-cmap.pdf") as doc:
         ef = NameTree(doc.names["EmbeddedFiles"])
         # Because yes, they can be UTF-16... (the spec says nothing
         # about this but it appears some authoring tools assume that
@@ -92,15 +96,17 @@ def test_names():
         assert names == ["382901691/01_UBL.xml", "382901691/02_EAN_UCC.xml"]
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_dests():
-    with playa.open(TESTDIR / "pdf.js" / "issue620f.pdf") as doc:
+    with playa.open(CONTRIB / "issue620f.pdf") as doc:
         names = [name for name, _ in doc.dests]
         assert names == ["Page.1", "Page.2"]
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_outlines():
     with playa.open(
-        "samples/2023-04-06-ODJ et Résolutions-séance xtra 6 avril 2023.pdf"
+        CONTRIB / "2023-04-06-ODJ et Résolutions-séance xtra 6 avril 2023.pdf"
     ) as doc:
         titles = [o.title for o in doc.outlines]
         assert titles == [
@@ -112,8 +118,9 @@ def test_outlines():
         ]
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_xobjects() -> None:
-    with playa.open(TESTDIR / "pdf.js" / "basicapi.pdf") as doc:
+    with playa.open(CONTRIB / "basicapi.pdf") as doc:
         page = doc.pages[0]
         xobj = next(page.xobjects)
         assert xobj.object_type == "xobject"
diff --git a/tests/test_indirect_objects.py b/tests/test_indirect_objects.py
index 8adac3d1..085b5264 100644
--- a/tests/test_indirect_objects.py
+++ b/tests/test_indirect_objects.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from playa.parser import LIT, ContentStream, IndirectObjectParser
+from playa.parser import LIT, ContentStream, IndirectObjectParser, PDFSyntaxError
 
 TESTDIR = Path(__file__).parent.parent / "samples"
 
@@ -67,6 +67,6 @@ def test_streams():
     assert stream.rawdata == b"150 250 m\n150 350 l\nS\nA BUNCH OF EXTRA CRAP!!!\n"
 
     parser = IndirectObjectParser(DATA2, strict=True)
-    with pytest.raises(TypeError) as e:
+    with pytest.raises(PDFSyntaxError) as e:
         positions, objs = zip(*list(parser))
         assert "Integer" in e
diff --git a/tests/test_lazy_api.py b/tests/test_lazy_api.py
index ca263005..52ebc80a 100644
--- a/tests/test_lazy_api.py
+++ b/tests/test_lazy_api.py
@@ -10,26 +10,13 @@
 from playa.color import PREDEFINED_COLORSPACE, Color
 from playa.exceptions import PDFEncryptionError
 
-TESTDIR = Path(__file__).parent.parent / "samples"
-ALLPDFS = TESTDIR.glob("**/*.pdf")
-PASSWORDS = {
-    "base.pdf": ["foo"],
-    "rc4-40.pdf": ["foo"],
-    "rc4-128.pdf": ["foo"],
-    "aes-128.pdf": ["foo"],
-    "aes-128-m.pdf": ["foo"],
-    "aes-256.pdf": ["foo"],
-    "aes-256-m.pdf": ["foo"],
-    "aes-256-r6.pdf": ["usersecret", "ownersecret"],
-}
-XFAILS = {
-    "bogus-stream-length.pdf",
-}
+from .data import TESTDIR, ALLPDFS, PASSWORDS, XFAILS, CONTRIB
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_content_objects():
     """Ensure that we can produce all the basic content objects."""
-    with playa.open(TESTDIR / "2023-06-20-PV.pdf", space="page") as pdf:
+    with playa.open(CONTRIB / "2023-06-20-PV.pdf", space="page") as pdf:
         page = pdf.pages[0]
         img = next(page.images)
         assert img.colorspace.name == "ICCBased"
@@ -100,10 +87,11 @@ def test_uncoloured_tiling() -> None:
         assert path.gstate.ncolor == Color((0.5, 0.2, 1.0), "P1")
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_rotated_glyphs() -> None:
     """Verify that we (unlike pdfminer) properly calculate the bbox
     for rotated text."""
-    with playa.open(TESTDIR / "contrib" / "issue_495_pdfobjref.pdf") as pdf:
+    with playa.open(CONTRIB / "issue_495_pdfobjref.pdf") as pdf:
         chars = []
         for text in pdf.pages[0].texts:
             for glyph in text:
@@ -114,7 +102,3 @@ def test_rotated_glyphs() -> None:
                     width = x1 - x0
                     assert width > 6
         assert "".join(chars) == "R18,00"
-
-
-if __name__ == "__main__":
-    test_content_objects()
diff --git a/tests/test_object_parser.py b/tests/test_object_parser.py
index eda85135..ff59916a 100644
--- a/tests/test_object_parser.py
+++ b/tests/test_object_parser.py
@@ -161,9 +161,8 @@ def test_parser_miner():
 
 def test_revlines() -> None:
     """Verify that we replicate the old revreadlines method."""
-    expected = list(reversed([line for pos, line in EXPECTED2]))
     output = list(reverse_iter_lines(TESTDATA2))
-    assert output == expected
+    assert output == list(reversed(EXPECTED2))
 
 
 SIMPLE1 = b"""1 0 obj
diff --git a/tests/test_open.py b/tests/test_open.py
index 8a1910b3..f737f7b3 100644
--- a/tests/test_open.py
+++ b/tests/test_open.py
@@ -14,34 +14,27 @@
     pdfminer = None  # type: ignore
 import playa
 from playa.exceptions import PDFEncryptionError, PDFSyntaxError
+from .data import TESTDIR, BASEPDFS, PASSWORDS, XFAILS, CONTRIB
 
-TESTDIR = Path(__file__).parent.parent / "samples"
-ALLPDFS = TESTDIR.glob("**/*.pdf")
-PASSWORDS = {
-    "base.pdf": ["foo"],
-    "rc4-40.pdf": ["foo"],
-    "rc4-128.pdf": ["foo"],
-    "aes-128.pdf": ["foo"],
-    "aes-128-m.pdf": ["foo"],
-    "aes-256.pdf": ["foo"],
-    "aes-256-m.pdf": ["foo"],
-    "aes-256-r6.pdf": ["usersecret", "ownersecret"],
-}
+# We know pdfminer.six gives different output for these and we don't
+# care (generally because of PLAYA's better rectangle detection and
+# correct bboxes for rotated glyphs)
 PDFMINER_BUGS = {
     "issue-449-vertical.pdf",
     "issue_495_pdfobjref.pdf",
     "issue-1008-inline-ascii85.pdf",
     "rotated.pdf",
-}
-XFAILS = {
-    "bogus-stream-length.pdf",
+    "issue-1114-dedupe-chars.pdf",
+    "malformed-from-issue-932.pdf",
+    "mcid_example.pdf",
 }
 
 
+# Only do "base" PDFs as we know pdfminer has issues with others
 @pytest.mark.skipif(pdfminer is None, reason="pdfminer.six is not installed")
-@pytest.mark.parametrize("path", ALLPDFS, ids=str)
+@pytest.mark.parametrize("path", BASEPDFS, ids=str)
 def test_open(path: Path) -> None:
-    """Open all the documents and compare with pdfplumber"""
+    """Open all the documents and compare with pdfminer"""
     if path.name in XFAILS:
         pytest.xfail("Intentionally corrupt file: %s" % path.name)
     from pdfminer.converter import PDFPageAggregator
@@ -91,15 +84,17 @@ def convert_miner(layout):
         assert beach == miner
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_inline_data() -> None:
-    with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc:
+    with playa.open(CONTRIB / "issue-1008-inline-ascii85.pdf") as doc:
         page = doc.pages[0]
         items = list(page.layout)
         assert len(items) == 456
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_redundant_h() -> None:
-    with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc:
+    with playa.open(CONTRIB / "issue-1008-inline-ascii85.pdf") as doc:
         page = doc.pages[0]
         rects = [item for item in page.layout if item["object_type"] == "rect"]
         assert len(rects) == 6
@@ -113,8 +108,9 @@ def test_multiple_contents() -> None:
         assert len(items) == 898
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_xobjects() -> None:
-    with playa.open(TESTDIR / "pdf.js" / "basicapi.pdf") as doc:
+    with playa.open(CONTRIB / "basicapi.pdf") as doc:
         objs = [obj for obj in doc.layout if obj.get("xobjid")]
     assert objs
     assert objs[0]["xobjid"] == "XT5"
@@ -141,16 +137,17 @@ def test_write_csv() -> None:
         # print(out.getvalue())
 
 
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
 def test_spaces() -> None:
     """Test different coordinate spaces."""
-    with playa.open(TESTDIR / "pdfplumber" / "issue-1181.pdf", space="page") as doc:
+    with playa.open(CONTRIB / "issue-1181.pdf", space="page") as doc:
         page = doc.pages[0]
         page_box = next(iter(page)).bbox
-    with playa.open(TESTDIR / "pdfplumber" / "issue-1181.pdf", space="user") as doc:
+    with playa.open(CONTRIB / "issue-1181.pdf", space="user") as doc:
         page = doc.pages[0]
         user_box = next(iter(page)).bbox
     assert page_box[1] == pytest.approx(user_box[1] - page.mediabox[1])
-    with playa.open(TESTDIR / "pdfplumber" / "issue-1181.pdf", space="screen") as doc:
+    with playa.open(CONTRIB / "issue-1181.pdf", space="screen") as doc:
         page = doc.pages[0]
         screen_box = next(iter(page)).bbox
     # BBoxes are normalied, so top is 1 for screen and 3 for page
@@ -185,12 +182,7 @@ def test_glyph_offsets() -> None:
 
 
 def test_tiff_predictor() -> None:
-    with playa.open(TESTDIR / "contrib" / "test_pdf_with_tiff_predictor.pdf") as doc:
+    with playa.open(TESTDIR / "test_pdf_with_tiff_predictor.pdf") as doc:
         image = next(doc.pages[0].images)
         # Decoded TIFF: 600 x 600 + a header
         assert len(image.stream.buffer) == 360600
-
-
-def test_bogus_stream_length() -> None:
-    with pytest.raises(PDFSyntaxError):
-        _ = playa.open(TESTDIR / "bogus-stream-length.pdf")
diff --git a/tests/test_structtree.py b/tests/test_structtree.py
index 3c4f2284..a9cf32bf 100644
--- a/tests/test_structtree.py
+++ b/tests/test_structtree.py
@@ -1,76 +1,75 @@
 import re
-import unittest
-from pathlib import Path
 
+import pytest
 import playa
+from .data import CONTRIB, TESTDIR
 
-TESTDIR = Path(__file__).parent.parent / "samples"
 
+def test_structure_tree_class() -> None:
+    with playa.open(TESTDIR / "image_structure.pdf") as pdf:
+        stree = pdf.pages[0].structtree
+        doc_elem = next(iter(stree))
+        assert [k.type for k in doc_elem] == ["P", "P", "Figure"]
 
-class TestClass(unittest.TestCase):
-    """Test the underlying Structure tree class"""
 
-    def test_structure_tree_class(self) -> None:
-        with playa.open(TESTDIR / "image_structure.pdf") as pdf:
-            stree = pdf.pages[0].structtree
-            doc_elem = next(iter(stree))
-            assert [k.type for k in doc_elem] == ["P", "P", "Figure"]
+def test_find_all_tree() -> None:
+    """
+    Test find_all() and find() on trees
+    """
+    with playa.open(TESTDIR / "image_structure.pdf") as pdf:
+        stree = pdf.pages[0].structtree
+        figs = list(stree.find_all("Figure"))
+        assert len(figs) == 1
+        fig = stree.find("Figure")
+        assert fig == figs[0]
+        assert stree.find("Fogure") is None
+        figs = list(stree.find_all(re.compile(r"Fig.*")))
+        assert len(figs) == 1
+        figs = list(stree.find_all(lambda x: x.type == "Figure"))
+        assert len(figs) == 1
+        figs = list(stree.find_all("Foogure"))
+        assert len(figs) == 0
+        figs = list(stree.find_all(re.compile(r"Fog.*")))
+        assert len(figs) == 0
+        figs = list(stree.find_all(lambda x: x.type == "Flogger"))
+        assert len(figs) == 0
 
-    def test_find_all_tree(self) -> None:
-        """
-        Test find_all() and find() on trees
-        """
-        with playa.open(TESTDIR / "image_structure.pdf") as pdf:
-            stree = pdf.pages[0].structtree
-            figs = list(stree.find_all("Figure"))
-            assert len(figs) == 1
-            fig = stree.find("Figure")
-            assert fig == figs[0]
-            assert stree.find("Fogure") is None
-            figs = list(stree.find_all(re.compile(r"Fig.*")))
-            assert len(figs) == 1
-            figs = list(stree.find_all(lambda x: x.type == "Figure"))
-            assert len(figs) == 1
-            figs = list(stree.find_all("Foogure"))
-            assert len(figs) == 0
-            figs = list(stree.find_all(re.compile(r"Fog.*")))
-            assert len(figs) == 0
-            figs = list(stree.find_all(lambda x: x.type == "Flogger"))
-            assert len(figs) == 0
 
-    def test_find_all_element(self) -> None:
-        """
-        Test find_all() and find() on elements
-        """
-        with playa.open(TESTDIR / "pdf_structure.pdf") as pdf:
-            stree = pdf.structtree
-            for list_elem in stree.find_all("L"):
-                items = list(list_elem.find_all("LI"))
-                assert items
-                for item in items:
-                    body = list(item.find_all("LBody"))
-                    assert body
-                    body1 = item.find("LBody")
-                    assert body1 == body[0]
-                    assert item.find("Loonie") is None
+def test_find_all_element() -> None:
+    """
+    Test find_all() and find() on elements
+    """
+    with playa.open(TESTDIR / "pdf_structure.pdf") as pdf:
+        stree = pdf.structtree
+        for list_elem in stree.find_all("L"):
+            items = list(list_elem.find_all("LI"))
+            assert items
+            for item in items:
+                body = list(item.find_all("LBody"))
+                assert body
+                body1 = item.find("LBody")
+                assert body1 == body[0]
+                assert item.find("Loonie") is None
 
-    def test_all_mcids(self) -> None:
-        """
-        Test all_mcids()
-        """
-        with playa.open(TESTDIR / "2023-06-20-PV.pdf") as pdf:
-            # Make sure we can get them with page numbers
-            stree = pdf.structtree
-            sect = next(stree.find_all("Sect"))
-            mcids = list(sect.all_mcids())
-            page_indices = set(page for page, mcid in mcids)
-            assert 0 in page_indices
-            assert 1 in page_indices
 
-            stree = pdf.pages[1].structtree
-            sect = next(stree.find_all("Sect"))
-            mcids = list(sect.all_mcids())
-            page_indices = set(page for page, mcid in mcids)
-            assert page_indices == {1}
-            for p in sect.find_all("P"):
-                assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)
+@pytest.mark.skipif(not CONTRIB.exists(), reason="contrib samples not present")
+def test_all_mcids() -> None:
+    """
+    Test all_mcids()
+    """
+    with playa.open(CONTRIB / "2023-06-20-PV.pdf") as pdf:
+        # Make sure we can get them with page numbers
+        stree = pdf.structtree
+        sect = next(stree.find_all("Sect"))
+        mcids = list(sect.all_mcids())
+        page_indices = set(page for page, mcid in mcids)
+        assert 0 in page_indices
+        assert 1 in page_indices
+
+        stree = pdf.pages[1].structtree
+        sect = next(stree.find_all("Sect"))
+        mcids = list(sect.all_mcids())
+        page_indices = set(page for page, mcid in mcids)
+        assert page_indices == {1}
+        for p in sect.find_all("P"):
+            assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)