diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml
index 251e817..b49c520 100644
--- a/.github/workflows/package.yml
+++ b/.github/workflows/package.yml
@@ -13,9 +13,6 @@ jobs:
     - uses: pypa/cibuildwheel@v2.22.0
       with:
         output-dir: wheelhouse
-      env:
-        CIBW_BUILD: cp* pp*
-        CIBW_TEST_COMMAND: python -m unittest discover -s {package}/tests -t {package}
     - uses: actions/upload-artifact@v4
       with:
         name: wheels-${{ matrix.os }}-${{ strategy.job-index }}
diff --git a/.gitignore b/.gitignore
index 5370168..f2070e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@
 /dist/
 /src/*.egg-info
 /src/*.so
+
+__pycache__
diff --git a/pyproject.toml b/pyproject.toml
index 39ed48d..65029e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pysorteddict"
-version = "0.1.0"
+version = "0.2.0"
 authors = [
     {name = "Vishal Pankaj Chandratreya"},
 ]
@@ -36,14 +36,20 @@ classifiers = [
 "Repository" = "https://github.com/tfpf/pysorteddict"
 "Bug Tracker" = "https://github.com/tfpf/pysorteddict/issues"
 
+[tool.cibuildwheel]
+build = "cp* pp*"
+test-command = "pytest {package}"
+test-requires = ["pytest"]
+
 # No use of an editable installation. If the code is edited, it has to be
 # recompiled, and the package has to be reinstalled.
 [tool.hatch.envs.default]
 dev-mode = false
+dependencies = ["pytest"]
 
 [tool.ruff]
 line-length = 119
 
 [tool.ruff.lint.per-file-ignores]
 "examples/*" = ["T201"]
-"tests/*" = ["PT009", "PT027", "S311"]
+"tests/*" = ["PLR2004", "S101", "S311"]
diff --git a/tests/test_int_keys.py b/tests/test_int_keys.py
deleted file mode 100644
index f0ae71e..0000000
--- a/tests/test_int_keys.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import unittest
-
-from tests.utils import TestGenericKeys
-
-
-class TestIntKeys(TestGenericKeys, unittest.TestCase):
-    """Test a sorted dictionary with ``int`` keys."""
-
-    def small_key(self):
-        return self.rg.randrange(1000, 2000)
-
-    def large_key(self):
-        return self.rg.randrange(2000, 3000)
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.wrong_argument = "key must be of type <class 'int'>"
-
-    def setUp(self):
-        super().setUp(int)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_invalid_construction.py b/tests/test_invalid_construction.py
index 8fe071b..5fbaa10 100644
--- a/tests/test_invalid_construction.py
+++ b/tests/test_invalid_construction.py
@@ -1,26 +1,22 @@
-import unittest
+import pytest
 
 from pysorteddict import SortedDict
 
 
-class TestInvalidConstruction(unittest.TestCase):
-    """Test invalid construction of a sorted dictionary."""
+def test_no_arguments():
+    with pytest.raises(TypeError) as ctx:
+        SortedDict()
+    assert ctx.value.args[0] == "function missing required argument 'key_type' (pos 1)"
 
-    @classmethod
-    def setUpClass(cls):
-        cls.missing_argument = "function missing required argument 'key_type' (pos 1)"
-        cls.wrong_argument = "constructor argument must be a supported type"
 
-    def test_construct_without_argument(self):
-        with self.assertRaises(TypeError) as ctx:
-            SortedDict()
-        self.assertEqual(self.missing_argument, ctx.exception.args[0])
+def test_superfluous_arguments():
+    with pytest.raises(TypeError) as ctx:
+        SortedDict(object, object)
+    assert ctx.value.args[0] == "function takes at most 1 argument (2 given)"
 
-    def test_construct_with_object_instance(self):
-        with self.assertRaises(TypeError) as ctx:
-            SortedDict(object())
-        self.assertEqual(self.wrong_argument, ctx.exception.args[0])
 
-
-if __name__ == "__main__":
-    unittest.main()
+@pytest.mark.parametrize("key_type", [object, object(), 63, 5.31, "effort", b"salt", ["hear", 0x5EE], (1.61, "taste")])
+def test_wrong_type(key_type):
+    with pytest.raises(TypeError) as ctx:
+        SortedDict(key_type)
+    assert ctx.value.args[0] == "constructor argument must be a supported type"
diff --git a/tests/test_methods.py b/tests/test_methods.py
new file mode 100644
index 0000000..a3f6e90
--- /dev/null
+++ b/tests/test_methods.py
@@ -0,0 +1,224 @@
+import builtins
+import platform
+import random
+import sys
+
+import pytest
+
+from pysorteddict import SortedDict
+
+# Reference counting is specific to CPython, so record this for later.
+cpython = platform.python_implementation() == "CPython"
+
+
+class Resources:
+    """
+    Store resources used to generate similar test cases for different key
+    types.
+    """
+
+    def __init__(self, key_type: type):
+        self.key_type = key_type
+        self.key_subtype = type("sub" + self.key_type.__name__, (self.key_type,), {})
+
+        self.rg = random.Random(__name__)
+        self.keys = [self.gen() for _ in range(1000)]
+        self.values = [self.gen() for _ in self.keys]
+        self.normal_dict = dict(zip(self.keys, self.values, strict=True))
+
+        sorted_dict = SortedDict(self.key_type)
+        for key, value in zip(self.keys, self.values, strict=True):
+            sorted_dict[key] = value
+        self.sorted_dicts = [sorted_dict, sorted_dict.copy()]
+
+        # Store the reference count of an item in a list at the position at
+        # which it appears in the normal dictionary. The reference counts are
+        # all 4, but querying the reference count increases it, so I store 5.
+        # Whenever a test changes the reference count of any item, I set the
+        # new reference count at its index.
+        self.keys = [*self.normal_dict]
+        self.keys_refcounts = [5] * len(self.keys)
+        self.values = [*self.normal_dict.values()]
+        self.values_refcounts = [5] * len(self.values)
+
+    def gen(self, *, small: bool = True) -> int:
+        """
+        Generate a key or value for a dictionary. It will be a new object (i.e.
+        not an interned one).
+
+        :param small: Whether to generate a small (rather than a large)
+        key/value. (A small one will never be equal to a large one.)
+
+        :return: Random result.
+        """
+        match self.key_type:
+            # A bare name such as ``int`` would be a capture pattern: it matches
+            # anything and makes the remaining cases unreachable (a syntax
+            # error). A dotted name is a value pattern, so the type has to be
+            # spelt ``builtins.int`` to be compared against.
+            case builtins.int:
+                if small:
+                    return self.rg.randrange(1000, 2000)
+                return self.rg.randrange(2000, 3000)
+
+            case _:
+                raise RuntimeError
+
+
+@pytest.fixture
+def resources(request):
+    """
+    Create test resources for the given key type (passed as a parameter to this
+    fixture).
+    """
+    resources = Resources(request.param)
+    yield resources
+
+    # Tearing down: verify the reference counts.
+    if cpython:
+        for observed, expected in zip(map(sys.getrefcount, resources.keys), resources.keys_refcounts, strict=False):
+            assert observed == expected
+        for observed, expected in zip(
+            map(sys.getrefcount, resources.values), resources.values_refcounts, strict=False
+        ):
+            assert observed == expected
+
+
+@pytest.fixture
+def sorted_dict(request, resources):
+    """
+    Obtain either the sorted dictionary or its copy based on the index (passed
+    as a parameter to this fixture). The aim is to test both it and its copy
+    with the same rigour.
+    """
+    sorted_dict = resources.sorted_dicts[request.param]
+    yield sorted_dict
+
+    # Tearing down: verify some non-mutating methods.
+    assert len(sorted_dict) == len(resources.normal_dict)
+    assert str(sorted_dict) == str(dict(sorted(resources.normal_dict.items())))
+    assert sorted_dict.items() == sorted(resources.normal_dict.items())
+    assert sorted_dict.keys() == sorted(resources.normal_dict)
+    assert sorted_dict.values() == [item[1] for item in sorted(resources.normal_dict.items())]
+
+
+# Run each test with each key type, and on the sorted dictionary and its copy.
+pytestmark = [
+    pytest.mark.parametrize("resources", [int], indirect=True),
+    pytest.mark.parametrize("sorted_dict", [0, 1], ids=["original", "copy"], indirect=True),
+]
+
+
+def test_contains_wrong_type(resources, sorted_dict):
+    assert resources.key_subtype() not in sorted_dict
+
+
+def test_contains_no(resources, sorted_dict):
+    key = resources.gen(small=False)
+    assert key not in sorted_dict
+
+    if cpython:
+        assert sys.getrefcount(key) == 2
+
+
+def test_contains_yes(resources, sorted_dict):
+    key = resources.rg.choice(resources.keys)
+    assert key in sorted_dict
+
+    if cpython:
+        assert sys.getrefcount(key) == 6
+
+
+def test_getitem_wrong_type(resources, sorted_dict):
+    with pytest.raises(TypeError) as ctx:
+        sorted_dict[resources.key_subtype()]
+    assert ctx.value.args[0] == f"key must be of type {resources.key_type!r}"
+
+
+def test_getitem_missing(resources, sorted_dict):
+    key = resources.gen(small=False)
+    with pytest.raises(KeyError) as ctx:
+        sorted_dict[key]
+    assert ctx.value.args[0] == key
+
+    if cpython:
+        assert sys.getrefcount(key) == 3
+
+
+def test_getitem_found(resources, sorted_dict):
+    key = resources.rg.choice(resources.keys)
+    value = sorted_dict[key]
+    assert value == resources.normal_dict[key]
+
+    if cpython:
+        assert sys.getrefcount(key) == 6
+        assert sys.getrefcount(value) == 6
+
+
+def test_delitem_wrong_type(resources, sorted_dict):
+    with pytest.raises(TypeError) as ctx:
+        del sorted_dict[resources.key_subtype()]
+    assert ctx.value.args[0] == f"key must be of type {resources.key_type!r}"
+
+
+def test_delitem_missing(resources, sorted_dict):
+    key = resources.gen(small=False)
+    with pytest.raises(KeyError) as ctx:
+        del sorted_dict[key]
+    assert ctx.value.args[0] == key
+
+    if cpython:
+        assert sys.getrefcount(key) == 3
+
+
+def test_delitem_found(resources, sorted_dict):
+    idx, key = resources.rg.choice([*enumerate(resources.keys)])
+    del resources.normal_dict[key]
+    del sorted_dict[key]
+    assert key not in sorted_dict
+
+    if cpython:
+        resources.keys_refcounts[idx] -= 2
+        resources.values_refcounts[idx] -= 2
+
+
+def test_setitem_wrong_type(resources, sorted_dict):
+    value = resources.gen()
+    with pytest.raises(TypeError) as ctx:
+        sorted_dict[resources.key_subtype()] = value
+    assert ctx.value.args[0] == f"key must be of type {resources.key_type!r}"
+
+    if cpython:
+        assert sys.getrefcount(value) == 2
+
+
+def test_setitem_insert(resources, sorted_dict):
+    key, value = resources.gen(small=False), resources.gen()
+    resources.normal_dict[key] = value
+    sorted_dict[key] = value
+    assert sorted_dict[key] == value
+
+    if cpython:
+        assert sys.getrefcount(key) == 4
+        assert sys.getrefcount(value) == 4
+
+
+def test_setitem_overwrite(resources, sorted_dict):
+    idx, key = resources.rg.choice([*enumerate(resources.keys)])
+    value = resources.gen()
+    resources.normal_dict[key] = value
+    sorted_dict[key] = value
+    assert sorted_dict[key] == value
+
+    if cpython:
+        assert sys.getrefcount(value) == 4
+        resources.values_refcounts[idx] -= 2
+
+
+def test_clear(resources, sorted_dict):
+    resources.normal_dict.clear()
+    sorted_dict.clear()
+
+    if cpython:
+        resources.keys_refcounts = [3] * len(resources.keys)
+        resources.values_refcounts = [3] * len(resources.values)
diff --git a/tests/utils.py b/tests/utils.py
deleted file mode 100644
index 45af1ac..0000000
--- a/tests/utils.py
+++ /dev/null
@@ -1,266 +0,0 @@
-import platform
-import random
-import sys
-
-from pysorteddict import SortedDict
-
-
-class TestGenericKeys:
-    """
-    Subclass this class to produce similar test cases for each key type. Name
-    it first in the inheritance list to ensure correct method resolution.
-    """
-
-    def small_key(self):
-        """
-        Override this method to generate a random key. The set of possible
-        outputs of this function should be disjoint with that of ``large_key``.
-        The key should be a new object rather than an interned one.
-        """
-
-    def large_key(self):
-        """
-        Override this method to generate a random key. The set of possible
-        outputs of this function should be disjoint with that of ``small_key``.
-        The key should be a new object rather than an interned one.
-        """
-
-    @classmethod
-    def setUpClass(cls):
-        cls.cpython = platform.python_implementation() == "CPython"
-        cls.wrong_argument = ""
-
-    def setUp(self, key_type: type):
-        self.key_type = key_type
-        self.key_subtype = type("sub" + self.key_type.__name__, (self.key_type,), {})
-        self.rg = random.Random(__name__)
-        self.keys = [self.small_key() for _ in range(1000)]
-        self.values = [self.small_key() for _ in self.keys]
-        self.normal_dict = dict(zip(self.keys, self.values, strict=True))
-        self.sorted_dict = SortedDict(self.key_type)
-        for key, value in zip(self.keys, self.values, strict=True):
-            self.sorted_dict[key] = value
-
-        # Store the reference count of an item in a list at the position at
-        # which it appears in the normal dictionary. At this point, the
-        # reference counts are all 3, but querying the reference count
-        # increases it, so I store 4. Whenever a test changes the reference
-        # count of any item, I set the new reference count at its index.
-        # Remember that reference counting is specific to the CPython
-        # implementation.
-        if self.cpython:
-            self.keys_refcounts = [4] * len(self.normal_dict)
-            self.values_refcounts = [4] * len(self.normal_dict)
-
-    def test_contains_wrong_type(self):
-        self.assertTrue(self.key_subtype() not in self.sorted_dict)
-
-    def test_contains_false(self):
-        key = self.large_key()
-        self.assertTrue(key not in self.sorted_dict)
-
-        if self.cpython:
-            self.assertEqual(2, sys.getrefcount(key))
-
-    def test_contains_true(self):
-        key = self.rg.choice(self.keys)
-        self.assertTrue(key in self.sorted_dict)
-
-        if self.cpython:
-            self.assertEqual(5, sys.getrefcount(key))
-
-    def test_len(self):
-        self.assertEqual(len(self.normal_dict), len(self.sorted_dict))
-
-    def test_getitem_wrong_type(self):
-        with self.assertRaises(TypeError) as ctx:
-            self.sorted_dict[self.key_subtype()]
-        self.assertEqual(self.wrong_argument, ctx.exception.args[0])
-
-    def test_getitem_not_found(self):
-        key = self.large_key()
-        with self.assertRaises(KeyError) as ctx:
-            self.sorted_dict[key]
-        self.assertEqual(key, ctx.exception.args[0])
-
-        if self.cpython:
-            self.assertEqual(3, sys.getrefcount(key))
-
-    def test_getitem(self):
-        key = self.rg.choice(self.keys)
-        value = self.sorted_dict[key]
-        self.assertEqual(self.normal_dict[key], value)
-
-        if self.cpython:
-            self.assertEqual(5, sys.getrefcount(key))
-            self.assertEqual(5, sys.getrefcount(value))
-
-    def test_setitem_wrong_type(self):
-        value = self.small_key()
-        with self.assertRaises(TypeError) as ctx:
-            self.sorted_dict[self.key_subtype()] = value
-        self.assertEqual(self.wrong_argument, ctx.exception.args[0])
-
-        if self.cpython:
-            self.assertEqual(2, sys.getrefcount(value))
-
-    def test_setitem_existing(self):
-        idx, key = self.rg.choice([*enumerate(self.normal_dict)])
-        value = self.small_key()
-        self.sorted_dict[key] = value
-        self.assertEqual(value, self.sorted_dict[key])
-
-        if self.cpython:
-            self.assertEqual(5, sys.getrefcount(key))
-            self.assertEqual(3, sys.getrefcount(value))
-            self.values_refcounts[idx] -= 1
-
-    def test_setitem_new(self):
-        key = self.large_key()
-        value = self.small_key()
-        self.sorted_dict[key] = value
-        self.assertEqual(value, self.sorted_dict[key])
-
-        if self.cpython:
-            self.assertEqual(3, sys.getrefcount(key))
-            self.assertEqual(3, sys.getrefcount(value))
-
-    def test_setitem_remove_not_found(self):
-        key = self.large_key()
-        with self.assertRaises(KeyError) as ctx:
-            del self.sorted_dict[key]
-        self.assertEqual(key, ctx.exception.args[0])
-
-        if self.cpython:
-            self.assertEqual(3, sys.getrefcount(key))
-
-    def test_setitem_remove_existing(self):
-        idx, key = self.rg.choice([*enumerate(self.normal_dict)])
-        del self.sorted_dict[key]
-        with self.assertRaises(KeyError) as ctx:
-            self.sorted_dict[key]
-        self.assertEqual(key, ctx.exception.args[0])
-
-        if self.cpython:
-            self.assertEqual(5, sys.getrefcount(key))
-            self.keys_refcounts[idx] -= 1
-            self.values_refcounts[idx] -= 1
-
-    def test_str(self):
-        self.assertEqual(str(dict(sorted(self.normal_dict.items()))), str(self.sorted_dict))
-
-    def test_str_preserved(self):
-        self._str = str(self.sorted_dict)
-        self.assertEqual(str(dict(sorted(self.normal_dict.items()))), self._str)
-
-        if self.cpython:
-            self.assertEqual(2, sys.getrefcount(self._str))
-
-    def test_clear(self):
-        self.sorted_dict.clear()
-        self.assertEqual("{}", str(self.sorted_dict))
-        self.assertEqual(0, len(self.sorted_dict))
-
-        if self.cpython:
-            self.keys_refcounts = [3] * len(self.normal_dict)
-            self.values_refcounts = [3] * len(self.normal_dict)
-
-    def test_items(self):
-        self.assertEqual(sorted(self.normal_dict.items()), self.sorted_dict.items())
-
-    def test_items_preserved(self):
-        self._items = self.sorted_dict.items()
-        self.assertEqual(sorted(self.normal_dict.items()), self._items)
-
-        if self.cpython:
-            self.assertEqual(2, sys.getrefcount(self._items))
-            self.keys_refcounts = [5] * len(self.normal_dict)
-            self.values_refcounts = [5] * len(self.normal_dict)
-
-    def test_keys(self):
-        self.assertEqual(sorted(self.normal_dict.keys()), self.sorted_dict.keys())
-
-    def test_keys_preserved(self):
-        self._keys = self.sorted_dict.keys()
-        self.assertEqual(sorted(self.normal_dict.keys()), self._keys)
-
-        if self.cpython:
-            self.assertEqual(2, sys.getrefcount(self._keys))
-            self.keys_refcounts = [5] * len(self.normal_dict)
-
-    def test_values(self):
-        self.assertEqual([item[1] for item in sorted(self.normal_dict.items())], self.sorted_dict.values())
-
-    def test_values_preserved(self):
-        self._values = self.sorted_dict.values()
-        self.assertEqual([item[1] for item in sorted(self.normal_dict.items())], self._values)
-
-        if self.cpython:
-            self.assertEqual(2, sys.getrefcount(self._values))
-            self.values_refcounts = [5] * len(self.normal_dict)
-
-    def test_del(self):
-        del self.sorted_dict
-
-        if self.cpython:
-            self.keys_refcounts = [3] * len(self.normal_dict)
-            self.values_refcounts = [3] * len(self.normal_dict)
-
-    def test_stress(self):
-        self.normal_dict = {}
-        self.sorted_dict = SortedDict(self.key_type)
-        for method in self.rg.choices(["__contains__", "__delitem__", "__getitem__", "__setitem__", "clear"], k=10000):
-            key, value = self.small_key(), self.small_key()
-            match method:
-                case "__contains__":
-                    self.assertEqual(key in self.normal_dict, key in self.sorted_dict)
-
-                case "__delitem__":
-                    try:
-                        del self.normal_dict[key]
-                    except KeyError as exc:
-                        with self.assertRaises(KeyError) as ctx:
-                            del self.sorted_dict[key]
-                        self.assertEqual(exc.args[0], ctx.exception.args[0])
-                    else:
-                        del self.sorted_dict[key]
-
-                case "__getitem__":
-                    try:
-                        value = self.normal_dict[key]
-                    except KeyError as exc:
-                        with self.assertRaises(KeyError) as ctx:
-                            self.sorted_dict[key]
-                        self.assertEqual(exc.args[0], ctx.exception.args[0])
-                    else:
-                        self.assertEqual(value, self.sorted_dict[key])
-
-                case "__setitem__":
-                    self.normal_dict[key] = value
-                    self.sorted_dict[key] = value
-                    self.assertEqual(value, self.sorted_dict[key])
-
-                case "clear":
-                    self.normal_dict.clear()
-                    self.sorted_dict.clear()
-
-            # I know this is bad, but can't think of a better way to avoid
-            # repeating code.
-            self.test_len()
-            self.test_str()
-            self.test_items()
-            self.test_keys()
-            self.test_values()
-
-        if self.cpython:
-            self.keys_refcounts = [3] * len(self.normal_dict)
-            self.values_refcounts = [3] * len(self.normal_dict)
-
-    def tearDown(self):
-        if self.cpython:
-            for expected, observed in zip(self.keys_refcounts, map(sys.getrefcount, self.normal_dict), strict=True):
-                self.assertEqual(expected, observed)
-            for expected, observed in zip(
-                self.values_refcounts, map(sys.getrefcount, self.normal_dict.values()), strict=True
-            ):
-                self.assertEqual(expected, observed)