diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 484cf2ae2..53e2c419e 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -21,6 +21,9 @@
 * Lightning-Kokkos migrated to the new device API.
   [(#810)](https://github.com/PennyLaneAI/pennylane-lightning/pull/810)
 
+* Lightning-GPU migrated to the new device API.
+  [(#853)](https://github.com/PennyLaneAI/pennylane-lightning/pull/853)
+
 ### Breaking changes
 
 * Deprecate PI gates implementation.
diff --git a/.github/workflows/wheel_noarch.yml b/.github/workflows/wheel_noarch.yml
index 11460cac1..0414fcd7b 100644
--- a/.github/workflows/wheel_noarch.yml
+++ b/.github/workflows/wheel_noarch.yml
@@ -50,7 +50,6 @@ jobs:
         if: ${{ matrix.pl_backend == 'lightning_qubit'}}
         uses: actions/checkout@v4
 
-
       - uses: actions/setup-python@v5
         if: ${{ matrix.pl_backend == 'lightning_qubit'}}
         with:
diff --git a/MANIFEST.in b/MANIFEST.in
index 4c1a79b51..23ba93b56 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,7 +3,7 @@ include cmake/*
 include requirements.txt
 include .github/CHANGELOG.md
 include pennylane_lightning/lightning_qubit/lightning_qubit.toml
-include pennylane_lightning/lightning_qpu/lightning_gpu.toml
+include pennylane_lightning/lightning_gpu/lightning_gpu.toml
 include pennylane_lightning/lightning_kokkos/lightning_kokkos.toml
 include pennylane_lightning/core/_version.py
 graft pennylane_lightning/core/src/
diff --git a/Makefile b/Makefile
index f43c9e903..5973200c5 100644
--- a/Makefile
+++ b/Makefile
@@ -35,9 +35,11 @@ help:
 	@echo "  test-cpp [verbose=1]     to run the C++ test suite (requires CMake)"
 	@echo "                           use with 'verbose=1' for building with verbose flag"
 	@echo "  test-cpp [target=?]      to run a specific C++ test target (requires CMake)."
+	@echo "  test-cpp-mpi [backend=?] to run the C++ test suite with MPI (requires CMake and MPI)"
+	@echo "                           Default: lightning_gpu"
 	@echo "  test-python [device=?]   to run the Python test suite"
 	@echo "                           Default: lightning.qubit"
-	@echo "  wheel [backend=?]        to configure and build Python wheels
+	@echo "  wheel [backend=?]        to configure and build Python wheels"
 	@echo "                           Default: lightning_qubit"
 	@echo "  coverage [device=?]      to generate a coverage report for python interface"
 	@echo "                           Default: lightning.qubit"
@@ -98,7 +100,7 @@ coverage-cpp:
 	lcov --directory . -b ../pennylane_lightning/core/src/ --capture --output-file coverage.info; \
 	genhtml coverage.info --output-directory out
 
-.PHONY: test-python test-builtin test-suite test-cpp
+.PHONY: test-python test-builtin test-suite test-cpp test-cpp-mpi
 test-python: test-builtin test-suite
 
 test-builtin:
@@ -124,6 +126,31 @@ else
 	cmake --build ./BuildTests $(VERBOSE) --target test
 endif
 
+# Build the C++ tests with MPI enabled and run each test runner on two ranks.
+# `backend` selects the Lightning backend (default: lightning_gpu); `target`
+# limits the build and run to one test executable. The loop exits on the first
+# failing runner so that the recipe reports the failure.
+test-cpp-mpi:
+	rm -rf ./BuildTests
+	cmake -BBuildTests -G Ninja \
+		  -DCMAKE_BUILD_TYPE=Debug \
+		  -DBUILD_TESTS=ON \
+		  -DENABLE_WARNINGS=ON \
+		  -DPL_BACKEND=$(or $(backend),lightning_gpu) \
+		  -DENABLE_MPI=ON \
+		  $(OPTIONS)
+ifdef target
+	cmake --build ./BuildTests $(VERBOSE) --target $(target)
+	mpirun -np 2 ./BuildTests/$(target)
+else
+	cmake --build ./BuildTests $(VERBOSE)
+	for file in ./BuildTests/*_test_runner_mpi; do \
+		echo "Running $$file"; \
+		mpirun -np 2 "$$file" || exit 1; \
+	done
+endif
+
+
 .PHONY: format format-cpp format-python
 format: format-cpp format-python
 
diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
index a5162c757..405ea9764 100644
--- a/doc/lightning_gpu/device.rst
+++ b/doc/lightning_gpu/device.rst
@@ -11,9 +11,9 @@ A ``lightning.gpu`` device can be loaded using:
     import pennylane as qml
     dev = qml.device("lightning.gpu", wires=2)
 
-If the NVIDIA cuQuantum libraries are available, the above device will allow all operations to be performed on a CUDA capable GPU of generation SM 7.0 (Volta) and greater. If the libraries are not correctly installed, or available on path, the device will fall-back to ``lightning.qubit`` and perform all simulation on the CPU.
+If the NVIDIA cuQuantum libraries are available, the above device will allow all operations to be performed on a CUDA capable GPU of generation SM 7.0 (Volta) and greater. If the libraries are not correctly installed, or are not available on the path, the device will raise an error.
 
-The ``lightning.gpu`` device also directly supports quantum circuit gradients using the adjoint differentiation method. This can be enabled at the PennyLane QNode level with:
+The ``lightning.gpu`` device supports quantum circuit gradients using the adjoint differentiation method, which it uses by default. The method can also be selected explicitly at the PennyLane QNode level with:
 
 .. code-block:: python
 
@@ -281,3 +281,6 @@ To enable the memory-optimized adjoint method with MPI support, ``batch_obs`` sh
     dev = qml.device('lightning.gpu', wires= n_wires, mpi=True, batch_obs=True)
 
 For the adjoint method, each MPI process will provide the overall simulation results.
+
+.. note::
+    The ``Projector`` observable is not supported by the multi-GPU backend.
diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index a2084f2a5..552cf9f33 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -98,6 +98,13 @@ def get_device():
 # Device specification
 if device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
+
 else:
     raise qml.DeviceError(f"The MPI tests do not apply to the {device_name} device.")
 
diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py
index 6f3b5c7f5..9d56dfdb1 100644
--- a/mpitests/test_adjoint_jacobian.py
+++ b/mpitests/test_adjoint_jacobian.py
@@ -26,17 +26,15 @@
 from pennylane import QNode
 from pennylane import numpy as np
 from pennylane import qnode
+from pennylane.devices import ExecutionConfig
+from pennylane.tape import QuantumScript
 from scipy.stats import unitary_group
 
+from pennylane_lightning.lightning_gpu_ops import LightningException
+
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
-I, X, Y, Z = (
-    np.eye(2),
-    qml.PauliX.compute_matrix(),
-    qml.PauliY.compute_matrix(),
-    qml.PauliZ.compute_matrix(),
-)
 
 # Tuple passed to distributed device ctor
 # np.complex for data type and True or False
@@ -59,265 +57,255 @@ def fixture_dev(request):
     )
 
 
-def Rx(theta):
-    r"""One-qubit rotation about the x axis.
-
-    Args:
-        theta (float): rotation angle
-    Returns:
-        array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_x \theta/2}`
-    """
-    return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * X
-
-
-def Ry(theta):
-    r"""One-qubit rotation about the y axis.
-
-    Args:
-        theta (float): rotation angle
-    Returns:
-        array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_y \theta/2}`
-    """
-    return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * Y
-
-
-def Rz(theta):
-    r"""One-qubit rotation about the z axis.
-
-    Args:
-        theta (float): rotation angle
-    Returns:
-        array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_z \theta/2}`
-    """
-    return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * Z
-
-
 class TestAdjointJacobian:  # pylint: disable=too-many-public-methods
     """Tests for the adjoint_jacobian method"""
 
-    def test_not_expval(self, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_not_expval(self, dev, batch_obs):
         """Test if a QuantumFunctionError is raised for a tape with measurements that are not
         expectation values"""
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.1, wires=0)
-            qml.var(qml.PauliZ(0))
+        qs = QuantumScript([qml.RX(1.23, 0)], [qml.var(qml.PauliZ(0))], trainable_params=[0])
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
         with pytest.raises(
             qml.QuantumFunctionError, match="Adjoint differentiation method does not"
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.1, wires=0)
-            qml.state()
+        qs = QuantumScript([qml.RX(1.23, 0)], [qml.state()], trainable_params=[0])
 
-        if device_name == "lightning.gpu":
-            message = "Adjoint differentiation does not support State measurements."
-        else:
-            message = "Adjoint differentiation method does not support measurement StateMP."
         with pytest.raises(
             qml.QuantumFunctionError,
-            match=message,
+            match="Adjoint differentiation method does not support measurement StateMP.",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-    def test_finite_shots_warns(self):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_finite_shots_warns(self, dev, batch_obs):
         """Tests warning raised when finite shots specified"""
 
-        dev = qml.device(device_name, wires=8, mpi=True, shots=1)
-
-        with qml.tape.QuantumTape() as tape:
-            qml.expval(qml.PauliZ(0))
+        qs = QuantumScript(
+            [qml.RX(1.23, 0)], [qml.expval(qml.Z(0))], shots=10, trainable_params=[0]
+        )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        with pytest.warns(
-            UserWarning,
+        with pytest.raises(
+            qml.QuantumFunctionError,
             match="Requested adjoint differentiation to be computed with finite shots.",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
     def test_empty_measurements(self, dev):
         """Tests if an empty array is returned when the measurements of the tape is empty."""
 
-        with qml.tape.QuantumTape() as tape:
+        def circuit():
             qml.RX(0.4, wires=[0])
+            return qml.expval(qml.PauliZ(0))
+
+        result = QNode(circuit, dev, diff_method="adjoint")
+
+        jac = qml.grad(result)()
 
-        jac = dev.adjoint_jacobian(tape)
         assert len(jac) == 0
 
-    def test_unsupported_op(self, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_unsupported_op(self, batch_obs, dev):
         """Test if a QuantumFunctionError is raised for an unsupported operation, i.e.,
         multi-parameter operations that are not qml.Rot"""
 
-        with qml.tape.QuantumTape() as tape:
-            qml.CRot(0.1, 0.2, 0.3, wires=[0, 1])
-            qml.expval(qml.PauliZ(0))
+        qs = QuantumScript(
+            [qml.CRot(0.1, 0.2, 0.3, wires=[0, 1])],
+            [qml.expval(qml.PauliZ(0))],
+            trainable_params=[0],
+        )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
         with pytest.raises(
-            qml.QuantumFunctionError,
-            match="The CRot operation is not supported using the",
+            LightningException,
+            match="The operation is not supported using the adjoint differentiation method",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-    def test_proj_unsupported(self, dev):
+    @pytest.mark.skip("WIP: Need a deep review if LGPU accept Projector")
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_proj_unsupported(self, batch_obs, dev):
         """Test if a QuantumFunctionError is raised for a Projector observable"""
-        with qml.tape.QuantumTape() as tape:
-            qml.CRX(0.1, wires=[0, 1])
-            qml.expval(qml.Projector([0, 1], wires=[0, 1]))
+
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
+
+        qs = QuantumScript(
+            [qml.CRX(0.1, wires=[0, 1])],
+            [qml.expval(qml.Projector([0, 1], wires=[0, 1]))],
+            trainable_params=[0],
+        )
 
         with pytest.raises(
             qml.QuantumFunctionError,
             match="differentiation method does not support the Projector",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
 
-        with qml.tape.QuantumTape() as tape:
-            qml.CRX(0.1, wires=[0, 1])
-            qml.expval(qml.Projector([0], wires=[0]) @ qml.PauliZ(0))
+        qs = QuantumScript(
+            [qml.CRX(0.1, wires=[0, 1])],
+            [qml.expval(qml.Projector([0], wires=[0]) @ qml.PauliZ(0))],
+            trainable_params=[0],
+        )
 
         with pytest.raises(
             qml.QuantumFunctionError,
             match="differentiation method does not support the Projector",
         ):
-            dev.adjoint_jacobian(tape)
+            dev.compute_derivatives(qs, config)
+
+    @staticmethod
+    def tol_for_allclose(c_dtype):
+        """Return the tolerance for allclose: 1e-3 if c_dtype is complex64, otherwise 1e-7."""
+        return 1e-3 if c_dtype == np.complex64 else 1e-7
 
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
     @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("stateprep", [qml.QubitStateVector, qml.StatePrep])
-    def test_pauli_rotation_gradient(self, stateprep, G, theta, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_pauli_rotation_gradient(
+        self, stateprep, G, theta, batch_obs, dev
+    ):  # pylint: disable=too-many-arguments
         """Tests that the automatic gradients of Pauli rotations are correct."""
         random_state = np.array(
             [0.43593284 - 0.02945156j, 0.40812291 + 0.80158023j], requires_grad=False
         )
 
-        tape = qml.tape.QuantumScript(
-            [stateprep(random_state, 0), G(theta, 0)], [qml.expval(qml.PauliZ(0))]
+        qs = QuantumScript(
+            [stateprep(random_state, 0), G(theta, 0)],
+            [qml.expval(qml.PauliZ(0))],
+            trainable_params=[1],
         )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        tape.trainable_params = {1}
-
-        calculated_val = dev.adjoint_jacobian(tape)
+        calculated_val = dev.compute_derivatives(qs, config)
 
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         # compare to finite differences
-        tapes, fn = qml.gradients.param_shift(tape)
+        tapes, fn = qml.gradients.param_shift(qs)
         numeric_val = fn(qml.execute(tapes, dev, None))
         assert np.allclose(calculated_val, numeric_val, atol=tol, rtol=0)
 
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
     @pytest.mark.parametrize("stateprep", [qml.QubitStateVector, qml.StatePrep])
-    def test_Rot_gradient(self, stateprep, theta, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_Rot_gradient(self, stateprep, theta, batch_obs, dev):
         """Tests that the device gradient of an arbitrary Euler-angle-parameterized gate is
         correct."""
         params = np.array([theta, theta**3, np.sqrt(2) * theta])
 
-        with qml.tape.QuantumTape() as tape:
-            stateprep(np.array([1.0, -1.0], requires_grad=False) / np.sqrt(2), wires=0)
-            qml.Rot(*params, wires=[0])
-            qml.expval(qml.PauliZ(0))
+        qs = QuantumScript(
+            [
+                stateprep(np.array([1.0, -1.0], requires_grad=False) / np.sqrt(2), wires=0),
+                qml.Rot(*params, wires=[0]),
+            ],
+            [qml.expval(qml.PauliZ(0))],
+            trainable_params=[1, 2, 3],
+        )
 
-        tape.trainable_params = {1, 2, 3}
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        calculated_val = dev.adjoint_jacobian(tape)
+        calculated_val = dev.compute_derivatives(qs, config)
 
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         # compare to finite differences
-        tapes, fn = qml.gradients.param_shift(tape)
+        tapes, fn = qml.gradients.param_shift(qs)
         numeric_val = fn(qml.execute(tapes, dev, None))
         assert np.allclose(calculated_val, numeric_val, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("par", [1, -2, 1.623, -0.051, 0])  # integers, floats, zero
-    def test_ry_gradient(self, par, tol, dev):
-        """Test that the gradient of the RY gate matches the exact analytic formula."""
-        with qml.tape.QuantumTape() as tape:
-            qml.RY(par, wires=[0])
-            qml.expval(qml.PauliX(0))
-
-        tape.trainable_params = {0}
+    @pytest.mark.parametrize("param", [1, -2, 1.623, -0.051, 0])  # integers, floats, zero
+    @pytest.mark.parametrize(
+        "rotation, meas, expected_func",
+        [
+            (qml.RY, qml.PauliX, lambda x: np.cos(x)),  # pylint: disable=unnecessary-lambda
+            (qml.RX, qml.PauliZ, lambda x: -np.sin(x)),  # pylint: disable=unnecessary-lambda
+        ],
+    )
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_r_gradient(
+        self, tol, param, rotation, meas, expected_func, batch_obs, dev
+    ):  # pylint: disable=too-many-arguments
+        """Test for the gradient of the rotation gate matches the known formula."""
 
-        # gradients
-        exact = np.cos(par)
-        grad_A = dev.adjoint_jacobian(tape)
+        qs = QuantumScript(
+            [rotation(param, wires=0)],
+            [qml.expval(meas(0))],
+            trainable_params=[0],
+        )
 
-        # different methods must agree
-        assert np.allclose(grad_A, exact, atol=tol, rtol=0)
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-    def test_rx_gradient(self, tol, dev):
-        """Test that the gradient of the RX gate matches the known formula."""
-        a = 0.7418
+        # circuit jacobians
+        dev_jacobian = dev.compute_derivatives(qs, config)
+        expected_jacobian = expected_func(param)
+        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(a, wires=0)
-            qml.expval(qml.PauliZ(0))
+    @staticmethod
+    def process_and_execute_multiple_rx(dev, params, meas, batch_obs):
+        """Compute the circuit with multiple RX gates"""
+        qs = QuantumScript(
+            [qml.RX(params[0], wires=0), qml.RX(params[1], wires=1), qml.RX(params[2], wires=2)],
+            meas,
+            trainable_params=[0, 1, 2],
+        )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
         # circuit jacobians
-        dev_jacobian = dev.adjoint_jacobian(tape)
-        expected_jacobian = -np.sin(a)
-        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+        dev_jacobian = dev.compute_derivatives(qs, config)
 
-    def test_multiple_rx_gradient_pauliz(self, tol, dev):
+        return dev_jacobian
+
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_pauliz(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result."""
         params = np.array([np.pi, np.pi / 2, np.pi / 3])
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
-
-            for idx in range(3):
-                qml.expval(qml.PauliZ(idx))
+        meas = [qml.expval(qml.PauliZ(idx)) for idx in range(3)]
 
         # circuit jacobians
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = -np.diag(np.sin(params))
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    def test_multiple_rx_gradient_hermitian(self, tol, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_hermitian(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result
         with Hermitian observable
         """
-        params = np.array([np.pi, np.pi / 2, np.pi / 3])
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
+        params = np.array([np.pi, np.pi / 2, np.pi / 3])
 
-            for idx in range(3):
-                qml.expval(qml.Hermitian([[1, 0], [0, -1]], wires=[idx]))
+        meas = [qml.expval(qml.Hermitian([[1, 0], [0, -1]], wires=[idx])) for idx in range(3)]
 
-        tape.trainable_params = {0, 1, 2}
         # circuit jacobians
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = -np.diag(np.sin(params))
 
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
-    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
-
-    def test_multiple_rx_gradient_expval_hermitian(self, tol, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_expval_hermitian(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result
         with Hermitian observable
         """
         params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
 
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
-
+        meas = [
             qml.expval(
                 qml.Hermitian(
                     [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
                     wires=[0, 2],
                 )
             )
+        ]
 
-        tape.trainable_params = {0, 1, 2}
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = np.array(
             [
                 -np.sin(params[0]) * np.cos(params[2]),
@@ -328,37 +316,31 @@ def test_multiple_rx_gradient_expval_hermitian(self, tol, dev):
 
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
-    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
-
-    def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_multiple_rx_gradient_expval_hamiltonian(self, tol, batch_obs, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result
         with Hermitian observable
         """
         params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
 
-        ham = qml.Hamiltonian(
-            [1.0, 0.3, 0.3, 0.4],
-            [
-                qml.PauliX(0) @ qml.PauliX(1),
-                qml.PauliZ(0),
-                qml.PauliZ(1),
-                qml.Hermitian(
-                    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
-                    wires=[0, 2],
-                ),
-            ],
-        )
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(params[0], wires=0)
-            qml.RX(params[1], wires=1)
-            qml.RX(params[2], wires=2)
-
-            qml.expval(ham)
+        meas = [
+            qml.expval(
+                qml.Hamiltonian(
+                    [1.0, 0.3, 0.3, 0.4],
+                    [
+                        qml.PauliX(0) @ qml.PauliX(1),
+                        qml.PauliZ(0),
+                        qml.PauliZ(1),
+                        qml.Hermitian(
+                            [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
+                            wires=[0, 2],
+                        ),
+                    ],
+                )
+            )
+        ]
 
-        tape.trainable_params = {0, 1, 2}
-        dev_jacobian = dev.adjoint_jacobian(tape)
+        dev_jacobian = self.process_and_execute_multiple_rx(dev, params, meas, batch_obs)
         expected_jacobian = (
             0.3 * np.array([-np.sin(params[0]), 0, 0])
             + 0.3 * np.array([0, -np.sin(params[1]), 0])
@@ -374,51 +356,21 @@ def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
 
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
-    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
-    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
-
-    @pytest.mark.parametrize("obs", [qml.PauliX, qml.PauliY])
     @pytest.mark.parametrize(
-        "op",
+        "meas",
         [
-            qml.RX(0.4, wires=0),
-            qml.RY(0.6, wires=0),
-            qml.RZ(0.8, wires=0),
-            qml.CRX(1.0, wires=[0, 1]),
-            qml.CRY(2.0, wires=[0, 1]),
-            qml.CRZ(3.0, wires=[0, 1]),
-            qml.Rot(0.2, -0.1, 0.2, wires=0),
+            [qml.expval(qml.PauliX(wires=0)), qml.expval(qml.PauliZ(wires=1))],
+            [qml.expval(qml.PauliY(wires=0)), qml.expval(qml.PauliZ(wires=1))],
+            [
+                qml.expval(
+                    qml.Hermitian(
+                        [[0, 0, 1, 1], [0, 1, 2, 1], [1, 2, 1, 0], [1, 1, 0, 0]],
+                        wires=[0, 1],
+                    )
+                )
+            ],
         ],
     )
-    def test_gradients_pauliz(self, op, obs, dev):
-        """Tests that the gradients of circuits match between the finite difference and device
-        methods."""
-        # op.num_wires and op.num_params must be initialized a priori
-        with qml.tape.QuantumTape() as tape:
-            qml.Hadamard(wires=0)
-            qml.RX(0.543, wires=0)
-            qml.CNOT(wires=[0, 1])
-
-            op  # pylint: disable=pointless-statement
-
-            qml.Rot(1.3, -2.3, 0.5, wires=[0])
-            qml.RZ(-0.5, wires=0)
-            qml.adjoint(qml.RY(0.5, wires=1), lazy=False)
-            qml.CNOT(wires=[0, 1])
-
-            qml.expval(obs(wires=0))
-            qml.expval(qml.PauliZ(wires=1))
-
-        tape.trainable_params = set(range(1, 1 + op.num_params))
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        # pylint: disable=unnecessary-direct-lambda-call
-        grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))(*qml.gradients.param_shift(tape))
-        grad_D = dev.adjoint_jacobian(tape)
-
-        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
-
     @pytest.mark.parametrize(
         "op",
         [
@@ -431,119 +383,72 @@ def test_gradients_pauliz(self, op, obs, dev):
             qml.Rot(0.2, -0.1, 0.2, wires=0),
         ],
     )
-    def test_gradients_hermitian(self, op, dev):
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_gradients_pauliz_hermitian(self, op, meas, batch_obs, dev):
         """Tests that the gradients of circuits match between the finite difference and device
         methods."""
         # op.num_wires and op.num_params must be initialized a priori
-        with qml.tape.QuantumTape() as tape:
-            qml.Hadamard(wires=0)
-            qml.RX(0.543, wires=0)
-            qml.CNOT(wires=[0, 1])
-
-            op.queue()
-
-            qml.Rot(1.3, -2.3, 0.5, wires=[0])
-            qml.RZ(-0.5, wires=0)
-            qml.adjoint(qml.RY(0.5, wires=1), lazy=False)
-            qml.CNOT(wires=[0, 1])
-
-            qml.expval(
-                qml.Hermitian(
-                    [[0, 0, 1, 1], [0, 1, 2, 1], [1, 2, 1, 0], [1, 1, 0, 0]],
-                    wires=[0, 1],
-                )
-            )
-
-        tape.trainable_params = set(range(1, 1 + op.num_params))
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        # pylint: disable=unnecessary-direct-lambda-call
-        grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))(*qml.gradients.param_shift(tape))
-        grad_D = dev.adjoint_jacobian(tape)
-
-        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
-
-    def test_gradient_gate_with_multiple_parameters_pauliz(self, dev):
-        """Tests that gates with multiple free parameters yield correct gradients."""
-        x, y, z = [0.5, 0.3, -0.7]
-
-        tape = qml.tape.QuantumScript(
+        qs = QuantumScript(
             [
-                qml.RX(0.4, wires=[0]),
-                qml.Rot(x, y, z, wires=[0]),
-                qml.RY(-0.2, wires=[0]),
+                qml.Hadamard(wires=0),
+                qml.RX(0.543, wires=0),
+                qml.CNOT(wires=[0, 1]),
+                op,
+                qml.Rot(1.3, -2.3, 0.5, wires=[0]),
+                qml.RZ(-0.5, wires=0),
+                qml.adjoint(qml.RY(0.5, wires=1), lazy=False),
+                qml.CNOT(wires=[0, 1]),
             ],
-            [qml.expval(qml.PauliZ(0))],
+            meas,
+            trainable_params=list(range(1, 1 + op.num_params)),
         )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        tape.trainable_params = {1, 2, 3}
+        tol = self.tol_for_allclose(dev.c_dtype)
 
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        grad_D = dev.adjoint_jacobian(tape)
-        tapes, fn = qml.gradients.param_shift(tape)
+        tapes, fn = qml.gradients.param_shift(qs)
         grad_F = fn(qml.execute(tapes, dev, None))
 
-        # gradient has the correct shape and every element is nonzero
-        assert len(grad_D) == 3
-        assert all(isinstance(v, np.ndarray) for v in grad_D)
-        assert np.count_nonzero(grad_D) == 3
-        # the different methods agree
+        # circuit jacobians
+        grad_D = dev.compute_derivatives(qs, config)
         assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
 
-    def test_gradient_gate_with_multiple_parameters_hermitian(self, dev):
-        """Tests that gates with multiple free parameters yield correct gradients."""
-        x, y, z = [0.5, 0.3, -0.7]
-
-        tape = qml.tape.QuantumScript(
+    @pytest.mark.parametrize(
+        "meas",
+        [
+            [qml.expval(qml.PauliZ(0))],
+            [qml.expval(qml.Hermitian([[0, 1], [1, 1]], wires=0))],
             [
-                qml.RX(0.4, wires=[0]),
-                qml.Rot(x, y, z, wires=[0]),
-                qml.RY(-0.2, wires=[0]),
+                qml.expval(
+                    qml.Hamiltonian(
+                        [1.0, 0.3, 0.3],
+                        [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0), qml.PauliZ(1)],
+                    )
+                )
             ],
-            [qml.expval(qml.Hermitian([[0, 1], [1, 1]], wires=0))],
-        )
-
-        tape.trainable_params = {1, 2, 3}
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-
-        grad_D = dev.adjoint_jacobian(tape)
-        tapes, fn = qml.gradients.param_shift(tape)
-        grad_F = fn(qml.execute(tapes, dev, None))
-
-        # gradient has the correct shape and every element is nonzero
-        assert len(grad_D) == 3
-        assert all(isinstance(v, np.ndarray) for v in grad_D)
-        assert np.count_nonzero(grad_D) == 3
-        # the different methods agree
-        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
-
-    def test_gradient_gate_with_multiple_parameters_hamiltonian(self, dev):
+        ],
+    )
+    @pytest.mark.parametrize("batch_obs", [True, False])
+    def test_gradient_gate_with_multiple_parameters(self, meas, batch_obs, dev):
         """Tests that gates with multiple free parameters yield correct gradients."""
         x, y, z = [0.5, 0.3, -0.7]
 
-        ham = qml.Hamiltonian(
-            [1.0, 0.3, 0.3],
-            [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0), qml.PauliZ(1)],
-        )
-
-        tape = qml.tape.QuantumScript(
+        qs = QuantumScript(
             [
                 qml.RX(0.4, wires=[0]),
                 qml.Rot(x, y, z, wires=[0]),
                 qml.RY(-0.2, wires=[0]),
             ],
-            [qml.expval(ham)],
+            meas,
+            trainable_params=[1, 2, 3],
         )
+        config = ExecutionConfig(gradient_method="adjoint", device_options={"batch_obs": batch_obs})
 
-        tape.trainable_params = {1, 2, 3}
-
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = self.tol_for_allclose(dev.c_dtype)
 
-        grad_D = dev.adjoint_jacobian(tape)
-        tapes, fn = qml.gradients.param_shift(tape)
+        # circuit jacobians
+        grad_D = dev.compute_derivatives(qs, config)
+        tapes, fn = qml.gradients.param_shift(qs)
         grad_F = fn(qml.execute(tapes, dev, None))
 
         # gradient has the correct shape and every element is nonzero
@@ -553,101 +458,45 @@ def test_gradient_gate_with_multiple_parameters_hamiltonian(self, dev):
         # the different methods agree
         assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
 
-    def test_use_device_state(self, tol, dev):
-        """Tests that when using the device state, the correct answer is still returned."""
-
-        x, y, z = [0.5, 0.3, -0.7]
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
-
-        tape.trainable_params = {1, 2, 3}
-
-        dM1 = dev.adjoint_jacobian(tape)
-
-        qml.execute([tape], dev, None)
-        dM2 = dev.adjoint_jacobian(tape, use_device_state=True)
-
-        assert np.allclose(dM1, dM2, atol=tol, rtol=0)
-
-    def test_provide_starting_state(self, tol, dev):
-        """Tests provides correct answer when provided starting state."""
-        comm = MPI.COMM_WORLD
-
-        x, y, z = [0.5, 0.3, -0.7]
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
-
-        tape.trainable_params = {1, 2, 3}
-
-        dM1 = dev.adjoint_jacobian(tape)
 
-        if device_name == "lightning.gpu":
-            local_state_vector = dev.state
-            complex_type = np.complex128 if dev.R_DTYPE == np.float64 else np.complex64
-            state_vector = np.zeros(1 << 8).astype(complex_type)
-            comm.Allgather(local_state_vector, state_vector)
-            qml.execute([tape], dev, None)
-            dM2 = dev.adjoint_jacobian(tape, starting_state=state_vector)
-            assert np.allclose(dM1, dM2, atol=tol, rtol=0)
-
-    def test_provide_wrong_starting_state(self, dev):
-        """Tests raise an exception when provided starting state mismatches."""
-        x, y, z = [0.5, 0.3, -0.7]
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
-
-        tape.trainable_params = {1, 2, 3}
+class TestAdjointJacobianQNode:
+    """Test QNode integration with the adjoint_jacobian method"""
 
-        with pytest.raises(
-            qml.QuantumFunctionError,
-            match="The number of qubits of starting_state must be the same as",
-        ):
-            dev.adjoint_jacobian(tape, starting_state=np.ones(7))
+    # Identity and Pauli matrices used by the analytic Rx/Ry/Rz helpers below
+    I = np.eye(2)
+    X = qml.PauliX.compute_matrix()
+    Y = qml.PauliY.compute_matrix()
+    Z = qml.PauliZ.compute_matrix()
 
-    @pytest.mark.skipif(
-        device_name == "lightning.gpu",
-        reason="Adjoint differentiation does not support State measurements.",
-    )
-    def test_state_return_type(self, dev):
-        """Tests raise an exception when the return type is State"""
-        with qml.tape.QuantumTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.state()
+    def Rx(self, theta):
+        r"""One-qubit rotation about the x axis.
 
-        tape.trainable_params = {0}
+        Args:
+            theta (float): rotation angle
+        Returns:
+            array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_x \theta/2}`
+        """
+        return math.cos(theta / 2) * self.I + 1j * math.sin(-theta / 2) * self.X
 
-        with pytest.raises(
-            qml.QuantumFunctionError,
-            match="Adjoint differentiation method does not support measurement StateMP.",
-        ):
-            dev.adjoint_jacobian(tape)
+    def Ry(self, theta):
+        r"""One-qubit rotation about the y axis.
 
+        Args:
+            theta (float): rotation angle
+        Returns:
+            array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_y \theta/2}`
+        """
+        return math.cos(theta / 2) * self.I + 1j * math.sin(-theta / 2) * self.Y
 
-class TestAdjointJacobianQNode:
-    """Test QNode integration with the adjoint_jacobian method"""
+    def Rz(self, theta):
+        r"""One-qubit rotation about the z axis.
 
-    @pytest.fixture(params=fixture_params)
-    def dev(self, request):
-        """Returns a PennyLane device."""
-        return qml.device(
-            device_name,
-            wires=8,
-            mpi=True,
-            c_dtype=request.param[0],
-            batch_obs=request.param[1],
-        )
+        Args:
+            theta (float): rotation angle
+        Returns:
+            array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_z \theta/2}`
+        """
+        return math.cos(theta / 2) * self.I + 1j * math.sin(-theta / 2) * self.Z
 
     def test_finite_shots_error(self):
         """Tests that an error is raised when computing the adjoint diff on a device with finite shots"""
@@ -665,6 +514,11 @@ def circ(x):
 
             qml.grad(circ)(0.1)
 
+    @staticmethod
+    def tol_for_allclose(c_dtype):
+        """Compute the tolerance for allclose"""
+        return 1e-3 if c_dtype == np.complex64 else 1e-7
+
     def test_qnode(self, mocker, dev):
         """Test that specifying diff_method allows the adjoint method to be selected"""
         args = np.array([0.54, 0.1, 0.5], requires_grad=True)
@@ -684,15 +538,15 @@ def circuit(x, y, z):
             return qml.expval(qml.PauliX(0) @ qml.PauliZ(1))
 
         qnode1 = QNode(circuit, dev, diff_method="adjoint")
-        spy = mocker.spy(dev.target_device, "adjoint_jacobian")
+        spy = mocker.spy(dev, "LightningAdjointJacobian")
 
         grad_fn = qml.grad(qnode1)
         grad_A = grad_fn(*args)
 
         spy.assert_called()
 
-        h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         qnode2 = QNode(circuit, dev, diff_method="finite-diff", h=h)
         grad_fn = qml.grad(qnode2)
@@ -726,7 +580,7 @@ def cost(p1, p2):
         zero_state = np.array([1.0, 0.0])
         cost(reused_p, other_p)
 
-        spy = mocker.spy(dev.target_device, "adjoint_jacobian")
+        spy = mocker.spy(dev, "LightningAdjointJacobian")
 
         # analytic gradient
         grad_fn = qml.grad(cost)
@@ -737,18 +591,34 @@ def cost(p1, p2):
         # manual gradient
         grad_true0 = (
             expZ(
-                Rx(reused_p) @ Rz(other_p) @ Ry(reused_p + np.pi / 2) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p + np.pi / 2)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
             - expZ(
-                Rx(reused_p) @ Rz(other_p) @ Ry(reused_p - np.pi / 2) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p - np.pi / 2)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
         ) / 2
         grad_true1 = (
             expZ(
-                Rx(reused_p + np.pi / 2) @ Rz(other_p) @ Ry(reused_p) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p + np.pi / 2)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
             - expZ(
-                Rx(reused_p - np.pi / 2) @ Rz(other_p) @ Ry(reused_p) @ Rx(extra_param) @ zero_state
+                self.Rx(reused_p - np.pi / 2)
+                @ self.Rz(other_p)
+                @ self.Ry(reused_p)
+                @ self.Rx(extra_param)
+                @ zero_state
             )
         ) / 2
         expected = grad_true0 + grad_true1  # product rule
@@ -765,10 +635,10 @@ def circuit(params):
             qml.Rot(params[1], params[0], 2 * params[0], wires=[0])
             return qml.expval(qml.PauliX(0))
 
-        spy_analytic = mocker.spy(dev.target_device, "adjoint_jacobian")
+        spy_analytic = mocker.spy(dev, "LightningAdjointJacobian")
 
-        h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         cost = QNode(circuit, dev, diff_method="finite-diff", h=h)
 
@@ -798,7 +668,7 @@ def f(params1, params2):
             qml.RY(tf.cos(params2), wires=[0])
             return qml.expval(qml.PauliZ(0))
 
-        if dev.R_DTYPE == np.float32:
+        if dev.c_dtype == np.complex64:
             tf_r_dtype = tf.float32
         else:
             tf_r_dtype = tf.float64
@@ -806,8 +676,8 @@ def f(params1, params2):
         params1 = tf.Variable(0.3, dtype=tf_r_dtype)
         params2 = tf.Variable(0.4, dtype=tf_r_dtype)
 
-        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         qnode1 = QNode(f, dev, interface="tf", diff_method="adjoint")
         qnode2 = QNode(f, dev, interface="tf", diff_method="finite-diff", h=h)
@@ -839,7 +709,7 @@ def f(params1, params2):
         params1 = torch.tensor(0.3, requires_grad=True)
         params2 = torch.tensor(0.4, requires_grad=True)
 
-        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
 
         qnode1 = QNode(f, dev, interface="torch", diff_method="adjoint")
         qnode2 = QNode(f, dev, interface="torch", diff_method="finite-diff", h=h)
@@ -861,7 +731,7 @@ def test_interface_jax(self, dev):
         jax interface"""
 
         jax = pytest.importorskip("jax")
-        if dev.R_DTYPE == np.float64:
+        if dev.c_dtype == np.complex128:
             from jax import config  # pylint: disable=import-outside-toplevel
 
             config.update("jax_enable_x64", True)
@@ -872,11 +742,13 @@ def f(params1, params2):
             qml.RY(jax.numpy.cos(params2), wires=[0])
             return qml.expval(qml.PauliZ(0))
 
-        params1 = jax.numpy.array(0.3, dev.R_DTYPE)
-        params2 = jax.numpy.array(0.4, dev.R_DTYPE)
+        r_dtype = np.float32 if dev.c_dtype == np.complex64 else np.float64
+
+        params1 = jax.numpy.array(0.3, r_dtype)
+        params2 = jax.numpy.array(0.4, r_dtype)
 
-        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
-        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        h = self.tol_for_allclose(dev.c_dtype)
+        tol = self.tol_for_allclose(dev.c_dtype)
 
         qnode_adjoint = QNode(f, dev, interface="jax", diff_method="adjoint")
         qnode_fd = QNode(f, dev, interface="jax", diff_method="finite-diff", h=h)
@@ -1379,8 +1251,8 @@ def test_qubit_unitary(dev, n_targets):
     """Tests that ``qml.QubitUnitary`` can be included in circuits differentiated with the adjoint method."""
     n_wires = len(dev.wires)
     dev_def = qml.device("default.qubit", wires=n_wires)
-    h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-    c_dtype = np.complex64 if dev.R_DTYPE == np.float32 else np.complex128
+    h = 1e-3 if dev.c_dtype == np.complex64 else 1e-7
+    c_dtype = dev.c_dtype
 
     np.random.seed(1337)
     par = 2 * np.pi * np.random.rand(n_wires)
@@ -1427,8 +1299,8 @@ def test_diff_qubit_unitary(dev, n_targets):
     """Tests that ``qml.QubitUnitary`` can be differentiated with the adjoint method."""
     n_wires = len(dev.wires)
     dev_def = qml.device("default.qubit", wires=n_wires)
-    h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
-    c_dtype = np.complex64 if dev.R_DTYPE == np.float32 else np.complex128
+    h = 1e-3 if dev.c_dtype == np.complex64 else 1e-7
+    c_dtype = dev.c_dtype
 
     np.random.seed(1337)
     par = 2 * np.pi * np.random.rand(n_wires)
diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py
index 17d91cd2d..5987626f1 100644
--- a/mpitests/test_apply.py
+++ b/mpitests/test_apply.py
@@ -34,14 +34,17 @@
 )
 
 
-def create_random_init_state(numWires, R_DTYPE, seed_value=48):
+def create_random_init_state(numWires, c_dtype, seed_value=48):
     """Returns a random initial state of a certain type."""
     np.random.seed(seed_value)
-    num_elements = 1 << numWires
-    init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand(
+
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
+    num_elements = 2**numWires
+    init_state = np.random.rand(num_elements).astype(r_dtype) + 1j * np.random.rand(
         num_elements
-    ).astype(R_DTYPE)
-    scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(R_DTYPE)
+    ).astype(r_dtype)
+    scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(r_dtype)
     init_state = init_state / scale_sum
     return init_state
 
@@ -54,16 +57,13 @@ def apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires):
     num_global_wires = commSize.bit_length() - 1
     num_local_wires = num_wires - num_global_wires
 
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
+    c_dtype = dev_mpi.c_dtype
 
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     comm.Scatter(state_vector, local_state_vector, root=0)
@@ -84,45 +84,6 @@ def circuit(*params):
     assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
-def apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires):
-    """Wrapper applying a parametric gate with the apply method."""
-    num_wires = numQubits
-    comm = MPI.COMM_WORLD
-    commSize = comm.Get_size()
-    num_global_wires = commSize.bit_length() - 1
-    num_local_wires = num_wires - num_global_wires
-
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
-
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
-
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
-    comm.Bcast(state_vector, root=0)
-
-    comm.Scatter(state_vector, local_state_vector, root=0)
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
-
-    @qml.qnode(dev_cpu)
-    def circuit(*params):
-        qml.StatePrep(state_vector, wires=range(num_wires))
-        operation(*params, wires=Wires)
-        return qml.state()
-
-    expected_output_cpu = np.array(circuit(*par)).astype(c_dtype)
-    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
-
-    dev_mpi.syncH2D(local_state_vector)
-    dev_mpi.apply([operation(*par, wires=Wires)])
-    dev_mpi.syncD2H(local_state_vector)
-
-    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
-
-
 def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
     """Wrapper applying a non-parametric gate with QNode function."""
     num_wires = numQubits
@@ -131,16 +92,13 @@ def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
     num_global_wires = commSize.bit_length() - 1
     num_local_wires = num_wires - num_global_wires
 
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
+    c_dtype = dev_mpi.c_dtype
 
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     comm.Scatter(state_vector, local_state_vector, root=0)
@@ -161,45 +119,6 @@ def circuit():
     assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
-def apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires):
-    """Wrapper applying a non-parametric gate with the apply method."""
-    num_wires = numQubits
-    comm = MPI.COMM_WORLD
-    commSize = comm.Get_size()
-    num_global_wires = commSize.bit_length() - 1
-    num_local_wires = num_wires - num_global_wires
-
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
-
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
-
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
-    comm.Bcast(state_vector, root=0)
-
-    comm.Scatter(state_vector, local_state_vector, root=0)
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
-
-    @qml.qnode(dev_cpu)
-    def circuit():
-        qml.StatePrep(state_vector, wires=range(num_wires))
-        operation(wires=Wires)
-        return qml.state()
-
-    expected_output_cpu = np.array(circuit()).astype(c_dtype)
-    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
-
-    dev_mpi.syncH2D(local_state_vector)
-    dev_mpi.apply([operation(wires=Wires)])
-    dev_mpi.syncD2H(local_state_vector)
-
-    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
-
-
 class TestApply:  # pylint: disable=missing-function-docstring,too-many-arguments
     """Tests whether the device can apply supported quantum gates."""
 
@@ -220,13 +139,11 @@ def dev_mpi(self, request):
     @pytest.mark.parametrize("Wires", [0, 1, numQubits - 2, numQubits - 1])
     def test_apply_operation_single_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ])
     @pytest.mark.parametrize("Wires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]])
     def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -240,7 +157,6 @@ def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi)
     )
     def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -254,7 +170,6 @@ def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mp
     )
     def test_apply_operation_three_wire_qnode_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]])
@@ -263,7 +178,6 @@ def test_apply_operation_1gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize("operation", [qml.Rot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
@@ -272,7 +186,6 @@ def test_apply_operation_1gatequbit_3param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CRot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
@@ -281,7 +194,6 @@ def test_apply_operation_1gatequbit_3param_cgate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize(
         "operation",
@@ -304,7 +216,6 @@ def test_apply_operation_2gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize(
         "operation",
@@ -323,7 +234,6 @@ def test_apply_operation_4gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     # BasisState test
     @pytest.mark.parametrize("operation", [qml.BasisState])
@@ -337,17 +247,17 @@ def test_state_prep(self, tol, operation, index, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
 
-        state_vector = np.zeros(1 << num_wires).astype(c_dtype)
-        expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-        local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-        local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+        state_vector = np.zeros(2**num_wires).astype(c_dtype)
+        expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+        local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -399,17 +309,17 @@ def test_qubit_state_prep(self, tol, par, Wires, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
 
-        state_vector = np.zeros(1 << num_wires).astype(c_dtype)
-        expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-        local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-        local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+        state_vector = np.zeros(2**num_wires).astype(c_dtype)
+        expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+        local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -435,17 +345,17 @@ def test_dev_reset(self, tol, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
 
-        state_vector = np.zeros(1 << num_wires).astype(c_dtype)
-        expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-        local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-        local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
+        state_vector = np.zeros(2**num_wires).astype(c_dtype)
+        expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+        local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -462,10 +372,10 @@ def circuit():
         expected_output_cpu = cpu_qnode().astype(c_dtype)
         comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
 
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state()
 
         gpumpi_qnode = qml.QNode(circuit, dev_mpi)
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state()
 
         local_state_vector = gpumpi_qnode()
         assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
@@ -474,8 +384,8 @@ def circuit():
 class TestSparseHamExpval:  # pylint: disable=too-few-public-methods,missing-function-docstring
     """Tests sparse hamiltonian expectation values."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sparse_hamiltonian_expectation(self, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sparse_hamiltonian_expectation(self, c_dtype):
         comm = MPI.COMM_WORLD
         commSize = comm.Get_size()
         num_global_wires = commSize.bit_length() - 1
@@ -496,32 +406,38 @@ def test_sparse_hamiltonian_expectation(self, C_DTYPE):
                 0.3 + 0.3j,
                 0.3 + 0.5j,
             ],
-            dtype=C_DTYPE,
+            dtype=c_dtype,
         )
 
-        local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE)
+        state_vector /= np.linalg.norm(state_vector)
+
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
         comm.Scatter(state_vector, local_state_vector, root=0)
 
-        dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=C_DTYPE)
-        dev_mpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=C_DTYPE)
+        H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3))
 
-        dev_mpi.syncH2D(local_state_vector)
-        dev_gpu.syncH2D(state_vector)
+        def circuit():
+            qml.StatePrep(state_vector, wires=range(3))
+            return qml.expval(H_sparse)
 
-        H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3))
+        dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=c_dtype)
+        gpu_qnode = qml.QNode(circuit, dev_gpu)
+        expected_output_gpu = gpu_qnode()
+        comm.Bcast(np.array(expected_output_gpu), root=0)
 
-        comm.Barrier()
+        dev_mpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=c_dtype)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+        expected_output_mpi = mpi_qnode()
 
-        res = dev_mpi.expval(H_sparse)
-        expected = dev_gpu.expval(H_sparse)
+        comm.Barrier()
 
-        assert np.allclose(res, expected)
+        assert np.allclose(expected_output_mpi, expected_output_gpu)
 
 
 class TestExpval:
     """Tests that expectation values are properly calculated or that the proper errors are raised."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
     @pytest.mark.parametrize(
         "operation",
         [
@@ -533,7 +449,7 @@ class TestExpval:
         ],
     )
     @pytest.mark.parametrize("wires", [0, 1, 2, numQubits - 3, numQubits - 2, numQubits - 1])
-    def test_expval_single_wire_no_parameters(self, tol, operation, wires, C_DTYPE):
+    def test_expval_single_wire_no_parameters(self, tol, operation, wires, c_dtype):
         """Tests that expectation values are properly calculated for single-wire observables without parameters."""
         num_wires = numQubits
         comm = MPI.COMM_WORLD
@@ -541,14 +457,14 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires, C_DTYPE):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        dev_mpi = qml.device("lightning.gpu", wires=numQubits, mpi=True, c_dtype=C_DTYPE)
+        dev_mpi = qml.device("lightning.gpu", wires=numQubits, mpi=True, c_dtype=c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
         comm.Bcast(state_vector, root=0)
 
-        local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE)
+        local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
         comm.Scatter(state_vector, local_state_vector, root=0)
-        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
 
         def circuit():
             qml.StatePrep(state_vector, wires=range(num_wires))
@@ -563,7 +479,7 @@ def circuit():
 
         assert np.allclose(expected_output_mpi, expected_output_cpu, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
     @pytest.mark.parametrize(
         "obs",
         [
@@ -575,12 +491,12 @@ def circuit():
             qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1),
         ],
     )
-    def test_expval_multiple_obs(self, obs, tol, C_DTYPE):
+    def test_expval_multiple_obs(self, obs, tol, c_dtype):
         """Test expval with Hamiltonian"""
         num_wires = numQubits
 
-        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         def circuit():
             qml.RX(0.4, wires=[0])
@@ -592,7 +508,7 @@ def circuit():
 
         assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
     @pytest.mark.parametrize(
         "obs, coeffs",
         [
@@ -620,14 +536,14 @@ def circuit():
             ),
         ],
     )
-    def test_expval_hamiltonian(self, obs, coeffs, tol, C_DTYPE):
+    def test_expval_hamiltonian(self, obs, coeffs, tol, c_dtype):
         """Test expval with Hamiltonian"""
         num_wires = numQubits
 
         ham = qml.Hamiltonian(coeffs, obs)
 
-        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         def circuit():
             qml.RX(0.4, wires=[0])
@@ -665,14 +581,14 @@ def circuit():
 class TestGenerateSample:
     """Tests that samples are properly calculated."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sample_dimensions(self, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sample_dimensions(self, c_dtype):
         """Tests if the samples returned by sample have
         the correct dimensions
         """
         num_wires = numQubits
 
-        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         ops = [qml.RX(1.5708, wires=[0]), qml.RX(1.5708, wires=[1])]
 
@@ -697,14 +613,14 @@ def test_sample_dimensions(self, C_DTYPE):
 
         assert np.array_equal(s3.shape, (shots,))
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sample_values(self, tol, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sample_values(self, tol, c_dtype):
         """Tests if the samples returned by sample have
         the correct values
         """
         num_wires = numQubits
 
-        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+        dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype)
 
         shots = qml.measurements.Shots(1000)
         ops = [qml.RX(1.5708, wires=[0])]
@@ -716,17 +632,17 @@ def test_sample_values(self, tol, C_DTYPE):
         # they square to 1
         assert np.allclose(s1**2, 1, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_sample_values_qnode(self, tol, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_sample_values_qnode(self, tol, c_dtype):
         """Tests if the samples returned by sample have
         the correct values
         """
         num_wires = numQubits
 
         dev_mpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state()
 
         @qml.qnode(dev_mpi)
         def circuit():
@@ -737,15 +653,15 @@ def circuit():
         # they square to 1
         assert np.allclose(circuit() ** 2, 1, atol=tol, rtol=0)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_multi_samples_return_correlated_results(self, C_DTYPE):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_multi_samples_return_correlated_results(self, c_dtype):
         """Tests if the samples returned by the sample function have
         the correct dimensions
         """
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         @qml.qnode(dev_gpumpi)
@@ -758,13 +674,13 @@ def circuit():
 
         assert np.array_equal(outcomes[0], outcomes[1])
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_paulix_pauliy(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_paulix_pauliy(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliX and PauliY works correctly"""
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
@@ -800,13 +716,13 @@ def circuit():
         ) / 16
         assert np.allclose(var, expected, atol=tol)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_pauliz_hadamard(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_pauliz_hadamard(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliZ and PauliY and hadamard works correctly"""
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
@@ -846,13 +762,13 @@ def circuit():
 class TestTensorVar:
     """Test tensor variance measurements."""
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_paulix_pauliy(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_paulix_pauliy(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliX and PauliY works correctly"""
         num_wires = 3
 
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
@@ -880,12 +796,12 @@ def circuit():
         ) / 16
         assert np.allclose(res, expected, atol=tol)
 
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64])
-    def test_pauliz_hadamard(self, C_DTYPE, tol=TOL_STOCHASTIC):
+    @pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+    def test_pauliz_hadamard(self, c_dtype, tol=TOL_STOCHASTIC):
         """Test that a tensor product involving PauliZ and PauliY and hadamard works correctly"""
         num_wires = 3
         dev_gpumpi = qml.device(
-            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
+            "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=c_dtype
         )
 
         theta = 0.432
diff --git a/mpitests/test_device.py b/mpitests/test_device.py
index 03a188011..dd783dbee 100644
--- a/mpitests/test_device.py
+++ b/mpitests/test_device.py
@@ -38,13 +38,13 @@ def test_create_device():
 
 
 def test_unsupported_mpi_buf_size():
-    with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"):
+    with pytest.raises(ValueError, match="Unsupported mpi_buf_size value"):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=-1)
-    with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"):
+    with pytest.raises(ValueError, match="Unsupported mpi_buf_size value"):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=3)
-    with pytest.warns(
-        RuntimeWarning,
-        match="The MPI buffer size is larger than the local state vector size",
+    with pytest.raises(
+        RuntimeError,
+        match="The MPI buffer size is larger than the local state vector size.",
     ):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=2**4)
     with pytest.raises(
diff --git a/mpitests/test_expval.py b/mpitests/test_expval.py
index d020471c0..3ca73cd82 100644
--- a/mpitests/test_expval.py
+++ b/mpitests/test_expval.py
@@ -22,114 +22,260 @@
 from conftest import PHI, THETA, VARPHI, device_name
 from mpi4py import MPI
 
+numQubits = 8
 
-@pytest.mark.parametrize("theta, phi", list(zip(THETA, PHI)))
-class TestExpval:
-    """Test expectation values"""
 
-    def test_identity_expectation(self, theta, phi, tol):
-        """Test that identity expectation value (i.e. the trace) is 1"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+def create_random_init_state(numWires, c_dtype, seed_value=48):
+    """Returns a random initial state of a certain type."""
+    np.random.seed(seed_value)
 
-        O1 = qml.Identity(wires=[0])
-        O2 = qml.Identity(wires=[1])
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-        )
+    num_elements = 2**numWires
+    init_state = np.random.rand(num_elements).astype(r_dtype) + 1j * np.random.rand(
+        num_elements
+    ).astype(r_dtype)
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        assert np.allclose(res, np.array([1, 1]), tol)
+    init_state = init_state / np.linalg.norm(init_state)
+    return init_state
 
-    def test_pauliz_expectation(self, theta, phi, tol):
-        """Test that PauliZ expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
 
-        O1 = qml.PauliZ(wires=[0])
-        O2 = qml.PauliZ(wires=[1])
+def apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires):
+    """Wrapper applying a parametric gate with QNode function."""
+    num_wires = numQubits
+    comm = MPI.COMM_WORLD
+    commSize = comm.Get_size()
+    num_global_wires = commSize.bit_length() - 1
+    num_local_wires = num_wires - num_global_wires
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-        )
+    c_dtype = dev_mpi.c_dtype
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        assert np.allclose(res, np.array([np.cos(theta), np.cos(theta) * np.cos(phi)]), tol)
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
 
-    def test_paulix_expectation(self, theta, phi, tol):
-        """Test that PauliX expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
+    comm.Bcast(state_vector, root=0)
 
-        O1 = qml.PauliX(wires=[0])
-        O2 = qml.PauliX(wires=[1])
+    comm.Scatter(state_vector, local_state_vector, root=0)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
 
-        dev.apply(
-            [qml.RY(theta, wires=[0]), qml.RY(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-        )
+    def circuit(*params):
+        qml.StatePrep(state_vector, wires=range(num_wires))
+        operation(*params, wires=Wires)
+        return qml.state()
+
+    cpu_qnode = qml.QNode(circuit, dev_cpu)
+    expected_output_cpu = cpu_qnode(*par).astype(c_dtype)
+    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
+
+    mpi_qnode = qml.QNode(circuit, dev_mpi)
+    local_state_vector = mpi_qnode(*par)
+
+    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
+
+
+def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
+    """Wrapper applying a non-parametric gate with QNode function."""
+    num_wires = numQubits
+    comm = MPI.COMM_WORLD
+    commSize = comm.Get_size()
+    num_global_wires = commSize.bit_length() - 1
+    num_local_wires = num_wires - num_global_wires
+
+    c_dtype = dev_mpi.c_dtype
+
+    expected_output_cpu = np.zeros(2**num_wires).astype(c_dtype)
+    local_state_vector = np.zeros(2**num_local_wires).astype(c_dtype)
+    local_expected_output_cpu = np.zeros(2**num_local_wires).astype(c_dtype)
+
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
+    comm.Bcast(state_vector, root=0)
 
-        res = np.array([dev.expval(O1), dev.expval(O2)], dtype=dev.C_DTYPE)
-        assert np.allclose(
-            res,
-            np.array([np.sin(theta) * np.sin(phi), np.sin(phi)], dtype=dev.C_DTYPE),
-            tol * 10,
+    comm.Scatter(state_vector, local_state_vector, root=0)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+
+    def circuit():
+        qml.StatePrep(state_vector, wires=range(num_wires))
+        operation(wires=Wires)
+        return qml.state()
+
+    cpu_qnode = qml.QNode(circuit, dev_cpu)
+    expected_output_cpu = cpu_qnode().astype(c_dtype)
+    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
+
+    mpi_qnode = qml.QNode(circuit, dev_mpi)
+    local_state_vector = mpi_qnode()
+
+    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
+
+
+@pytest.mark.parametrize("c_dtype", [np.complex128, np.complex64])
+@pytest.mark.parametrize("batch_obs", [True, False])
+class TestExpval:
+    """Tests that expectation values are properly calculated or that the proper errors are raised."""
+
+    @pytest.mark.parametrize(
+        "operation",
+        [
+            qml.PauliX,
+            qml.PauliY,
+            qml.PauliZ,
+            qml.Hadamard,
+            qml.Identity,
+        ],
+    )
+    @pytest.mark.parametrize("wires", [0, 1, 2, numQubits - 2, numQubits - 1])
+    def test_expval_single_wire_no_parameters(self, tol, operation, wires, c_dtype, batch_obs):
+        """Tests that expectation values are properly calculated for single-wire observables without parameters."""
+        num_wires = numQubits
+        comm = MPI.COMM_WORLD
+
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=numQubits, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
         )
 
-    def test_pauliy_expectation(self, theta, phi, tol):
-        """Test that PauliY expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
 
-        O1 = qml.PauliY(wires=[0])
-        O2 = qml.PauliY(wires=[1])
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
+        comm.Bcast(state_vector, root=0)
 
-        dev.apply(
-            [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
+        def circuit():
+            qml.StatePrep(state_vector, wires=range(num_wires))
+            return qml.expval(operation(wires))
+
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        expected_output_cpu = cpu_qnode()
+        comm.Bcast(np.array(expected_output_cpu), root=0)
+
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+        expected_output_mpi = mpi_qnode()
+
+        assert np.allclose(expected_output_mpi, expected_output_cpu, atol=tol, rtol=0)
+
+    @pytest.mark.parametrize(
+        "obs",
+        [
+            qml.PauliX(0) @ qml.PauliZ(1),
+            qml.PauliX(0) @ qml.PauliZ(numQubits - 1),
+            qml.PauliX(numQubits - 2) @ qml.PauliZ(numQubits - 1),
+            qml.PauliZ(0) @ qml.PauliZ(1),
+            qml.PauliZ(0) @ qml.PauliZ(numQubits - 1),
+            qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1),
+        ],
+    )
+    def test_expval_multiple_obs(self, obs, tol, c_dtype, batch_obs):
+        """Test expval with tensor-product observables"""
+        num_wires = numQubits
+
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
         )
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        assert np.allclose(res, np.array([0, -np.cos(theta) * np.sin(phi)]), tol)
+        def circuit():
+            qml.RX(0.4, wires=[0])
+            qml.RY(-0.2, wires=[num_wires - 1])
+            return qml.expval(obs)
 
-    def test_hadamard_expectation(self, theta, phi, tol):
-        """Test that Hadamard expectation value is correct"""
-        dev = qml.device(device_name, mpi=True, wires=3)
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+
+        assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
+
+    @pytest.mark.parametrize(
+        "obs, coeffs",
+        [
+            ([qml.PauliX(0) @ qml.PauliZ(1)], [0.314]),
+            ([qml.PauliX(0) @ qml.PauliZ(numQubits - 1)], [0.314]),
+            ([qml.PauliZ(0) @ qml.PauliZ(1)], [0.314]),
+            ([qml.PauliZ(0) @ qml.PauliZ(numQubits - 1)], [0.314]),
+            (
+                [qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)],
+                [0.314, 0.2],
+            ),
+            (
+                [
+                    qml.PauliX(0) @ qml.PauliZ(numQubits - 1),
+                    qml.PauliZ(0) @ qml.PauliZ(1),
+                ],
+                [0.314, 0.2],
+            ),
+            (
+                [
+                    qml.PauliX(numQubits - 2) @ qml.PauliZ(numQubits - 1),
+                    qml.PauliZ(0) @ qml.PauliZ(1),
+                ],
+                [0.314, 0.2],
+            ),
+        ],
+    )
+    def test_expval_hamiltonian(self, obs, coeffs, tol, c_dtype, batch_obs):
+        """Test expval with Hamiltonian"""
+        num_wires = numQubits
+
+        ham = qml.Hamiltonian(coeffs, obs)
+
+        dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=num_wires, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
+        )
+
+        def circuit():
+            qml.RX(0.4, wires=[0])
+            qml.RY(-0.2, wires=[numQubits - 1])
+            return qml.expval(ham)
 
-        O1 = qml.Hadamard(wires=[0])
-        O2 = qml.Hadamard(wires=[1])
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
 
-        dev.apply(
-            [qml.RY(theta, wires=[0]), qml.RY(phi, wires=[1]), qml.CNOT(wires=[0, 1])],
-            rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
+        assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
+
+    def test_expval_non_pauli_word_hamiltionian(self, tol, c_dtype, batch_obs):
+        """Tests expectation values of non-Pauli word Hamiltonians."""
+        dev_mpi = qml.device(
+            "lightning.gpu", wires=3, mpi=True, c_dtype=c_dtype, batch_obs=batch_obs
         )
+        dev_cpu = qml.device("lightning.qubit", wires=3)
+
+        theta = 0.432
+        phi = 0.123
+        varphi = -0.543
+
+        def circuit():
+            qml.RX(theta, wires=[0])
+            qml.RX(phi, wires=[1])
+            qml.RX(varphi, wires=[2])
+            qml.CNOT(wires=[0, 1])
+            qml.CNOT(wires=[1, 2])
+            return qml.expval(0.5 * qml.Hadamard(2))
+
+        cpu_qnode = qml.QNode(circuit, dev_cpu)
+        mpi_qnode = qml.QNode(circuit, dev_mpi)
+
+        assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0)
 
-        res = np.array([dev.expval(O1), dev.expval(O2)])
-        expected = np.array(
-            [
-                np.sin(theta) * np.sin(phi) + np.cos(theta),
-                np.cos(theta) * np.cos(phi) + np.sin(phi),
-            ]
-        ) / np.sqrt(2)
-        assert np.allclose(res, expected, tol)
-
-    @pytest.mark.parametrize("n_wires", range(1, 8))
-    def test_hermitian_expectation(self, n_wires, theta, phi, tol):
+    @pytest.mark.parametrize("theta, phi", list(zip(THETA, PHI)))
+    @pytest.mark.parametrize("n_wires", range(1, numQubits))
+    def test_hermitian_expectation(self, n_wires, theta, phi, tol, c_dtype, batch_obs):
         """Test that Hadamard expectation value is correct"""
-        n_qubits = 7
+        n_qubits = numQubits - 1
         dev_def = qml.device("default.qubit", wires=n_qubits)
-        dev = qml.device(device_name, mpi=True, wires=n_qubits)
+        dev = qml.device(
+            device_name, mpi=True, wires=n_qubits, c_dtype=c_dtype, batch_obs=batch_obs
+        )
         comm = MPI.COMM_WORLD
 
         m = 2**n_wires
         U = np.random.rand(m, m) + 1j * np.random.rand(m, m)
         U = U + np.conj(U.T)
-        U = U.astype(dev.C_DTYPE)
+        U = U.astype(dev.c_dtype)
         comm.Bcast(U, root=0)
         obs = qml.Hermitian(U, wires=range(n_wires))
 
         init_state = np.random.rand(2**n_qubits) + 1j * np.random.rand(2**n_qubits)
-        init_state /= np.sqrt(np.dot(np.conj(init_state), init_state))
-        init_state = init_state.astype(dev.C_DTYPE)
+        init_state = init_state / np.linalg.norm(init_state)
+        init_state = init_state.astype(dev.c_dtype)
         comm.Bcast(init_state, root=0)
 
         def circuit():
@@ -250,69 +396,39 @@ def circuit(x, y):
 class TestTensorExpval:
     """Test tensor expectation values"""
 
-    def test_paulix_pauliy(self, theta, phi, varphi, tol):
+    @pytest.mark.parametrize(
+        "obs,expected",
+        [
+            (qml.PauliX(0) @ qml.PauliY(2), "PXPY"),
+            (qml.PauliZ(0) @ qml.Identity(1) @ qml.PauliZ(2), "PZIPZ"),
+            (qml.PauliZ(0) @ qml.Hadamard(1) @ qml.PauliY(2), "PZHPY"),
+        ],
+    )
+    def test_tensor(self, theta, phi, varphi, obs, expected, tol):
         """Test that a tensor product involving PauliX and PauliY works
         correctly"""
         dev = qml.device(device_name, mpi=True, wires=3)
-        obs = qml.PauliX(0) @ qml.PauliY(2)
-
-        dev.apply(
-            [
-                qml.RX(theta, wires=[0]),
-                qml.RX(phi, wires=[1]),
-                qml.RX(varphi, wires=[2]),
-                qml.CNOT(wires=[0, 1]),
-                qml.CNOT(wires=[1, 2]),
-            ],
-            rotations=obs.diagonalizing_gates(),
-        )
-        res = dev.expval(obs)
-
-        expected = np.sin(theta) * np.sin(phi) * np.sin(varphi)
-
-        assert np.allclose(res, expected, atol=tol)
-
-    def test_pauliz_identity(self, theta, phi, varphi, tol):
-        """Test that a tensor product involving PauliZ and Identity works
-        correctly"""
-        dev = qml.device(device_name, mpi=True, wires=3)
-        obs = qml.PauliZ(0) @ qml.Identity(1) @ qml.PauliZ(2)
-
-        dev.apply(
-            [
-                qml.RX(theta, wires=[0]),
-                qml.RX(phi, wires=[1]),
-                qml.RX(varphi, wires=[2]),
-                qml.CNOT(wires=[0, 1]),
-                qml.CNOT(wires=[1, 2]),
-            ],
-            rotations=obs.diagonalizing_gates(),
-        )
-
-        res = dev.expval(obs)
-
-        expected = np.cos(varphi) * np.cos(phi)
 
-        assert np.allclose(res, expected, tol)
-
-    def test_pauliz_hadamard_pauliy(self, theta, phi, varphi, tol):
-        """Test that a tensor product involving PauliZ and PauliY and Hadamard
-        works correctly"""
-        dev = qml.device(device_name, mpi=True, wires=3)
-        obs = qml.PauliZ(0) @ qml.Hadamard(1) @ qml.PauliY(2)
-
-        dev.apply(
-            [
-                qml.RX(theta, wires=[0]),
-                qml.RX(phi, wires=[1]),
-                qml.RX(varphi, wires=[2]),
-                qml.CNOT(wires=[0, 1]),
-                qml.CNOT(wires=[1, 2]),
-            ],
-            rotations=obs.diagonalizing_gates(),
-        )
+        def circuit():
+            qml.RX(theta, wires=[0])
+            qml.RX(phi, wires=[1])
+            qml.RX(varphi, wires=[2])
+            qml.CNOT(wires=[0, 1])
+            qml.CNOT(wires=[1, 2])
+            return qml.expval(obs)
 
-        res = dev.expval(obs)
-        expected = -(np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)) / np.sqrt(2)
+        mpi_qnode = qml.QNode(circuit, dev)
+        res = mpi_qnode()
+
+        if expected == "PXPY":
+            expected_val = np.sin(theta) * np.sin(phi) * np.sin(varphi)
+        elif expected == "PZIPZ":
+            expected_val = np.cos(varphi) * np.cos(phi)
+        elif expected == "PZHPY":
+            expected_val = -(
+                np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)
+            ) / np.sqrt(2)
+        else:
+            expected_val = 0
 
-        assert np.allclose(res, expected, tol)
+        assert np.allclose(res, expected_val, atol=tol)
diff --git a/mpitests/test_probs.py b/mpitests/test_probs.py
index b2f57f733..ed9ab9b9c 100644
--- a/mpitests/test_probs.py
+++ b/mpitests/test_probs.py
@@ -23,27 +23,31 @@
 numQubits = 8
 
 
-def create_random_init_state(numWires, R_DTYPE, seed_value=48):
+def create_random_init_state(numWires, c_dtype, seed_value=48):
+    """Returns a random initial state of a certain type."""
     np.random.seed(seed_value)
-    num_elements = 1 << numWires
-    init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand(
+
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
+    num_elements = 2**numWires
+    init_state = np.random.rand(num_elements).astype(r_dtype) + 1j * np.random.rand(
         num_elements
-    ).astype(R_DTYPE)
-    scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(R_DTYPE)
-    init_state = init_state / scale_sum
+    ).astype(r_dtype)
+
+    init_state = init_state / np.linalg.norm(init_state)
     return init_state
 
 
-def apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE):
+def apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype):
     num_wires = numQubits
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     commSize = comm.Get_size()
 
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     def circuit():
@@ -58,15 +62,16 @@ def circuit():
     local_probs = mpi_qnode()
 
     recv_counts = comm.gather(len(local_probs), root=0)
-
     comm.Barrier()
 
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
     if rank == 0:
-        probs_mpi = np.zeros(1 << len(Wires)).astype(dev_mpi.R_DTYPE)
-        displacements = [i for i in range(commSize)]
+        probs_mpi = np.zeros(2 ** len(Wires)).astype(r_dtype)
     else:
         probs_mpi = None
         probs_cpu = None
+
     comm.Barrier()
     comm.Gatherv(local_probs, [probs_mpi, recv_counts], root=0)
 
@@ -75,16 +80,16 @@ def circuit():
     comm.Barrier()
 
 
-def apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE):
+def apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype):
     num_wires = numQubits
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     commSize = comm.Get_size()
 
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE)
-    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=C_DTYPE)
+    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
+    dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     def circuit():
@@ -102,8 +107,10 @@ def circuit():
 
     comm.Barrier()
 
+    r_dtype = np.float64 if c_dtype == np.complex128 else np.float32
+
     if rank == 0:
-        probs_mpi = np.zeros(1 << len(Wires)).astype(dev_mpi.R_DTYPE)
+        probs_mpi = np.zeros(2 ** len(Wires)).astype(r_dtype)
     else:
         probs_mpi = None
         probs_cpu = None
@@ -116,6 +123,19 @@ def circuit():
     comm.Barrier()
 
 
+@pytest.mark.parametrize(
+    "Wires",
+    [
+        [0],
+        [1],
+        [0, 1],
+        [0, 2],
+        [0, numQubits - 1],
+        [numQubits - 2, numQubits - 1],
+        range(numQubits),
+    ],
+)
+@pytest.mark.parametrize("c_dtype", [np.complex128])
 class TestProbs:
     """Tests for the probability method."""
 
@@ -123,41 +143,15 @@ class TestProbs:
         "operation", [qml.PauliX, qml.PauliY, qml.PauliZ, qml.Hadamard, qml.S, qml.T]
     )
     @pytest.mark.parametrize("GateWires", [[0], [numQubits - 1]])
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_single_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE):
-        apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE)
+    def test_prob_single_wire_nonparam(self, tol, operation, GateWires, Wires, c_dtype):
+        apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ])
     @pytest.mark.parametrize(
         "GateWires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]]
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE):
-        apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE)
+    def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, c_dtype):
+        apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -169,80 +163,28 @@ def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE)
             [0, numQubits - 2, numQubits - 1],
         ],
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_three_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE):
-        apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE)
+    def test_prob_three_wire_nonparam(self, tol, operation, GateWires, Wires, c_dtype):
+        apply_probs_nonparam(tol, operation, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]])
     @pytest.mark.parametrize("GateWires", [0, numQubits - 1])
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_single_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_single_wire_param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.Rot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
     @pytest.mark.parametrize("GateWires", [0, numQubits - 1])
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_single_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_single_wire_3param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize("operation", [qml.CRot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
     @pytest.mark.parametrize(
         "GateWires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]]
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize(
         "operation",
@@ -263,21 +205,8 @@ def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTY
     @pytest.mark.parametrize(
         "GateWires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]]
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
 
     @pytest.mark.parametrize(
         "operation",
@@ -292,18 +221,5 @@ def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYP
             [numQubits - 4, numQubits - 3, numQubits - 2, numQubits - 1],
         ],
     )
-    @pytest.mark.parametrize(
-        "Wires",
-        [
-            [0],
-            [1],
-            [0, 1],
-            [0, 2],
-            [0, numQubits - 1],
-            [numQubits - 2, numQubits - 1],
-            range(numQubits),
-        ],
-    )
-    @pytest.mark.parametrize("C_DTYPE", [np.complex128])
-    def test_prob_four_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE):
-        apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE)
+    def test_prob_four_wire_param(self, tol, operation, par, GateWires, Wires, c_dtype):
+        apply_probs_param(tol, operation, par, GateWires, Wires, c_dtype)
diff --git a/pennylane_lightning/core/_adjoint_jacobian_base.py b/pennylane_lightning/core/_adjoint_jacobian_base.py
index 50046d5f9..a779c0cc4 100644
--- a/pennylane_lightning/core/_adjoint_jacobian_base.py
+++ b/pennylane_lightning/core/_adjoint_jacobian_base.py
@@ -111,6 +111,7 @@ def _process_jacobian_tape(self, tape: QuantumTape, split_obs: bool = False):
             self._qubit_state.device_name, use_csingle, use_mpi, split_obs
         ).serialize_ops(tape)
 
+        # pylint: disable=not-callable
         ops_serialized = self._create_ops_list_lightning(*ops_serialized)
 
         # We need to filter out indices in trainable_params which do not
diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 06ae87889..badbe7ec4 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -56,6 +56,7 @@ def __init__(
     ) -> None:
         self._qubit_state = qubit_state
 
+        self._use_mpi = False
         # Dummy for the C++ bindings
         self._measurement_lightning = None
 
@@ -94,7 +95,6 @@ def state_diagonalizing_gates(self, measurementprocess: StateMeasurement) -> Ten
         self._qubit_state.apply_operations([qml.adjoint(g) for g in reversed(diagonalizing_gates)])
         return result
 
-    # pylint: disable=protected-access
     def expval(self, measurementprocess: MeasurementProcess):
         """Expectation value of the supplied observable contained in the MeasurementProcess.
 
@@ -121,8 +121,9 @@ def expval(self, measurementprocess: MeasurementProcess):
             or (measurementprocess.obs.arithmetic_depth > 0)
             or isinstance(measurementprocess.obs.name, List)
         ):
+            # pylint: disable=protected-access
             ob_serialized = QuantumScriptSerializer(
-                self._qubit_state.device_name, self.dtype == np.complex64
+                self._qubit_state.device_name, self.dtype == np.complex64, self._use_mpi
             )._ob(measurementprocess.obs)
             return self._measurement_lightning.expval(ob_serialized)
 
@@ -130,24 +131,37 @@ def expval(self, measurementprocess: MeasurementProcess):
             measurementprocess.obs.name, measurementprocess.obs.wires
         )
 
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the data structure from the C++ backend to a common structure through lightning devices.
+        Args:
+            probs_results (Any): Result provided by C++ backend.
+        Returns:
+            np.ndarray with probabilities of the supplied observable or wires.
+        """
+        return probs_results
+
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
         Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
+            measurementprocess (StateMeasurement): measurement to apply to the state.
 
         Returns:
-            Probabilities of the supplied observable or wires
+            Probabilities of the supplied observable or wires.
         """
         diagonalizing_gates = measurementprocess.diagonalizing_gates()
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
+
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
-        return results
+
+        return self._probs_retval_conversion(results)
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
@@ -175,8 +189,9 @@ def var(self, measurementprocess: MeasurementProcess):
             or (measurementprocess.obs.arithmetic_depth > 0)
             or isinstance(measurementprocess.obs.name, List)
         ):
+            # pylint: disable=protected-access
             ob_serialized = QuantumScriptSerializer(
-                self._qubit_state.device_name, self.dtype == np.complex64
+                self._qubit_state.device_name, self.dtype == np.complex64, self._use_mpi
             )._ob(measurementprocess.obs)
             return self._measurement_lightning.var(ob_serialized)
 
@@ -187,6 +202,7 @@ def var(self, measurementprocess: MeasurementProcess):
     def get_measurement_function(
         self, measurementprocess: MeasurementProcess
     ) -> Callable[[MeasurementProcess, TensorLike], TensorLike]:
+        # pylint: disable=too-many-return-statements
         """Get the appropriate method for performing a measurement.
 
         Args:
@@ -197,16 +213,24 @@ def get_measurement_function(
         """
         if isinstance(measurementprocess, StateMeasurement):
             if isinstance(measurementprocess, ExpectationMP):
-                if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
-                    return self.state_diagonalizing_gates
+                if self._use_mpi:
+                    if isinstance(measurementprocess.obs, (qml.Projector)):
+                        return self.state_diagonalizing_gates
+                else:
+                    if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
+                        return self.state_diagonalizing_gates
                 return self.expval
 
             if isinstance(measurementprocess, ProbabilityMP):
                 return self.probs
 
             if isinstance(measurementprocess, VarianceMP):
-                if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
-                    return self.state_diagonalizing_gates
+                if self._use_mpi:
+                    if isinstance(measurementprocess.obs, (qml.Projector)):
+                        return self.state_diagonalizing_gates
+                else:
+                    if isinstance(measurementprocess.obs, (qml.Identity, qml.Projector)):
+                        return self.state_diagonalizing_gates
                 return self.var
             if measurementprocess.obs is None or measurementprocess.obs.has_diagonalizing_gates:
                 return self.state_diagonalizing_gates
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index 3e08a5ab4..8815e13a0 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -16,7 +16,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
 from pennylane import BasisState, StatePrep
@@ -101,7 +101,7 @@ def reset_state(self):
         self._qubit_state.resetStateVector()
 
     @abstractmethod
-    def _apply_state_vector(self, state, device_wires: Wires):
+    def _apply_state_vector(self, state, device_wires: Wires, sync: Optional[bool] = None):
         """Initialize the internal state vector in a specified state.
         Args:
             state (array[complex]): normalized input state of length ``2**len(wires)``
@@ -117,6 +117,7 @@ def _apply_basis_state(self, state, wires):
                 consisting of 0s and 1s.
             wires (Wires): wires that the provided computational state should be
                 initialized on
+            use_async(Optional[bool]): immediately sync with host-sv after applying operation.
 
         Note: This function does not support broadcasted inputs yet.
         """
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index cb8540718..9b3cc2f16 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev35"
+__version__ = "0.39.0-dev36"
diff --git a/pennylane_lightning/core/lightning_newAPI_base.py b/pennylane_lightning/core/lightning_newAPI_base.py
index dcee73fd5..12cdf98b4 100644
--- a/pennylane_lightning/core/lightning_newAPI_base.py
+++ b/pennylane_lightning/core/lightning_newAPI_base.py
@@ -90,6 +90,49 @@ def c_dtype(self):
     def _set_lightning_classes(self):
         """Load the LightningStateVector, LightningMeasurements, LightningAdjointJacobian as class attribute"""
 
+    @abstractmethod
+    def _setup_execution_config(self, config):
+        """
+        Update the execution config with choices for how the device should be used and the device options.
+        """
+
+    @abstractmethod
+    def preprocess(self, execution_config: ExecutionConfig = DefaultExecutionConfig):
+        """This function defines the device transform program to be applied and an updated device configuration.
+
+        Args:
+            execution_config (Union[ExecutionConfig, Sequence[ExecutionConfig]]): A data structure describing the
+                parameters needed to fully describe the execution.
+
+        Returns:
+            TransformProgram, ExecutionConfig: A transform program that when called returns :class:`~.QuantumTape`'s that the
+            device can natively execute as well as a postprocessing function to be called after execution, and a configuration
+            with unset specifications filled in.
+
+        This device:
+
+        * Supports any qubit operations that provide a matrix
+        * Currently does not support finite shots
+        * Currently does not intrinsically support parameter broadcasting
+
+        """
+
+    @abstractmethod
+    def execute(
+        self,
+        circuits: QuantumTape_or_Batch,
+        execution_config: ExecutionConfig = DefaultExecutionConfig,
+    ) -> Result_or_ResultBatch:
+        """Execute a circuit or a batch of circuits and turn it into results.
+
+        Args:
+            circuits (Union[QuantumTape, Sequence[QuantumTape]]): the quantum circuits to be executed
+            execution_config (ExecutionConfig): a datastructure with additional information required for execution
+
+        Returns:
+            TensorLike, tuple[TensorLike], tuple[tuple[TensorLike]]: A numeric result of the computation.
+        """
+
     @abstractmethod
     def simulate(
         self,
@@ -112,6 +155,25 @@ def simulate(
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
 
+    @abstractmethod
+    def supports_derivatives(
+        self,
+        execution_config: Optional[ExecutionConfig] = None,
+        circuit: Optional[qml.tape.QuantumTape] = None,
+    ) -> bool:
+        """Check whether or not derivatives are available for a given configuration and circuit.
+
+        Lightning devices support adjoint differentiation with analytic results.
+
+        Args:
+            execution_config (ExecutionConfig): The configuration of the desired derivative calculation
+            circuit (QuantumTape): An optional circuit to check derivatives support for.
+
+        Returns:
+            Bool: Whether or not a derivative can be calculated provided the given information
+
+        """
+
     def jacobian(
         self,
         circuit: QuantumTape,
@@ -135,6 +197,7 @@ def jacobian(
             [circuit], _ = qml.map_wires(circuit, wire_map)
         state.reset_state()
         final_state = state.get_final_state(circuit)
+        # pylint: disable=not-callable
         return self.LightningAdjointJacobian(final_state, batch_obs=batch_obs).calculate_jacobian(
             circuit
         )
@@ -163,6 +226,7 @@ def simulate_and_jacobian(
         if wire_map is not None:
             [circuit], _ = qml.map_wires(circuit, wire_map)
         res = self.simulate(circuit, state)
+        # pylint: disable=not-callable
         jac = self.LightningAdjointJacobian(state, batch_obs=batch_obs).calculate_jacobian(circuit)
         return res, jac
 
@@ -193,6 +257,7 @@ def vjp(  # pylint: disable=too-many-arguments
             [circuit], _ = qml.map_wires(circuit, wire_map)
         state.reset_state()
         final_state = state.get_final_state(circuit)
+        # pylint: disable=not-callable
         return self.LightningAdjointJacobian(final_state, batch_obs=batch_obs).calculate_vjp(
             circuit, cotangents
         )
@@ -224,6 +289,7 @@ def simulate_and_vjp(  # pylint: disable=too-many-arguments
         if wire_map is not None:
             [circuit], _ = qml.map_wires(circuit, wire_map)
         res = self.simulate(circuit, state)
+        # pylint: disable=not-callable
         _vjp = self.LightningAdjointJacobian(state, batch_obs=batch_obs).calculate_vjp(
             circuit, cotangents
         )
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 3753f792f..a98b51df6 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -260,13 +260,11 @@ class StateVectorCudaMPI final
      */
     void setBasisState(const std::complex<Precision> &value,
                        const std::size_t index, const bool async = false) {
-        std::size_t rankId = index >> BaseType::getNumQubits();
+        const std::size_t rankId = index >> this->getNumLocalQubits();
+
+        const std::size_t local_index =
+            compute_local_index(index, this->getNumLocalQubits());
 
-        std::size_t local_index =
-            static_cast<std::size_t>(
-                rankId * std::pow(2.0, static_cast<long double>(
-                                           BaseType::getNumQubits()))) ^
-            index;
         BaseType::getDataBuffer().zeroInit();
 
         CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
@@ -280,6 +278,45 @@ class StateVectorCudaMPI final
         mpi_manager_.Barrier();
     }
 
+    /**
+     * @brief Prepare a single computational basis state.
+     *
+     * @param state Bit values (0 or 1), one entry per element of `wires`.
+     * @param wires Global wire indices the entries of `state` apply to.
+     * @param use_async Use an asynchronous memory copy.
+     */
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async) {
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
+        const auto num_qubits = this->getTotalNumQubits();
+
+        // One bit per supplied wire; wires may be a subset of all qubits.
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < wires.size(); k++) {
+            PL_ABORT_IF_NOT(
+                wires[k] < num_qubits,
+                "wires must take values lower than the number of qubits.");
+            index |= state[k] << (num_qubits - 1 - wires[k]);
+        }
+
+        const std::size_t rankId = index >> this->getNumLocalQubits();
+        const std::size_t local_index =
+            compute_local_index(index, this->getNumLocalQubits());
+
+        const std::complex<PrecisionT> value(1.0, 0.0);
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+        BaseType::getDataBuffer().zeroInit();
+        auto stream_id = localStream_.get();
+
+        if (mpi_manager_.getRank() == rankId) {
+            setBasisState_CUDA(BaseType::getData(), value_cu, local_index,
+                               use_async, stream_id);
+        }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
+    }
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
@@ -307,11 +344,9 @@ class StateVectorCudaMPI final
                 static_cast<std::size_t>(index) >> BaseType::getNumQubits();
 
             if (rankId == mpi_manager_.getRank()) {
-                int local_index =
-                    static_cast<std::size_t>(
-                        rankId * std::pow(2.0, static_cast<long double>(
-                                                   BaseType::getNumQubits()))) ^
-                    index;
+                int local_index = static_cast<int>(
+                    compute_local_index(static_cast<std::size_t>(index),
+                                        this->getNumLocalQubits()));
                 indices_local.push_back(local_index);
                 values_local.push_back(values[i]);
             }
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 716d95c89..b33c16471 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -184,6 +184,42 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Prepare a single computational basis state.
+     *
+     * @param state Binary number representing the index
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy; the flag is
+     * forwarded to setBasisState_CUDA.
+
+     */
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async = false) {
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
+        const auto num_qubits = BaseType::getNumQubits();
+        PL_ABORT_IF_NOT(
+            std::find_if(wires.begin(), wires.end(),
+                         [&num_qubits](const auto i) {
+                             return i >= num_qubits;
+                         }) == wires.end(),
+            "wires must take values lower than the number of qubits.");
+        const auto n_wires = wires.size();
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < n_wires; k++) {
+            const auto bit = static_cast<std::size_t>(state[k]);
+            index |= bit << (num_qubits - 1 - wires[k]);
+        }
+
+        BaseType::getDataBuffer().zeroInit();
+        const std::complex<PrecisionT> value(1.0, 0.0);
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+        setBasisState_CUDA(BaseType::getData(), value_cu, index, use_async,
+                           stream_id);
+    }
+
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 5bd92b552..3874d3451 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -83,12 +83,12 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
         }))
         .def(
             "setBasisState",
-            [](StateVectorT &sv, const std::size_t index,
-               const bool use_async) {
-                const std::complex<PrecisionT> value(1, 0);
-                sv.setBasisState(value, index, use_async);
-            },
-            "Create Basis State on GPU.")
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires,
+               const bool async) { sv.setBasisState(state, wires, async); },
+            // `async` is reserved in Python; None can't convert to a vector.
+            py::arg("state"), py::arg("wires"), py::arg("use_async") = false,
+            "Set the state vector to a basis state on GPU.")
         .def(
             "setStateVector",
             [](StateVectorT &sv, const np_arr_sparse_ind &indices,
@@ -152,7 +152,11 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def(
+            "resetStateVector",
+            [](StateVectorT &gpu_sv, bool async) { gpu_sv.initSV(async); },
+            py::arg("use_async") = false, // `async` is a Python keyword
+            "Initialize the statevector data to the |0...0> state")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 620fd9386..83a47ed34 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -86,6 +86,15 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             })) // qubits, device
         .def(
             "setBasisState",
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires, const bool use_async) {
+                sv.setBasisState(state, wires, use_async);
+            },
+            // `async` is reserved in Python; None can't convert to a vector.
+            py::arg("state"), py::arg("wires"), py::arg("use_async") = false,
+            "Set the state vector to a basis state on GPU.")
+        .def(
+            "setBasisStateIndex",
             [](StateVectorT &sv, const std::size_t index,
                const bool use_async) {
                 const std::complex<PrecisionT> value(1, 0);
@@ -155,7 +164,11 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def(
+            "resetStateVector",
+            [](StateVectorT &gpu_sv, bool async) { gpu_sv.initSV(async); },
+            py::arg("use_async") = false, // `async` is a Python keyword
+            "Initialize the statevector data to the |0...0> state")
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
index 460a4fa8c..fe19b5d02 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
@@ -273,7 +273,7 @@ class Measurements final
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
             this->_statevector.getCusvHandle(), sampler, bitStrings.data(),
             bitOrdering.data(), bitStringLen, rand_nums.data(), num_samples,
-            CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
+            CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
index 6dd5a0159..4b5a2dd34 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
@@ -36,6 +36,7 @@
 namespace {
 using namespace Pennylane::LightningGPU;
 using namespace Pennylane::LightningGPU::MPI;
+using namespace Pennylane::LightningGPU::Util;
 using namespace Pennylane::Util;
 
 using Pennylane::Util::isApproxEqual;
@@ -52,6 +53,23 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Constructibility",
     }
 }
 
+TEMPLATE_TEST_CASE("cuStateVec_helper::compute_local_index",
+                   "[Default Constructibility]", StateVectorCudaMPI<>) {
+    const std::size_t local_num_qubits = 4;
+
+    SECTION("compute_local_index, index inside the current qubits set") {
+        const std::size_t index = 2; // 0b00010
+        std::size_t local_index = compute_local_index(index, local_num_qubits);
+        REQUIRE(local_index == index);
+    }
+
+    SECTION("compute_local_index, index outside the current qubits set") {
+        const std::size_t index = 16; // 0b10000
+        std::size_t local_index = compute_local_index(index, local_num_qubits);
+        REQUIRE(local_index == 0);
+    }
+}
+
 TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::Constructibility",
                            "[General Constructibility]", (StateVectorCudaMPI),
                            (float, double)) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
index 8bd27c2dc..ffdefe3e2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
@@ -101,4 +101,22 @@ inline SharedCusvHandle make_shared_cusv_handle() {
     PL_CUSTATEVEC_IS_SUCCESS(custatevecCreate(&h));
     return {h, handleDeleter()};
 }
+
+/**
+ * @brief Compute the local index from a given index in multi-gpu workflow
+ *
+ * @param index Global index of the target element.
+ * @param num_qubits Number of wires within the local devices.
+ *
+ * @return Local index: the lowest `num_qubits` bits of `index`.
+ */
+inline std::size_t compute_local_index(const std::size_t index,
+                                       const std::size_t num_qubits) {
+    // Shifts by >= the bit width are UB; then all bits of `index` are local.
+    constexpr std::size_t n_bits = 8U * sizeof(std::size_t); // 8-bit bytes
+    constexpr std::size_t one{1U};
+    return (num_qubits >= n_bits) ? index
+                                  : (index & ((one << num_qubits) - one));
+}
+
 } // namespace Pennylane::LightningGPU::Util
diff --git a/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
new file mode 100644
index 000000000..50f9acef3
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_adjoint_jacobian.py
@@ -0,0 +1,248 @@
+# Copyright 2018-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""
+Internal methods for adjoint Jacobian differentiation method.
+"""
+
+from __future__ import annotations
+
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import DevPool
+    from pennylane_lightning.lightning_gpu_ops.algorithms import (
+        AdjointJacobianC64,
+        AdjointJacobianC128,
+        create_ops_listC64,
+        create_ops_listC128,
+    )
+
+    try:
+        from pennylane_lightning.lightning_gpu_ops.algorithmsMPI import (
+            AdjointJacobianMPIC64,
+            AdjointJacobianMPIC128,
+            create_ops_listMPIC64,
+            create_ops_listMPIC128,
+        )
+
+        mpi_error = None
+        MPI_SUPPORT = True
+    except ImportError as ex_mpi:
+        mpi_error = ex_mpi
+        MPI_SUPPORT = False
+
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+
+import numpy as np
+from pennylane import BasisState, StatePrep
+from pennylane.operation import Operation
+from pennylane.tape import QuantumTape
+from scipy.sparse import csr_matrix
+
+# pylint: disable=ungrouped-imports
+from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
+from pennylane_lightning.core._serialize import QuantumScriptSerializer
+
+
+class LightningGPUAdjointJacobian(LightningBaseAdjointJacobian):
+    """Check and execute the adjoint Jacobian differentiation method.
+
+    Args:
+        qubit_state(LightningGPUStateVector): State Vector to calculate the adjoint Jacobian with.
+        batch_obs(bool): If serialized tape is to be batched or not.
+            For Lightning GPU, distribute the observations across GPUs in the same node. Defaults to False.
+            For Lightning GPU-MPI, if `batch_obs=False` the computation requires more memory and is faster,
+            while `batch_obs=True` allows a larger number of qubits simulation
+            at the expense of high computational cost. Defaults to False.
+    """
+
+    # pylint: disable=too-few-public-methods
+
+    def __init__(
+        self,
+        qubit_state: LightningGPUStateVector,  # pylint: disable=undefined-variable
+        batch_obs: bool = False,
+    ) -> None:
+
+        super().__init__(qubit_state, batch_obs)
+
+        self._dp = DevPool()
+
+        self._use_mpi = qubit_state._mpi_handler.use_mpi
+
+        if self._use_mpi:
+            self._mpi_handler = qubit_state._mpi_handler
+
+        # Initialize the C++ binds
+        self._jacobian_lightning, self._create_ops_list_lightning = self._adjoint_jacobian_dtype()
+
+        # Warning about performance with MPI and batch observation
+        if self._use_mpi and not self._batch_obs:
+            warn(
+                "Using LightningGPU with `batch_obs=False` and `use_mpi=True` has the limitation of requiring more memory. If you want to allocate larger number of qubits use the option `batch_obs=True`"
+                "For more information Check out the section `Parallel adjoint differentiation support` in our website https://docs.pennylane.ai/projects/lightning/en/stable/lightning_gpu/device.html for more details.",
+                RuntimeWarning,
+            )
+
+    def _adjoint_jacobian_dtype(self):
+        """Binding to Lightning GPU Adjoint Jacobian C++ class.
+
+        Returns: the AdjointJacobian class
+        """
+        if self._use_mpi:
+            if not MPI_SUPPORT:
+                warn(str(mpi_error), UserWarning)
+
+            jacobian_lightning = (
+                AdjointJacobianMPIC64() if self.dtype == np.complex64 else AdjointJacobianMPIC128()
+            )
+            create_ops_list_lightning = (
+                create_ops_listMPIC64 if self.dtype == np.complex64 else create_ops_listMPIC128
+            )
+            return jacobian_lightning, create_ops_list_lightning
+
+        # without MPI
+        jacobian_lightning = (
+            AdjointJacobianC64() if self.dtype == np.complex64 else AdjointJacobianC128()
+        )
+        create_ops_list_lightning = (
+            create_ops_listC64 if self.dtype == np.complex64 else create_ops_listC128
+        )
+        return jacobian_lightning, create_ops_list_lightning
+
+    def _process_jacobian_tape(
+        self, tape: QuantumTape, split_obs: bool = False, use_mpi: bool = False
+    ):
+        """Process a tape, serializing and building a dictionary proper for
+        the adjoint Jacobian calculation in the C++ layer.
+
+        Args:
+            tape (QuantumTape): Operations and measurements that represent instructions for execution on Lightning.
+            split_obs (bool, optional): If splitting the observables in a list. Defaults to False.
+            use_mpi (bool, optional): If distributing computation with MPI. Defaults to False.
+
+        Returns:
+            dictionary: dictionary providing serialized data for Jacobian calculation.
+        """
+        use_csingle = self._qubit_state.dtype == np.complex64
+
+        obs_serialized, obs_indices = QuantumScriptSerializer(
+            self._qubit_state.device_name, use_csingle, use_mpi, split_obs
+        ).serialize_observables(tape)
+
+        ops_serialized, use_sp = QuantumScriptSerializer(
+            self._qubit_state.device_name, use_csingle, use_mpi, split_obs
+        ).serialize_ops(tape)
+
+        ops_serialized = self._create_ops_list_lightning(*ops_serialized)
+
+        # We need to filter out indices in trainable_params which do not
+        # correspond to operators.
+        trainable_params = sorted(tape.trainable_params)
+        if len(trainable_params) == 0:
+            return None
+
+        tp_shift = []
+        record_tp_rows = []
+        all_params = 0
+
+        for op_idx, trainable_param in enumerate(trainable_params):
+            # get op_idx-th operator among differentiable operators
+            operation, _, _ = tape.get_operation(op_idx)
+            if isinstance(operation, Operation) and not isinstance(
+                operation, (BasisState, StatePrep)
+            ):
+                # We now just ignore non-op or state preps
+                tp_shift.append(trainable_param)
+                record_tp_rows.append(all_params)
+            all_params += 1
+
+        if use_sp:
+            # When the first element of the tape is state preparation. Still, I am not sure
+            # whether there must be only one state preparation...
+            tp_shift = [i - 1 for i in tp_shift]
+
+        return {
+            "state_vector": self.state,
+            "obs_serialized": obs_serialized,
+            "ops_serialized": ops_serialized,
+            "tp_shift": tp_shift,
+            "record_tp_rows": record_tp_rows,
+            "all_params": all_params,
+            "obs_indices": obs_indices,
+        }
+
+    def calculate_jacobian(self, tape: QuantumTape):
+        """Computes the Jacobian with the adjoint method.
+
+        .. code-block:: python
+
+            statevector = LightningGPUStateVector(num_wires=num_wires)
+            statevector = statevector.get_final_state(tape)
+            jacobian = LightningGPUAdjointJacobian(statevector).calculate_jacobian(tape)
+
+        Args:
+            tape (QuantumTape): Operations and measurements that represent instructions for execution on Lightning.
+
+        Returns:
+            The Jacobian of a tape.
+        """
+
+        empty_array = self._handle_raises(tape, is_jacobian=True)
+
+        if empty_array:
+            return np.array([], dtype=self.dtype)
+
+        if self._use_mpi:
+            split_obs = False  # with MPI batched means compute Jacobian one observables at a time, no point splitting linear combinations
+        else:
+            split_obs = self._dp.getTotalDevices() if self._batch_obs else False
+
+        processed_data = self._process_jacobian_tape(tape, split_obs, self._use_mpi)
+
+        if not processed_data:  # training_params is empty
+            return np.array([], dtype=self.dtype)
+
+        trainable_params = processed_data["tp_shift"]
+
+        if self._batch_obs:  # Batching of Measurements
+            jac = self._jacobian_lightning.batched(
+                processed_data["state_vector"],
+                processed_data["obs_serialized"],
+                processed_data["ops_serialized"],
+                trainable_params,
+            )
+        else:
+            jac = self._jacobian_lightning(
+                processed_data["state_vector"],
+                processed_data["obs_serialized"],
+                processed_data["ops_serialized"],
+                trainable_params,
+            )
+
+        jac = np.array(jac)
+        has_shape0 = bool(len(jac))
+
+        num_obs = len(np.unique(processed_data["obs_indices"]))
+        rows = processed_data["obs_indices"]
+        cols = np.arange(len(rows), dtype=int)
+        data = np.ones(len(rows))
+        red_mat = csr_matrix((data, (rows, cols)), shape=(num_obs, len(rows)))
+        jac = red_mat @ jac.reshape((len(rows), -1))
+        jac = jac.reshape(-1, len(trainable_params)) if has_shape0 else jac
+        jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
+        jac_r[:, processed_data["record_tp_rows"]] = jac
+        return self._adjoint_jacobian_processing(jac_r)
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
new file mode 100644
index 000000000..9efd2c19a
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -0,0 +1,219 @@
+# Copyright 2018-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Class implementation for state vector measurements.
+"""
+
+from __future__ import annotations
+
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
+
+    try:
+        from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64, MeasurementsMPIC128
+
+        mpi_error = None
+        MPI_SUPPORT = True
+    except ImportError as ex_mpi:
+        mpi_error = ex_mpi
+        MPI_SUPPORT = False
+
+except ImportError as error_import:
+    warn(str(error_import), UserWarning)
+
+from typing import Any, List
+
+import numpy as np
+import pennylane as qml
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
+from pennylane.typing import TensorLike
+
+# pylint: disable=ungrouped-imports
+from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
+from pennylane_lightning.core._serialize import QuantumScriptSerializer
+
+
+class LightningGPUMeasurements(LightningBaseMeasurements):  # pylint: disable=too-few-public-methods
+    """Lightning GPU Measurements class
+
+    Measures the state provided by the LightningGPUStateVector class.
+
+    Args:
+        qubit_state(LightningGPUStateVector): Lightning state-vector class containing the state vector to be measured.
+    """
+
+    def __init__(
+        self,
+        qubit_state: LightningGPUStateVector,  # pylint: disable=undefined-variable
+    ) -> None:
+        """Instantiate the C++ measurements binding for ``qubit_state.state_vector``."""
+        super().__init__(qubit_state)
+
+        self._use_mpi = qubit_state._mpi_handler.use_mpi
+
+        if self._use_mpi:
+            self._mpi_handler = qubit_state._mpi_handler
+            self._num_local_wires = qubit_state._mpi_handler.num_local_wires
+
+        self._measurement_lightning = self._measurement_dtype()(qubit_state.state_vector)
+
+    def _measurement_dtype(self):
+        """Binding to Lightning GPU Measurements C++ class.
+
+        Returns: the Measurements class
+        """
+        if self._use_mpi:
+            if not MPI_SUPPORT:
+                warn(str(mpi_error), UserWarning)
+
+            return MeasurementsMPIC128 if self.dtype == np.complex128 else MeasurementsMPIC64
+
+        # without MPI
+        return MeasurementsC128 if self.dtype == np.complex128 else MeasurementsC64
+
+    def _measure_with_samples_diagonalizing_gates(
+        self,
+        mps: List[SampleMeasurement],
+        shots: Shots,
+    ) -> TensorLike:
+        """
+        Returns the samples of the measurement process performed on the given state,
+        by rotating the state into the measurement basis using the diagonalizing gates
+        given by the measurement process.
+
+        Args:
+            mps (~.measurements.SampleMeasurement): The sample measurements to perform
+            shots (~.measurements.Shots): The number of samples to take
+
+        Returns:
+            TensorLike[Any]: Sample measurement results
+        """
+        # apply diagonalizing gates
+        self._apply_diagonalizing_gates(mps)
+
+        # Specific for LGPU:
+        total_indices = self._qubit_state.num_wires
+        wires = qml.wires.Wires(range(total_indices))
+
+        def _process_single_shot(samples):
+            processed = []
+            for mp in mps:
+                res = mp.process_samples(samples, wires)
+                if not isinstance(mp, CountsMP):
+                    res = qml.math.squeeze(res)
+
+                processed.append(res)
+
+            return tuple(processed)
+
+        try:
+            samples = self._measurement_lightning.generate_samples(
+                len(wires), shots.total_shots
+            ).astype(int, copy=False)
+
+        except ValueError as ex:
+            if str(ex) != "probabilities contain NaN":
+                raise ex
+            samples = qml.math.full((shots.total_shots, len(wires)), 0)
+
+        self._apply_diagonalizing_gates(mps, adjoint=True)
+
+        # if there is a shot vector, use the shots.bins generator to
+        # split samples w.r.t. the shots
+        processed_samples = []
+        for lower, upper in shots.bins():
+            result = _process_single_shot(samples[..., lower:upper, :])
+            processed_samples.append(result)
+
+        return (
+            tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
+        )
+
+    def expval(self, measurementprocess: MeasurementProcess):
+        """Expectation value of the supplied observable contained in the MeasurementProcess.
+
+        Args:
+            measurementprocess (StateMeasurement): measurement to apply to the state
+
+        Returns:
+            Expectation value of the observable
+        """
+
+        if isinstance(measurementprocess.obs, qml.SparseHamiltonian):
+            # ensuring CSR sparse representation.
+
+            if self._use_mpi:
+                # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce
+                # host(cpu) memory requirements
+                obs = qml.Identity(0)
+                Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix()
+                H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1))
+                CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr()
+                # CSR_SparseHamiltonian for rank == 0
+                if self._mpi_handler.mpi_manager.getRank() == 0:
+                    CSR_SparseHamiltonian = measurementprocess.obs.sparse_matrix().tocsr()
+            else:
+                CSR_SparseHamiltonian = measurementprocess.obs.sparse_matrix(
+                    wire_order=list(range(self._qubit_state.num_wires))
+                ).tocsr(copy=False)
+
+            return self._measurement_lightning.expval(
+                CSR_SparseHamiltonian.indptr,
+                CSR_SparseHamiltonian.indices,
+                CSR_SparseHamiltonian.data,
+            )
+
+        # use specialized functors to compute expval(Hermitian)
+        if isinstance(measurementprocess.obs, qml.Hermitian):
+            observable_wires = measurementprocess.obs.wires
+            if self._use_mpi and len(observable_wires) > self._num_local_wires:
+                raise RuntimeError(
+                    "MPI backend does not support Hermitian with number of target wires larger than local wire number."
+                )
+            matrix = measurementprocess.obs.matrix()
+            return self._measurement_lightning.expval(matrix, observable_wires)
+
+        if (
+            isinstance(measurementprocess.obs, qml.ops.Hamiltonian)
+            or (measurementprocess.obs.arithmetic_depth > 0)
+            or isinstance(measurementprocess.obs.name, List)
+        ):
+            # pylint: disable=protected-access
+            ob_serialized = QuantumScriptSerializer(
+                self._qubit_state.device_name, self.dtype == np.complex64, self._use_mpi
+            )._ob(measurementprocess.obs)
+            return self._measurement_lightning.expval(ob_serialized)
+
+        return self._measurement_lightning.expval(
+            measurementprocess.obs.name, measurementprocess.obs.wires
+        )
+
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the data structure from the C++ backend to a common structure through lightning devices.
+
+        Args:
+            probs_result (Any): Result provided by C++ backend.
+
+        Returns:
+            np.ndarray with probabilities of the supplied observable or wires.
+        """
+
+        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
+        if len(probs_results) > 0:
+            num_local_wires = len(probs_results).bit_length() - 1 if len(probs_results) > 0 else 0
+            return probs_results.reshape([2] * num_local_wires).transpose().reshape(-1)
+
+        return probs_results
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
new file mode 100644
index 000000000..0d569ebeb
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module contains the :class:`~.LightningGPU_MPIHandler` class, a MPI handler to use LightningGPU device with multi-GPU on multi-node system.
+"""
+
+try:
+    # pylint: disable=no-name-in-module
+    from pennylane_lightning.lightning_gpu_ops import DevPool, DevTag, MPIManager
+
+    MPI_SUPPORT = True
+except ImportError:
+    MPI_SUPPORT = False
+
+from typing import Union
+
+import numpy as np
+
+
+# MPI options
+class MPIHandler:  # pylint: disable=too-few-public-methods
+    """MPI handler for PennyLane Lightning GPU device.
+
+    MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
+
+    Use the MPI library is necessary to initialize different variables and methods to handle the data across nodes and perform checks for memory allocation on each device.
+
+    Args:
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        num_wires (int): the number of wires to initialize the device with.
+        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation.
+    """
+
+    def __init__(
+        self,
+        mpi: bool,
+        mpi_buf_size: int,
+        num_wires: int,
+        c_dtype: Union[np.complex64, np.complex128],
+    ) -> None:
+
+        self.use_mpi = mpi
+        self.mpi_buf_size = mpi_buf_size
+
+        self._dp = DevPool()
+
+        if self.use_mpi:
+
+            if not MPI_SUPPORT:
+                raise ImportError(
+                    "Pre-compiled binaries for lightning.gpu with MPI support are not available. "
+                    "To manually compile from source, follow the instructions at "
+                    "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
+                )
+
+            if mpi_buf_size < 0:
+                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
+
+            if mpi_buf_size > 0 and (mpi_buf_size & (mpi_buf_size - 1)):
+                raise ValueError(
+                    f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
+                )
+
+            # After check if all MPI parameters are ok
+            self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
+
+            # set the number of global and local wires
+            commSize = self.mpi_manager.getSize()
+            self.num_global_wires = commSize.bit_length() - 1
+            self.num_local_wires = num_wires - self.num_global_wires
+
+            self._check_memory_size(c_dtype, mpi_buf_size)
+
+        if not self.use_mpi:
+            self.num_local_wires = num_wires
+            self.num_global_wires = num_wires
+
+    def _mebibytesToBytes(self, mebibytes):
+        return mebibytes * 1024 * 1024
+
+    def _check_memory_size(self, c_dtype, mpi_buf_size):
+        # Memory size in bytes
+        sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
+        if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
+            raise RuntimeError("The MPI buffer size is larger than the local state vector size.")
+
+    def _mpi_init_helper(self, num_wires):
+        """Run MPI sanity checks, bind this rank to a GPU and return ``(mpi_manager, devtag)``."""
+
+        # initialize MPIManager and config check in the MPIManager ctor
+        mpi_manager = MPIManager()
+
+        # check that each node has at least as many GPUs as processes running on it
+        numDevices = self._dp.getTotalDevices()
+        numProcsNode = mpi_manager.getSizeNode()
+
+        if numDevices < numProcsNode:
+            raise ValueError(
+                "Number of devices should be larger than or equal to the number of processes on each node."
+            )
+
+        # check that the number of processes does not exceed 2**(num_wires - 1)
+        if mpi_manager.getSize() > (1 << (num_wires - 1)):
+            raise ValueError(
+                "Number of processes should be smaller than the number of statevector elements."
+            )
+
+        # set GPU device: the device id is the rank modulo the number of processes per node
+        rank = mpi_manager.getRank()
+        deviceid = rank % numProcsNode
+        self._dp.setDeviceID(deviceid)
+        devtag = DevTag(deviceid)
+
+        return (mpi_manager, devtag)
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
new file mode 100644
index 000000000..d550fcb28
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -0,0 +1,337 @@
+# Copyright 2018-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Class implementation for lightning_gpu state-vector manipulation.
+"""
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import StateVectorC64, StateVectorC128
+
+    try:  # Try to import the MPI modules
+        from pennylane_lightning.lightning_gpu_ops import StateVectorMPIC64, StateVectorMPIC128
+
+        mpi_error = None
+        MPI_SUPPORT = True
+    except ImportError as ex_mpi:
+        mpi_error = ex_mpi
+        MPI_SUPPORT = False
+
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from itertools import product
+from typing import Union
+
+import numpy as np
+import pennylane as qml
+from pennylane import DeviceError
+from pennylane.ops.op_math import Adjoint
+from pennylane.wires import Wires
+
+# pylint: disable=ungrouped-imports
+from pennylane_lightning.core._serialize import global_phase_diagonal
+from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
+
+from ._mpi_handler import MPIHandler
+
+gate_cache_needs_hash = (
+    qml.BlockEncode,
+    qml.ControlledQubitUnitary,
+    qml.DiagonalQubitUnitary,
+    qml.MultiControlledX,
+    qml.OrbitalRotation,
+    qml.PSWAP,
+    qml.QubitUnitary,
+)
+
+
+class LightningGPUStateVector(LightningBaseStateVector):
+    """Lightning GPU state-vector class.
+
+    Interfaces with C++ python binding methods for state-vector manipulation.
+
+    Args:
+        num_wires(int): the number of wires to initialize the device with
+        dtype: Datatypes for state-vector representation. Must be one of
+            ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
+        device_name(string): state vector device name. Options: ["lightning.gpu"]
+        mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device.
+            Provides functionality to distribute the state-vector to multiple devices.
+        sync (bool): is host-device data copy synchronized or not.
+    """
+
+    def __init__(
+        self,
+        num_wires: int,
+        dtype: Union[np.complex128, np.complex64] = np.complex128,
+        mpi_handler: MPIHandler = None,
+        sync: bool = True,
+    ):
+
+        super().__init__(num_wires, dtype)
+
+        self._device_name = "lightning.gpu"
+
+        # Initialize GPU and MPI variables
+        if mpi_handler is None:
+            mpi_handler = MPIHandler(False, 0, num_wires, dtype)
+
+        self._num_global_wires = mpi_handler.num_global_wires
+        self._num_local_wires = mpi_handler.num_local_wires
+
+        self._mpi_handler = mpi_handler
+        self._sync = sync
+
+        # Initialize the state vector
+        if self._mpi_handler.use_mpi:  # using MPI
+            self._qubit_state = self._state_dtype()(
+                self._mpi_handler.mpi_manager,
+                self._mpi_handler.devtag,
+                self._mpi_handler.mpi_buf_size,
+                self._mpi_handler.num_global_wires,
+                self._mpi_handler.num_local_wires,
+            )
+        else:  # without MPI
+            self._qubit_state = self._state_dtype()(self.num_wires)
+
+        self.reset_state()
+
+    def _state_dtype(self):
+        """Binding to Lightning Managed state vector C++ class.
+
+        Returns: the state vector class
+        """
+        if self._mpi_handler.use_mpi:
+            if not MPI_SUPPORT:
+                warn(str(mpi_error), UserWarning)
+
+            return StateVectorMPIC128 if self.dtype == np.complex128 else StateVectorMPIC64
+
+        # without MPI
+        return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
+
+    def syncD2H(self, state_vector, use_async=False):
+        """Copy the state vector data on device to a state vector on the host provided by the user.
+        Args:
+            state_vector(array[complex]): the state vector array on host; passed on as ``ravel(order="C")``.
+            use_async(bool): indicates whether to use asynchronous memory copy from device to host or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.c_type)
+        >>> dev.syncD2H(state_vector)
+        >>> print(state_vector)
+        [0.+0.j 1.+0.j]
+        """
+        self._qubit_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+
+    @property
+    def state(self):
+        """Copy the state vector data from the device to the host.
+
+        A state vector Numpy array is explicitly allocated on the host to store and return the data.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> print(dev.state)
+        [0.+0.j 1.+0.j]
+        """
+        state = np.zeros(2**self._num_local_wires, dtype=self.dtype)
+        self.syncD2H(state)
+        return state
+
+    def syncH2D(self, state_vector, use_async=False):
+        """Copy the state vector data on host provided by the user to the state vector on the device
+        Args:
+            state_vector(array[complex]): the state vector array on host.
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=3)
+        >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
+        >>> obs1 = qml.Identity(1)
+        >>> H = qml.Hamiltonian([1.0, 1.0], [obs1, obs])
+        >>> state_vector = np.array([0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j,
+            0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j,], dtype=np.complex64,)
+        >>> dev.syncH2D(state_vector)
+        >>> res = dev.expval(H)
+        >>> print(res)
+        1.0
+        """
+        self._qubit_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+
+    @staticmethod
+    def _asarray(arr, dtype=None):
+        arr = np.asarray(arr)  # arr is not copied
+
+        if arr.dtype.kind not in ["f", "c"]:
+            return arr
+
+        if not dtype:
+            dtype = arr.dtype
+
+        return arr
+
+    def _apply_state_vector(self, state, device_wires, use_async=False):
+        """Initialize the state vector on GPU with a specified state on host.
+        Note that any use of this method will introduce host-overheads.
+        Args:
+            state (array[complex]): normalized input state (on host) of length ``2**len(wires)``
+                or broadcasted state of shape ``(batch_size, 2**len(wires))``
+            device_wires (Wires): wires that get initialized in the state
+            use_async (bool): whether to use an asynchronous host-to-device memory copy.
+                Note: This function only supports synchronized memory copy from host to device.
+        """
+        # Passing an existing state-vector object (same class as the device state) is rejected.
+        if isinstance(state, self._qubit_state.__class__):
+            raise DeviceError("LightningGPU does not support allocate external state_vector.")
+
+            # TODO
+            # Create an implementation in the C++ backend and binding to be able
+            # to allocate memory for a new statevector and copy the data
+            # from an external state vector.
+            # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
+            # state.getState(state_data)
+            # state = state_data
+
+        state = self._asarray(state, dtype=self.dtype)  # this operation on host
+        output_shape = [2] * self._num_local_wires
+
+        if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
+            # Initialize the entire device state with the input state
+            if self.num_wires == self._num_local_wires:
+                self.syncH2D(np.reshape(state, output_shape))
+                return
+            local_state = np.zeros(2**self._num_local_wires, dtype=self._dtype)
+            self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
+            self.syncH2D(np.reshape(local_state, output_shape))
+            return
+
+        # generate basis states on subset of qubits via the cartesian product
+        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
+
+        # get basis states to alter on full set of qubits
+        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
+        unravelled_indices[:, device_wires] = basis_states
+
+        # get indices for which the state is changed to input state vector elements
+        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
+
+        # set the state vector on GPU with the ravelled_indices and their corresponding values
+        self._qubit_state.setStateVector(
+            ravelled_indices, state, use_async
+        )  # this operation on device
+
+    def _apply_lightning_controlled(self, operation):
+        """Apply a controlled ``GlobalPhase`` operation to the state vector.
+
+        Args:
+            operation (~pennylane.operation.Operation): controlled ``GlobalPhase`` operation to apply
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        control_wires = list(operation.control_wires)
+        control_values = operation.control_values
+        name = operation.name
+        # Apply GlobalPhase through the matrix produced by global_phase_diagonal
+        inv = False
+        param = operation.parameters[0]
+        wires = self.wires.indices(operation.wires)
+        matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
+        state.apply(name, wires, inv, [[param]], matrix)
+
+    def _apply_lightning_midmeasure(self):
+        """Placeholder for mid-circuit measurement (``MidMeasureMP``) support.
+
+        LightningGPU does not support mid-circuit measurements, so this method always raises.
+
+        Raises:
+            DeviceError: always.
+        """
+        raise DeviceError("LightningGPU does not support Mid-circuit measurements.")
+
+    # pylint: disable=unused-argument
+    def _apply_lightning(
+        self, operations, mid_measurements: dict = None, postselect_mode: str = None
+    ):
+        """Apply a list of operations to the state vector.
+
+        Args:
+            operations (list[~pennylane.operation.Operation]): operations to apply
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements (not used here)
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``. Not used by this method.
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        # Skip over identity operations instead of performing
+        # matrix multiplication with them.
+        for operation in operations:
+            if isinstance(operation, qml.Identity):
+                continue
+            if isinstance(operation, Adjoint):
+                name = operation.base.name
+                invert_param = True
+            else:
+                name = operation.name
+                invert_param = False
+            method = getattr(state, name, None)  # gate-specific method on the state, if any
+            wires = list(operation.wires)
+
+            if method is not None:  # apply specialized gate
+                param = operation.parameters
+                method(wires, invert_param, param)
+            elif isinstance(operation, qml.ops.Controlled) and isinstance(
+                operation.base, qml.GlobalPhase
+            ):  # apply n-controlled gate
+                # LGPU does not support controlled gates other than GlobalPhase
+                self._apply_lightning_controlled(operation)
+            else:  # apply gate as a matrix
+                try:
+                    mat = qml.matrix(operation)
+                except AttributeError:  # pragma: no cover
+                    # To support older versions of PL
+                    mat = operation.matrix
+
+                r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
+                param = (
+                    [[r_dtype(operation.hash)]]
+                    if isinstance(operation, gate_cache_needs_hash)
+                    else []
+                )
+                if len(mat) == 0:
+                    raise ValueError("Unsupported operation")
+
+                self._qubit_state.apply(  # NOTE(review): other branches use ``state``; confirm same
+                    name,
+                    wires,
+                    False,
+                    param,
+                    mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
+                )  # NOTE(review): data is flattened in C-order, not F-order as previously noted; confirm
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 2894b999f..84d7dd31e 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -16,921 +16,496 @@
 This module contains the :class:`~.LightningGPU` class, a PennyLane simulator device that
 interfaces with the NVIDIA cuQuantum cuStateVec simulator library for GPU-enabled calculations.
 """
+from __future__ import annotations
 
 from ctypes.util import find_library
+from dataclasses import replace
 from importlib import util as imp_util
-from itertools import product
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
 from warnings import warn
 
 import numpy as np
 import pennylane as qml
-from pennylane import BasisState, DeviceError, QuantumFunctionError, Rot, StatePrep, math
-from pennylane.measurements import Expectation, State
-from pennylane.ops.op_math import Adjoint
-from pennylane.wires import Wires
-from scipy.sparse import csr_matrix
-
-from pennylane_lightning.core._serialize import QuantumScriptSerializer, global_phase_diagonal
-from pennylane_lightning.core._version import __version__
-
-# pylint: disable=import-error, no-name-in-module, ungrouped-imports
-from pennylane_lightning.core.lightning_base import LightningBase
+from pennylane.devices import DefaultExecutionConfig, ExecutionConfig
+from pennylane.devices.default_qubit import adjoint_ops
+from pennylane.devices.modifiers import simulator_tracking, single_tape_support
+from pennylane.devices.preprocess import (
+    decompose,
+    mid_circuit_measurements,
+    no_sampling,
+    validate_adjoint_trainable_params,
+    validate_device_wires,
+    validate_measurements,
+    validate_observables,
+)
+from pennylane.measurements import MidMeasureMP
+from pennylane.operation import DecompositionUndefinedError, Operator, Tensor
+from pennylane.ops import Prod, SProd, Sum
+from pennylane.tape import QuantumScript
+from pennylane.transforms.core import TransformProgram
+from pennylane.typing import Result
+
+from pennylane_lightning.core.lightning_newAPI_base import (
+    LightningBase,
+    QuantumTape_or_Batch,
+    Result_or_ResultBatch,
+)
 
 try:
     from pennylane_lightning.lightning_gpu_ops import (
         DevPool,
-        MeasurementsC64,
-        MeasurementsC128,
-        StateVectorC64,
-        StateVectorC128,
         backend_info,
         get_gpu_arch,
         is_gpu_supported,
     )
-    from pennylane_lightning.lightning_gpu_ops.algorithms import (
-        AdjointJacobianC64,
-        AdjointJacobianC128,
-        create_ops_listC64,
-        create_ops_listC128,
-    )
-
-    try:
-        # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import (
-            DevTag,
-            MeasurementsMPIC64,
-            MeasurementsMPIC128,
-            MPIManager,
-            StateVectorMPIC64,
-            StateVectorMPIC128,
-        )
-        from pennylane_lightning.lightning_gpu_ops.algorithmsMPI import (
-            AdjointJacobianMPIC64,
-            AdjointJacobianMPIC128,
-            create_ops_listMPIC64,
-            create_ops_listMPIC128,
-        )
-
-        MPI_SUPPORT = True
-    except ImportError as ex:
-        warn(str(ex), UserWarning)
-        MPI_SUPPORT = False
-
-    if find_library("custatevec") is None and not imp_util.find_spec(
-        "cuquantum"
-    ):  # pragma: no cover
-        raise ImportError(
-            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
-        )
-    if not DevPool.getTotalDevices():  # pragma: no cover
-        raise ValueError("No supported CUDA-capable device found")
-
-    if not is_gpu_supported():  # pragma: no cover
-        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
 
     LGPU_CPP_BINARY_AVAILABLE = True
+
 except (ImportError, ValueError) as ex:
     warn(str(ex), UserWarning)
-    backend_info = None
     LGPU_CPP_BINARY_AVAILABLE = False
+    backend_info = None
 
-
-def _gpu_dtype(dtype, mpi=False):
-    if dtype not in [np.complex128, np.complex64]:  # pragma: no cover
-        raise ValueError(f"Data type is not supported for state-vector computation: {dtype}")
-    if mpi:
-        return StateVectorMPIC128 if dtype == np.complex128 else StateVectorMPIC64
-    return StateVectorC128 if dtype == np.complex128 else StateVectorC64
-
-
-def _adj_dtype(use_csingle, mpi=False):
-    if mpi:
-        return AdjointJacobianMPIC64 if use_csingle else AdjointJacobianMPIC128
-    return AdjointJacobianC64 if use_csingle else AdjointJacobianC128
-
-
-def _mebibytesToBytes(mebibytes):
-    return mebibytes * 1024 * 1024
-
-
-allowed_operations = {
-    "Identity",
-    "BasisState",
-    "QubitStateVector",
-    "StatePrep",
-    "QubitUnitary",
-    "ControlledQubitUnitary",
-    "MultiControlledX",
-    "DiagonalQubitUnitary",
-    "PauliX",
-    "PauliY",
-    "PauliZ",
-    "MultiRZ",
-    "GlobalPhase",
-    "C(GlobalPhase)",
-    "Hadamard",
-    "S",
-    "Adjoint(S)",
-    "T",
-    "Adjoint(T)",
-    "SX",
-    "Adjoint(SX)",
-    "CNOT",
-    "SWAP",
-    "ISWAP",
-    "PSWAP",
-    "Adjoint(ISWAP)",
-    "SISWAP",
-    "Adjoint(SISWAP)",
-    "SQISW",
-    "CSWAP",
-    "Toffoli",
-    "CY",
-    "CZ",
-    "PhaseShift",
-    "ControlledPhaseShift",
-    "RX",
-    "RY",
-    "RZ",
-    "Rot",
-    "CRX",
-    "CRY",
-    "CRZ",
-    "CRot",
-    "IsingXX",
-    "IsingYY",
-    "IsingZZ",
-    "IsingXY",
-    "SingleExcitation",
-    "SingleExcitationPlus",
-    "SingleExcitationMinus",
-    "DoubleExcitation",
-    "DoubleExcitationPlus",
-    "DoubleExcitationMinus",
-    "QubitCarry",
-    "QubitSum",
-    "OrbitalRotation",
-    "ECR",
-    "BlockEncode",
-    "C(BlockEncode)",
-}
-
-allowed_observables = {
-    "PauliX",
-    "PauliY",
-    "PauliZ",
-    "Hadamard",
-    "SparseHamiltonian",
-    "Hamiltonian",
-    "LinearCombination",
-    "Hermitian",
-    "Identity",
-    "Projector",
-    "Sum",
-    "Prod",
-    "SProd",
-}
-
-gate_cache_needs_hash = (
-    qml.BlockEncode,
-    qml.ControlledQubitUnitary,
-    qml.DiagonalQubitUnitary,
-    qml.MultiControlledX,
-    qml.OrbitalRotation,
-    qml.PSWAP,
-    qml.QubitUnitary,
+from ._adjoint_jacobian import LightningGPUAdjointJacobian
+from ._measurements import LightningGPUMeasurements
+from ._mpi_handler import MPIHandler
+from ._state_vector import LightningGPUStateVector
+
+# The set of supported operations.
+_operations = frozenset(
+    {
+        "Identity",
+        "QubitStateVector",
+        "QubitUnitary",
+        "ControlledQubitUnitary",
+        "MultiControlledX",
+        "DiagonalQubitUnitary",
+        "PauliX",
+        "PauliY",
+        "PauliZ",
+        "MultiRZ",
+        "GlobalPhase",
+        "C(GlobalPhase)",
+        "Hadamard",
+        "S",
+        "Adjoint(S)",
+        "T",
+        "Adjoint(T)",
+        "SX",
+        "Adjoint(SX)",
+        "CNOT",
+        "SWAP",
+        "ISWAP",
+        "PSWAP",
+        "Adjoint(ISWAP)",
+        "SISWAP",
+        "Adjoint(SISWAP)",
+        "SQISW",
+        "CSWAP",
+        "Toffoli",
+        "CY",
+        "CZ",
+        "PhaseShift",
+        "ControlledPhaseShift",
+        "RX",
+        "RY",
+        "RZ",
+        "Rot",
+        "CRX",
+        "CRY",
+        "CRZ",
+        "CRot",
+        "IsingXX",
+        "IsingYY",
+        "IsingZZ",
+        "IsingXY",
+        "SingleExcitation",
+        "SingleExcitationPlus",
+        "SingleExcitationMinus",
+        "DoubleExcitation",
+        "DoubleExcitationPlus",
+        "DoubleExcitationMinus",
+        "QubitCarry",
+        "QubitSum",
+        "OrbitalRotation",
+        "ECR",
+        "BlockEncode",
+        "C(BlockEncode)",
+    }
+)
+# End the set of supported operations.
+
+# The set of supported observables.
+_observables = frozenset(
+    {
+        "PauliX",
+        "PauliY",
+        "PauliZ",
+        "Hadamard",
+        "SparseHamiltonian",
+        "Hamiltonian",
+        "LinearCombination",
+        "Hermitian",
+        "Identity",
+        "Projector",
+        "Sum",
+        "Prod",
+        "SProd",
+    }
 )
 
 
-class LightningGPU(LightningBase):  # pylint: disable=too-many-instance-attributes
-    """PennyLane Lightning GPU device.
+def stopping_condition(op: Operator) -> bool:
+    """A function that determines whether or not an operation is supported by ``lightning.gpu``."""
+    # To avoid building matrices beyond the given thresholds.
+    # This should reduce runtime overheads for larger systems.
+    if isinstance(op, qml.QFT):
+        return len(op.wires) < 10
+    if isinstance(op, qml.GroverOperator):
+        return len(op.wires) < 13
+    if isinstance(op, qml.PauliRot):
+        return False
 
-    A GPU-backed Lightning device using NVIDIA cuQuantum SDK.
+    return op.name in _operations
 
-    Use of this device requires pre-built binaries or compilation from source. Check out the
-    :doc:`/lightning_gpu/installation` guide for more details.
 
-    Args:
-        wires (int): the number of wires to initialize the device with
-        mpi (bool): enable MPI support. MPI support will be enabled if ``mpi`` is set as``True``.
-        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
-        sync (bool): immediately sync with host-sv after applying operations
-        c_dtype: Datatypes for statevector representation. Must be one of ``np.complex64`` or ``np.complex128``.
-        shots (int): How many times the circuit should be evaluated (or sampled) to estimate
-            the expectation values. Defaults to ``None`` if not specified. Setting
-            to ``None`` results in computing statistics like expectation values and
-            variances analytically.
-        batch_obs (Union[bool, int]): determine whether to use multiple GPUs within the same node or not
-    """
+def stopping_condition_shots(op: Operator) -> bool:
+    """A function that determines whether or not an operation is supported by ``lightning.gpu``
+    with finite shots."""
+    if isinstance(op, (MidMeasureMP, qml.ops.op_math.Conditional)):
+        # LightningGPU does not support Mid-circuit measurements.
+        return False
+    return stopping_condition(op)
 
-    name = "Lightning GPU PennyLane plugin"
-    short_name = "lightning.gpu"
 
-    operations = allowed_operations
-    observables = allowed_observables
-    _backend_info = backend_info
-    config = Path(__file__).parent / "lightning_gpu.toml"
-    _CPP_BINARY_AVAILABLE = LGPU_CPP_BINARY_AVAILABLE
+def accepted_observables(obs: Operator) -> bool:
+    """A function that determines whether or not an observable is supported by ``lightning.gpu``."""
+    return obs.name in _observables
 
-    def __init__(
-        self,
-        wires,
-        *,
-        mpi: bool = False,
-        mpi_buf_size: int = 0,
-        sync=False,
-        c_dtype=np.complex128,
-        shots=None,
-        batch_obs: Union[bool, int] = False,
-    ):  # pylint: disable=too-many-arguments
-        if c_dtype is np.complex64:
-            self.use_csingle = True
-        elif c_dtype is np.complex128:
-            self.use_csingle = False
-        else:
-            raise TypeError(f"Unsupported complex type: {c_dtype}")
-
-        super().__init__(wires, shots=shots, c_dtype=c_dtype)
 
-        self._dp = DevPool()
+def adjoint_observables(obs: Operator) -> bool:
+    """A function that determines whether or not an observable is supported by ``lightning.gpu``
+    when using the adjoint differentiation method."""
+    if isinstance(obs, qml.Projector):
+        return False
 
-        if not mpi:
-            self._mpi = False
-            self._num_local_wires = self.num_wires
-            self._gpu_state = _gpu_dtype(c_dtype)(self._num_local_wires)
-        else:
-            self._mpi = True
-            self._mpi_init_helper(self.num_wires)
-
-            if mpi_buf_size < 0:
-                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}")
-
-            if mpi_buf_size:
-                if mpi_buf_size & (mpi_buf_size - 1):
-                    raise TypeError(
-                        f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
-                    )
-                # Memory size in bytes
-                sv_memsize = np.dtype(c_dtype).itemsize * (1 << self._num_local_wires)
-                if _mebibytesToBytes(mpi_buf_size) > sv_memsize:
-                    w_msg = "The MPI buffer size is larger than the local state vector size."
-                    warn(
-                        w_msg,
-                        RuntimeWarning,
-                    )
-
-            self._gpu_state = _gpu_dtype(c_dtype, mpi)(
-                self._mpi_manager,
-                self._devtag,
-                mpi_buf_size,
-                self._num_global_wires,
-                self._num_local_wires,
-            )
+    if isinstance(obs, Tensor):
+        if any(isinstance(o, qml.Projector) for o in obs.non_identity_obs):
+            return False
+        return True
 
-        self._sync = sync
-        self._batch_obs = batch_obs
-        self._create_basis_state(0)
-
-    def _mpi_init_helper(self, num_wires):
-        """Set up MPI checks."""
-        if not MPI_SUPPORT:
-            raise ImportError("MPI related APIs are not found.")
-        # initialize MPIManager and config check in the MPIManager ctor
-        self._mpi_manager = MPIManager()
-        # check if number of GPUs per node is larger than
-        # number of processes per node
-        numDevices = self._dp.getTotalDevices()
-        numProcsNode = self._mpi_manager.getSizeNode()
-        if numDevices < numProcsNode:
-            raise ValueError(
-                "Number of devices should be larger than or equal to the number of processes on each node."
-            )
-        # check if the process number is larger than number of statevector elements
-        if self._mpi_manager.getSize() > (1 << (num_wires - 1)):
-            raise ValueError(
-                "Number of processes should be smaller than the number of statevector elements."
-            )
-        # set the number of global and local wires
-        commSize = self._mpi_manager.getSize()
-        self._num_global_wires = commSize.bit_length() - 1
-        self._num_local_wires = num_wires - self._num_global_wires
-        # set GPU device
-        rank = self._mpi_manager.getRank()
-        deviceid = rank % numProcsNode
-        self._dp.setDeviceID(deviceid)
-        self._devtag = DevTag(deviceid)
-
-    @staticmethod
-    def _asarray(arr, dtype=None):
-        arr = np.asarray(arr)  # arr is not copied
-
-        if arr.dtype.kind not in ["f", "c"]:
-            return arr
-
-        if not dtype:
-            dtype = arr.dtype
-
-        return arr
-
-    # pylint disable=missing-function-docstring
-    def reset(self):
-        """Reset the device"""
-        super().reset()
-        # init the state vector to |00..0>
-        self._gpu_state.resetGPU(False)  # Sync reset
+    if isinstance(obs, SProd):
+        return adjoint_observables(obs.base)
 
-    @property
-    def state(self):
-        # pylint disable=missing-function-docstring
-        """Copy the state vector data from the device to the host.
+    if isinstance(obs, (Sum, Prod)):
+        return all(adjoint_observables(o) for o in obs)
 
-        A state vector Numpy array is explicitly allocated on the host to store and return the data.
+    return obs.name in _observables
 
-        **Example**
 
-        >>> dev = qml.device('lightning.gpu', wires=1)
-        >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> print(dev.state)
-        [0.+0.j 1.+0.j]
-        """
-        state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
-        state = self._asarray(state, dtype=self.C_DTYPE)
-        self.syncD2H(state)
-        return state
+def adjoint_measurements(mp: qml.measurements.MeasurementProcess) -> bool:
+    """Whether a measurement is supported by adjoint differentiation (``ExpectationMP`` only)."""
+    return isinstance(mp, qml.measurements.ExpectationMP)
 
-    @property
-    def create_ops_list(self):
-        """Returns create_ops_list function of the matching precision."""
-        if self._mpi:
-            return create_ops_listMPIC64 if self.use_csingle else create_ops_listMPIC128
-        return create_ops_listC64 if self.use_csingle else create_ops_listC128
 
-    @property
-    def measurements(self):
-        """Returns Measurements constructor of the matching precision."""
-        if self._mpi:
-            return (
-                MeasurementsMPIC64(self._gpu_state)
-                if self.use_csingle
-                else MeasurementsMPIC128(self._gpu_state)
-            )
-        return (
-            MeasurementsC64(self._gpu_state)
-            if self.use_csingle
-            else MeasurementsC128(self._gpu_state)
-        )
+def _supports_adjoint(circuit):
+    if circuit is None:
+        return True
 
-    def syncD2H(self, state_vector, use_async=False):
-        """Copy the state vector data on device to a state vector on the host provided by the user
-        Args:
-            state_vector(array[complex]): the state vector array on host
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-
-        **Example**
-        >>> dev = qml.device('lightning.gpu', wires=1)
-        >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
-        >>> dev.syncD2H(state_vector)
-        >>> print(state_vector)
-        [0.+0.j 1.+0.j]
-        """
-        self._gpu_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+    prog = TransformProgram()
+    _add_adjoint_transforms(prog)
 
-    def syncH2D(self, state_vector, use_async=False):
-        """Copy the state vector data on host provided by the user to the state vector on the device
-        Args:
-            state_vector(array[complex]): the state vector array on host.
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-
-        **Example**
-        >>> dev = qml.device('lightning.gpu', wires=3)
-        >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
-        >>> obs1 = qml.Identity(1)
-        >>> H = qml.Hamiltonian([1.0, 1.0], [obs1, obs])
-        >>> state_vector = np.array([0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j,
-            0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j,], dtype=np.complex64,)
-        >>> dev.syncH2D(state_vector)
-        >>> res = dev.expval(H)
-        >>> print(res)
-        1.0
-        """
-        self._gpu_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+    try:
+        prog((circuit,))
+    except (DecompositionUndefinedError, qml.DeviceError, AttributeError):
+        return False
+    return True
 
-    def _create_basis_state(self, index, use_async=False):
-        """Return a computational basis state over all wires.
-        Args:
-            index (int): integer representing the computational basis state.
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-        """
-        self._gpu_state.setBasisState(index, use_async)
 
-    def _apply_state_vector(self, state, device_wires, use_async=False):
-        """Initialize the state vector on GPU with a specified state on host.
-        Note that any use of this method will introduce host-overheads.
-        Args:
-        state (array[complex]): normalized input state (on host) of length ``2**len(wires)``
-                or broadcasted state of shape ``(batch_size, 2**len(wires))``
-        device_wires (Wires): wires that get initialized in the state
-        use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-        Note: This function only supports synchronized memory copy from host to device.
-        """
-        # translate to wire labels used by device
-        device_wires = self.map_wires(device_wires)
-
-        state = self._asarray(state, dtype=self.C_DTYPE)  # this operation on host
-        output_shape = [2] * self._num_local_wires
-
-        if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
-            # Initialize the entire device state with the input state
-            if self.num_wires == self._num_local_wires:
-                self.syncH2D(self._reshape(state, output_shape))
-                return
-            local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
-            self._mpi_manager.Scatter(state, local_state, 0)
-            # Initialize the entire device state with the input state
-            self.syncH2D(self._reshape(local_state, output_shape))
-            return
-
-        # generate basis states on subset of qubits via the cartesian product
-        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
-
-        # get basis states to alter on full set of qubits
-        unravelled_indices = np.zeros((2 ** len(device_wires), self.num_wires), dtype=int)
-        unravelled_indices[:, device_wires] = basis_states
-
-        # get indices for which the state is changed to input state vector elements
-        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
-
-        # set the state vector on GPU with the unravelled_indices and their corresponding values
-        self._gpu_state.setStateVector(
-            ravelled_indices, state, use_async
-        )  # this operation on device
-
-    def _apply_basis_state(self, state, wires):
-        """Initialize the state vector in a specified computational basis state on GPU directly.
-            Args:
-            state (array[int]): computational basis state (on host) of shape ``(wires,)``
-                consisting of 0s and 1s.
-            wires (Wires): wires that the provided computational state should be initialized on
-        Note: This function does not support broadcasted inputs yet.
-        """
-        # translate to wire labels used by device
-        device_wires = self.map_wires(wires)
+def _adjoint_ops(op: qml.operation.Operator) -> bool:
+    """Specify whether or not an Operator is supported by adjoint differentiation."""
+    return not isinstance(op, qml.PauliRot) and adjoint_ops(op)
 
-        # length of basis state parameter
-        n_basis_state = len(state)
-        state = state.tolist() if hasattr(state, "tolist") else state
-        if not set(state).issubset({0, 1}):
-            raise ValueError("BasisState parameter must consist of 0 or 1 integers.")
 
-        if n_basis_state != len(device_wires):
-            raise ValueError("BasisState parameter and wires must be of equal length.")
+def _add_adjoint_transforms(program: TransformProgram) -> None:
+    """Private helper function for ``preprocess`` that adds the transforms specific
+    for adjoint differentiation.
 
-        # get computational basis state number
-        basis_states = 2 ** (self.num_wires - 1 - np.array(device_wires))
-        basis_states = qml.math.convert_like(basis_states, state)
-        num = int(qml.math.dot(state, basis_states))
+    Args:
+        program (TransformProgram): where we will add the adjoint differentiation transforms
 
-        self._create_basis_state(num)
+    Side Effects:
+        Adds transforms to the input program.
 
-    def apply_lightning(self, operations):
-        """Apply a list of operations to the state tensor.
+    """
 
-        Args:
-            operations (list[~pennylane.operation.Operation]): operations to apply
-            dtype (type): Type of numpy ``complex`` to be used. Can be important
-            to specify for large systems for memory allocation purposes.
+    name = "adjoint + lightning.gpu"
+    program.add_transform(no_sampling, name=name)
+    program.add_transform(
+        decompose,
+        stopping_condition=_adjoint_ops,
+        stopping_condition_shots=stopping_condition_shots,
+        name=name,
+        skip_initial_state_prep=False,
+    )
+    program.add_transform(validate_observables, accepted_observables, name=name)
+    program.add_transform(
+        validate_measurements, analytic_measurements=adjoint_measurements, name=name
+    )
+    program.add_transform(qml.transforms.broadcast_expand)
+    program.add_transform(validate_adjoint_trainable_params)
 
-        Returns:
-            array[complex]: the output state tensor
-        """
-        # Skip over identity operations instead of performing
-        # matrix multiplication with the identity.
-        for ops in operations:
-            if isinstance(ops, qml.Identity):
-                continue
-            if isinstance(ops, Adjoint):
-                name = ops.base.name
-                invert_param = True
-            else:
-                name = ops.name
-                invert_param = False
-            method = getattr(self._gpu_state, name, None)
-            wires = self.wires.indices(ops.wires)
-
-            if isinstance(ops, qml.ops.op_math.Controlled) and isinstance(
-                ops.base, qml.GlobalPhase
-            ):
-                controls = ops.control_wires
-                control_values = ops.control_values
-                param = ops.base.parameters[0]
-                matrix = global_phase_diagonal(param, self.wires, controls, control_values)
-                self._gpu_state.apply(name, wires, False, [], matrix)
-            elif method is None:
-                # Inverse can be set to False since qml.matrix(ops) is already in inverted form
-                try:
-                    mat = qml.matrix(ops)
-                except AttributeError:  # pragma: no cover
-                    # To support older versions of PL
-                    mat = ops.matrix
-                r_dtype = np.float32 if self.use_csingle else np.float64
-                param = [[r_dtype(ops.hash)]] if isinstance(ops, gate_cache_needs_hash) else []
-                if len(mat) == 0:
-                    raise ValueError("Unsupported operation")
-                self._gpu_state.apply(
-                    name,
-                    wires,
-                    False,
-                    param,
-                    mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
-                )  # Parameters can be ignored for explicit matrices; F-order for cuQuantum
-
-            else:
-                param = ops.parameters
-                method(wires, invert_param, param)
 
-    # pylint: disable=unused-argument
-    def apply(self, operations, rotations=None, **kwargs):
-        """Applies a list of operations to the state tensor."""
-        # State preparation is currently done in Python
-        if operations:  # make sure operations[0] exists
-            if isinstance(operations[0], StatePrep):
-                self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
-                operations = operations[1:]
-            elif isinstance(operations[0], BasisState):
-                self._apply_basis_state(operations[0].parameters[0], operations[0].wires)
-                operations = operations[1:]
-
-        for operation in operations:
-            if isinstance(operation, (StatePrep, BasisState)):
-                raise DeviceError(
-                    f"Operation {operation.name} cannot be used after other "
-                    + f"Operations have already been applied on a {self.short_name} device."
-                )
+# LightningGPU specific methods
+def check_gpu_resources() -> None:
+    """Raise an error if cuStateVec is not installed or no supported CUDA device is found."""
+    if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
 
-        self.apply_lightning(operations)
+        raise ImportError(
+            "cuStateVec libraries not found. Please pip install the appropriate cuStateVec library in a virtual environment."
+        )
 
-    @staticmethod
-    def _check_adjdiff_supported_operations(operations):
-        """Check Lightning adjoint differentiation method support for a tape.
+    if not DevPool.getTotalDevices():
+        raise ValueError("No supported CUDA-capable device found")
 
-        Raise ``QuantumFunctionError`` if ``tape`` contains not supported measurements,
-        observables, or operations by the Lightning adjoint differentiation method.
+    if not is_gpu_supported():
+        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
 
-        Args:
-            tape (.QuantumTape): quantum tape to differentiate.
-        """
-        for op in operations:
-            if op.num_params > 1 and not isinstance(op, Rot):
-                raise QuantumFunctionError(
-                    f"The {op.name} operation is not supported using "
-                    'the "adjoint" differentiation method'
-                )
 
-    def _init_process_jacobian_tape(self, tape, starting_state, use_device_state):
-        """Generate an initial state vector for ``_process_jacobian_tape``."""
-        if starting_state is not None:
-            if starting_state.size != 2 ** len(self.wires):
-                raise QuantumFunctionError(
-                    "The number of qubits of starting_state must be the same as "
-                    "that of the device."
-                )
-            self._apply_state_vector(starting_state, self.wires)
-        elif not use_device_state:
-            self.reset()
-            self.apply(tape.operations)
-        return self._gpu_state
-
-    # pylint: disable=too-many-branches
-    def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
-        """Implements the adjoint method outlined in
-        `Jones and Gacon <https://arxiv.org/abs/2009.02823>`__ to differentiate an input tape.
-
-        After a forward pass, the circuit is reversed by iteratively applying adjoint
-        gates to scan backwards through the circuit.
-        """
-        if self.shots is not None:
-            warn(
-                "Requested adjoint differentiation to be computed with finite shots."
-                " The derivative is always exact when using the adjoint differentiation method.",
-                UserWarning,
-            )
+@simulator_tracking
+@single_tape_support
+class LightningGPU(LightningBase):
+    """PennyLane Lightning GPU device.
 
-        tape_return_type = self._check_adjdiff_supported_measurements(tape.measurements)
+    A device that interfaces with C++ to perform fast linear algebra calculations.
 
-        if not tape_return_type:  # the tape does not have measurements
-            return np.array([], dtype=self.state.dtype)
+    Use of this device requires pre-built binaries or compilation from source. Check out the
+    :doc:`/lightning_gpu/installation` guide for more details.
 
-        if tape_return_type is State:  # pragma: no cover
-            raise QuantumFunctionError(
-                "Adjoint differentiation method does not support measurement StateMP."
-                "Use vjp method instead for this purpose."
-            )
+    Args:
+        wires (int): the number of wires to initialize the device with
+        c_dtype: Datatypes for statevector representation. Must be one of
+            ``np.complex64`` or ``np.complex128``.
+        shots (int): How many times the circuit should be evaluated (or sampled) to estimate
+            the expectation values. Defaults to ``None`` if not specified. Setting
+            to ``None`` results in computing statistics like expectation values and
+            variances analytically.
+        batch_obs (bool): Determine whether we process observables in parallel when
+            computing the jacobian. This value is only relevant when the lightning.gpu
+            is built with MPI. Default is False.
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        sync (bool): is host-device data copy synchronized or not.
+    """
 
-        # Check adjoint diff support
-        self._check_adjdiff_supported_operations(tape.operations)
+    # General device options
+    _device_options = ("c_dtype", "batch_obs")
 
-        if self._mpi:
-            split_obs = False  # with MPI batched means compute Jacobian one observables at a time, no point splitting linear combinations
-        else:
-            split_obs = self._dp.getTotalDevices() if self._batch_obs else False
-        processed_data = self._process_jacobian_tape(
-            tape, starting_state, use_device_state, self._mpi, split_obs
-        )
+    # Device specific options
+    _CPP_BINARY_AVAILABLE = LGPU_CPP_BINARY_AVAILABLE
+    _backend_info = backend_info if LGPU_CPP_BINARY_AVAILABLE else None
 
-        if not processed_data:  # training_params is empty
-            return np.array([], dtype=self.state.dtype)
+    # This `config` is used in Catalyst-Frontend
+    config = Path(__file__).parent / "lightning_gpu.toml"
 
-        trainable_params = processed_data["tp_shift"]
-        # pylint: disable=pointless-string-statement
-        """
-        This path enables controlled batching over the requested observables, be they explicit, or part of a Hamiltonian.
-        The traditional path will assume there exists enough free memory to preallocate all arrays and run through each observable iteratively.
-        However, for larger system, this becomes impossible, and we hit memory issues very quickly. the batching support here enables several functionalities:
-        - Pre-allocate memory for all observables on the primary GPU (`batch_obs=False`, default behaviour): This is the simplest path, and works best for few observables, and moderate qubit sizes. All memory is preallocated for each observable, and run through iteratively on a single GPU.
-        - Evenly distribute the observables over all available GPUs (`batch_obs=True`): This will evenly split the data into ceil(num_obs/num_gpus) chunks, and allocate enough space on each GPU up-front before running through them concurrently. This relies on C++ threads to handle the orchestration.
-        - Allocate at most `n` observables per GPU (`batch_obs=n`): Providing an integer value restricts each available GPU to at most `n` copies of the statevector, and hence `n` given observables for a given batch. This will iterate over the data in chnuks of size `n*num_gpus`.
-        """
-        adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)()
-
-        if self._batch_obs:  # Batching of Measurements
-            jac = adjoint_jacobian.batched(
-                self._gpu_state,
-                processed_data["obs_serialized"],
-                processed_data["ops_serialized"],
-                trainable_params,
-            )
-        else:
-            jac = adjoint_jacobian(
-                self._gpu_state,
-                processed_data["obs_serialized"],
-                processed_data["ops_serialized"],
-                trainable_params,
-            )
-        jac = np.array(jac)
-        has_shape0 = bool(len(jac))
+    # TODO: Move supported ops/obs to TOML file
+    operations = _operations
+    # The names of the supported operations.
 
-        num_obs = len(np.unique(processed_data["obs_indices"]))
-        rows = processed_data["obs_indices"]
-        cols = np.arange(len(rows), dtype=int)
-        data = np.ones(len(rows))
-        red_mat = csr_matrix((data, (rows, cols)), shape=(num_obs, len(rows)))
-        jac = red_mat @ jac.reshape((len(rows), -1))
-        jac = jac.reshape(-1, len(trainable_params)) if has_shape0 else jac
-        jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
-        jac_r[:, processed_data["record_tp_rows"]] = jac
-        return self._adjoint_jacobian_processing(jac_r)
+    observables = _observables
+    # The names of the supported observables.
 
-    # pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring
-    def vjp(self, measurements, grad_vec, starting_state=None, use_device_state=False):
-        """Generate the processing function required to compute the vector-Jacobian products
-        of a tape.
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        wires: Union[int, List],
+        *,
+        c_dtype: Union[np.complex128, np.complex64] = np.complex128,
+        shots: Union[int, List] = None,
+        batch_obs: bool = False,
+        # GPU and MPI arguments
+        mpi: bool = False,
+        mpi_buf_size: int = 0,
+        sync: bool = False,
+    ):
+        if not self._CPP_BINARY_AVAILABLE:
+            raise ImportError(
+                "Pre-compiled binaries for lightning.gpu are not available. "
+                "To manually compile from source, follow the instructions at "
+                "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
+            )
 
-        This function can be used with multiple expectation values or a quantum state.
-        When a quantum state is given,
+        check_gpu_resources()
 
-        .. code-block:: python
+        super().__init__(
+            wires=wires,
+            c_dtype=c_dtype,
+            shots=shots,
+            batch_obs=batch_obs,
+        )
 
-            vjp_f = dev.vjp([qml.state()], grad_vec)
-            vjp = vjp_f(tape)
+        # Set the attributes to call the LightningGPU classes
+        self._set_lightning_classes()
 
-        computes :math:`w = (w_1,\\cdots,w_m)` where
+        # GPU specific options
+        self._dp = DevPool()
+        self._sync = sync
 
-        .. math::
+        # Creating the state vector
+        self._mpi_handler = MPIHandler(mpi, mpi_buf_size, len(self.wires), c_dtype)
 
-            w_k = \\langle v| \\frac{\\partial}{\\partial \\theta_k} | \\psi_{\\pmb{\\theta}} \\rangle.
+        self._statevector = self.LightningStateVector(
+            num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
+        )
 
-        Here, :math:`m` is the total number of trainable parameters,
-        :math:`\\pmb{\\theta}` is the vector of trainable parameters and
-        :math:`\\psi_{\\pmb{\\theta}}` is the output quantum state.
+    @property
+    def name(self):
+        """The name of the device."""
+        return "lightning.gpu"
 
-        Args:
-            measurements (list): List of measurement processes for vector-Jacobian product.
-                Now it must be expectation values or a quantum state.
-            grad_vec (tensor_like): Gradient-output vector. Must have shape matching the output
-                shape of the corresponding tape, i.e. number of measurements if the return
-                type is expectation or :math:`2^N` if the return type is statevector
-            starting_state (tensor_like): post-forward pass state to start execution with.
-                It should be complex-valued. Takes precedence over ``use_device_state``.
-            use_device_state (bool): use current device state to initialize.
-                A forward pass of the same circuit should be the last thing the device
-                has executed. If a ``starting_state`` is provided, that takes precedence.
+    def _set_lightning_classes(self):
+        """Set the LightningStateVector, LightningMeasurements, LightningAdjointJacobian classes as instance attributes"""
+        self.LightningStateVector = LightningGPUStateVector
+        self.LightningMeasurements = LightningGPUMeasurements
+        self.LightningAdjointJacobian = LightningGPUAdjointJacobian
 
-        Returns:
-            The processing function required to compute the vector-Jacobian products of a tape.
+    def _setup_execution_config(self, config):
         """
-        if self.shots is not None:
-            warn(
-                "Requested adjoint differentiation to be computed with finite shots."
-                " The derivative is always exact when using the adjoint differentiation method.",
-                UserWarning,
-            )
-
-        tape_return_type = self._check_adjdiff_supported_measurements(measurements)
-
-        if math.allclose(grad_vec, 0) or tape_return_type is None:
-            return lambda tape: math.convert_like(np.zeros(len(tape.trainable_params)), grad_vec)
+        Update the execution config with choices for how the device should be used and the device options.
+        """
+        updated_values = {}
+        if config.gradient_method == "best":
+            updated_values["gradient_method"] = "adjoint"
+        if config.use_device_gradient is None:
+            updated_values["use_device_gradient"] = config.gradient_method in ("best", "adjoint")
+        if config.grad_on_execution is None:
+            updated_values["grad_on_execution"] = True
 
-        if tape_return_type is Expectation:
-            if len(grad_vec) != len(measurements):
-                raise ValueError(
-                    "Number of observables in the tape must be the same as the length of grad_vec in the vjp method"
-                )
+        new_device_options = dict(config.device_options)
+        for option in self._device_options:
+            if option not in new_device_options:
+                new_device_options[option] = getattr(self, f"_{option}", None)
 
-            if np.iscomplexobj(grad_vec):
-                raise ValueError(
-                    "The vjp method only works with a real-valued grad_vec when the tape is returning an expectation value"
-                )
+        # It is necessary to set the mcmc default configuration to complete the requirements of ExecutionConfig
+        mcmc_default = {"mcmc": False, "kernel_name": None, "num_burnin": 0, "rng": None}
+        new_device_options.update(mcmc_default)
 
-            ham = qml.Hamiltonian(grad_vec, [m.obs for m in measurements])
+        return replace(config, **updated_values, device_options=new_device_options)
 
-            # pylint: disable=protected-access
-            def processing_fn(tape):
-                nonlocal ham
-                num_params = len(tape.trainable_params)
+    def preprocess(self, execution_config: ExecutionConfig = DefaultExecutionConfig):
+        """This function defines the device transform program to be applied and an updated device configuration.
 
-                if num_params == 0:
-                    return np.array([], dtype=self.state.dtype)
+        Args:
+            execution_config (Union[ExecutionConfig, Sequence[ExecutionConfig]]): A data structure describing the
+                parameters needed to fully describe the execution.
 
-                new_tape = tape.copy()
-                new_tape._measurements = [qml.expval(ham)]
+        Returns:
+            TransformProgram, ExecutionConfig: A transform program that when called returns :class:`~.QuantumTape`'s that the
+            device can natively execute as well as a postprocessing function to be called after execution, and a configuration
+            with unset specifications filled in.
 
-                return self.adjoint_jacobian(new_tape, starting_state, use_device_state)
+        This device:
 
-            return processing_fn
+        * Supports any qubit operations that provide a matrix
+        * Currently does not support finite shots
+        * Currently does not intrinsically support parameter broadcasting
 
-    # pylint: disable=attribute-defined-outside-init
-    def sample(self, observable, shot_range=None, bin_size=None, counts=False):
-        """Return samples of an observable."""
-        diagonalizing_gates = observable.diagonalizing_gates()
-        if diagonalizing_gates:
-            self.apply(diagonalizing_gates)
-        if not isinstance(observable, qml.PauliZ):
-            self._samples = self.generate_samples()
-        results = super().sample(
-            observable, shot_range=shot_range, bin_size=bin_size, counts=counts
+        """
+        exec_config = self._setup_execution_config(execution_config)
+        program = TransformProgram()
+
+        program.add_transform(validate_measurements, name=self.name)
+        program.add_transform(validate_observables, accepted_observables, name=self.name)
+        program.add_transform(validate_device_wires, self.wires, name=self.name)
+        program.add_transform(
+            mid_circuit_measurements, device=self, mcm_config=exec_config.mcm_config
         )
-        if diagonalizing_gates:
-            self.apply([qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)])
-        return results
-
-    def generate_samples(self):
-        """Generate samples
 
-        Returns:
-            array[int]: array of samples in binary representation with shape
-            ``(dev.shots, dev.num_wires)``
-        """
-        shots = self.shots if isinstance(self.shots, int) else self.shots.total_shots
+        program.add_transform(
+            decompose,
+            stopping_condition=stopping_condition,
+            stopping_condition_shots=stopping_condition_shots,
+            skip_initial_state_prep=True,
+            name=self.name,
+        )
+        program.add_transform(qml.transforms.broadcast_expand)
 
-        return self.measurements.generate_samples(len(self.wires), shots).astype(int, copy=False)
+        if exec_config.gradient_method == "adjoint":
+            _add_adjoint_transforms(program)
+        return program, exec_config
 
-    # pylint: disable=protected-access
-    def expval(self, observable, shot_range=None, bin_size=None):
-        """Expectation value of the supplied observable.
+    # pylint: disable=unused-argument
+    def execute(
+        self,
+        circuits: QuantumTape_or_Batch,
+        execution_config: ExecutionConfig = DefaultExecutionConfig,
+    ) -> Result_or_ResultBatch:
+        """Execute a circuit or a batch of circuits and turn it into results.
 
         Args:
-            observable: A PennyLane observable.
-            shot_range (tuple[int]): 2-tuple of integers specifying the range of samples
-                to use. If not specified, all samples are used.
-            bin_size (int): Divides the shot range into bins of size ``bin_size``, and
-                returns the measurement statistic separately over each bin. If not
-                provided, the entire shot range is treated as a single bin.
+            circuits (Union[QuantumTape, Sequence[QuantumTape]]): the quantum circuits to be executed
+            execution_config (ExecutionConfig): a datastructure with additional information required for execution
 
         Returns:
-            Expectation value of the observable
+            TensorLike, tuple[TensorLike], tuple[tuple[TensorLike]]: A numeric result of the computation.
         """
-        if isinstance(observable, qml.Projector):
-            diagonalizing_gates = observable.diagonalizing_gates()
-            if self.shots is None and diagonalizing_gates:
-                self.apply(diagonalizing_gates)
-            results = super().expval(observable, shot_range=shot_range, bin_size=bin_size)
-            if self.shots is None and diagonalizing_gates:
-                self.apply([qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)])
-            return results
-
-        if self.shots is not None:
-            # estimate the expectation value
-            samples = self.sample(observable, shot_range=shot_range, bin_size=bin_size)
-            return np.squeeze(np.mean(samples, axis=0))
-
-        if isinstance(observable, qml.SparseHamiltonian):
-            if self._mpi:
-                # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce
-                # host(cpu) memory requirements
-                obs = qml.Identity(0)
-                Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix()
-                H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1))
-                CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr()
-                # CSR_SparseHamiltonian for rank == 0
-                if self._mpi_manager.getRank() == 0:
-                    CSR_SparseHamiltonian = observable.sparse_matrix().tocsr()
-            else:
-                CSR_SparseHamiltonian = observable.sparse_matrix().tocsr()
-
-            return self.measurements.expval(
-                CSR_SparseHamiltonian.indptr,
-                CSR_SparseHamiltonian.indices,
-                CSR_SparseHamiltonian.data,
-            )
-
-        # use specialized functors to compute expval(Hermitian)
-        if isinstance(observable, qml.Hermitian):
-            observable_wires = self.map_wires(observable.wires)
-            if self._mpi and len(observable_wires) > self._num_local_wires:
-                raise RuntimeError(
-                    "MPI backend does not support Hermitian with number of target wires larger than local wire number."
+        results = []
+        for circuit in circuits:
+            if self._wire_map is not None:
+                [circuit], _ = qml.map_wires(circuit, self._wire_map)
+            results.append(
+                self.simulate(
+                    circuit,
+                    self._statevector,
                 )
-            matrix = observable.matrix()
-            return self.measurements.expval(matrix, observable_wires)
-
-        if (
-            isinstance(observable, qml.ops.Hamiltonian)
-            or (observable.arithmetic_depth > 0)
-            or isinstance(observable.name, List)
-        ):
-            ob_serialized = QuantumScriptSerializer(
-                self.short_name, self.use_csingle, self._mpi
-            )._ob(observable, self.wire_map)
-            return self.measurements.expval(ob_serialized)
+            )
 
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(observable.wires)
+        return tuple(results)
 
-        return self.measurements.expval(observable.name, observable_wires)
+    def supports_derivatives(
+        self,
+        execution_config: Optional[ExecutionConfig] = None,
+        circuit: Optional[qml.tape.QuantumTape] = None,
+    ) -> bool:
+        """Check whether or not derivatives are available for a given configuration and circuit.
 
-    def probability_lightning(self, wires=None):
-        """Return the probability of each computational basis state.
+        ``LightningGPU`` supports adjoint differentiation with analytic results.
 
         Args:
-            wires (Iterable[Number, str], Number, str, Wires): wires to return
-                marginal probabilities for. Wires not provided are traced out of the system.
+            execution_config (ExecutionConfig): The configuration of the desired derivative calculation
+            circuit (QuantumTape): An optional circuit to check derivatives support for.
 
         Returns:
-            array[float]: list of the probabilities
+            bool: Whether or not a derivative can be calculated provided the given information
+
         """
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(wires)
-        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
-        local_prob = self.measurements.probs(observable_wires)
-        if len(local_prob) > 0:
-            num_local_wires = len(local_prob).bit_length() - 1 if len(local_prob) > 0 else 0
-            return local_prob.reshape([2] * num_local_wires).transpose().reshape(-1)
-        return local_prob
-
-    def var(self, observable, shot_range=None, bin_size=None):
-        """Variance of the supplied observable.
+        # A missing config carries no gradient method to reject: decide on the circuit alone.
+        if execution_config is not None:
+            if execution_config.gradient_method not in {"adjoint", "best"}:
+                return False
+        if circuit is None:
+            return True
+        return _supports_adjoint(circuit=circuit)
+
+    def simulate(
+        self,
+        circuit: QuantumScript,
+        state: LightningGPUStateVector,
+    ) -> Result:
+        """Simulate a single quantum script.
 
         Args:
-            observable: A PennyLane observable.
-            shot_range (tuple[int]): 2-tuple of integers specifying the range of samples
-                to use. If not specified, all samples are used.
-            bin_size (int): Divides the shot range into bins of size ``bin_size``, and
-                returns the measurement statistic separately over each bin. If not
-                provided, the entire shot range is treated as a single bin.
+            circuit (QuantumTape): The single circuit to simulate
+            state (LightningGPUStateVector): handle to Lightning state vector
 
         Returns:
-            Variance of the observable
-        """
-        if isinstance(observable, qml.Projector):
-            diagonalizing_gates = observable.diagonalizing_gates()
-            if self.shots is None and diagonalizing_gates:
-                self.apply(diagonalizing_gates)
-            results = super().var(observable, shot_range=shot_range, bin_size=bin_size)
-            if self.shots is None and diagonalizing_gates:
-                self.apply([qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)])
-            return results
-
-        if self.shots is not None:
-            # estimate the var
-            # Lightning doesn't support sampling yet
-            samples = self.sample(observable, shot_range=shot_range, bin_size=bin_size)
-            return np.squeeze(np.var(samples, axis=0))
-
-        if isinstance(observable, qml.SparseHamiltonian):
-            csr_hamiltonian = observable.sparse_matrix(wire_order=self.wires).tocsr(copy=False)
-            return self.measurements.var(
-                csr_hamiltonian.indptr,
-                csr_hamiltonian.indices,
-                csr_hamiltonian.data,
-            )
+            Tuple[TensorLike]: The results of the simulation
 
-        if (
-            isinstance(observable, (qml.Hermitian, qml.ops.Hamiltonian))
-            or (observable.arithmetic_depth > 0)
-            or isinstance(observable.name, List)
-        ):
-            ob_serialized = QuantumScriptSerializer(
-                self.short_name, self.use_csingle, self._mpi
-            )._ob(observable, self.wire_map)
-            return self.measurements.var(ob_serialized)
-
-        # translate to wire labels used by device
-        observable_wires = self.map_wires(observable.wires)
+        Note that this function can return measurements for non-commuting observables simultaneously.
+        """
+        if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+            raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
 
-        return self.measurements.var(observable.name, observable_wires)
+        state.reset_state()
+        final_state = state.get_final_state(circuit)
+        return self.LightningMeasurements(final_state).measure_final_state(circuit)
diff --git a/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py b/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py
index 4338a5b87..bee481aac 100644
--- a/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_kokkos/_adjoint_jacobian.py
@@ -15,6 +15,10 @@
 Internal methods for adjoint Jacobian differentiation method.
 """
 
+from __future__ import annotations
+
+from warnings import warn
+
 try:
     from pennylane_lightning.lightning_kokkos_ops.algorithms import (
         AdjointJacobianC64,
@@ -22,8 +26,8 @@
         create_ops_listC64,
         create_ops_listC128,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 import numpy as np
 from pennylane.tape import QuantumTape
@@ -31,8 +35,6 @@
 # pylint: disable=ungrouped-imports
 from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
 
-from ._state_vector import LightningKokkosStateVector
-
 
 class LightningKokkosAdjointJacobian(LightningBaseAdjointJacobian):
     """Check and execute the adjoint Jacobian differentiation method.
@@ -44,7 +46,11 @@ class LightningKokkosAdjointJacobian(LightningBaseAdjointJacobian):
 
     # pylint: disable=too-few-public-methods
 
-    def __init__(self, qubit_state: LightningKokkosStateVector, batch_obs: bool = False) -> None:
+    def __init__(
+        self,
+        qubit_state: LightningKokkosStateVector,  # pylint: disable=undefined-variable
+        batch_obs: bool = False,
+    ) -> None:
         super().__init__(qubit_state, batch_obs)
 
         # Initialize the C++ binds
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index b438af350..ee848739c 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -15,11 +15,14 @@
 Class implementation for state vector measurements.
 """
 
-# pylint: disable=import-error, no-name-in-module, ungrouped-imports
+from __future__ import annotations
+
+from warnings import warn
+
 try:
     from pennylane_lightning.lightning_kokkos_ops import MeasurementsC64, MeasurementsC128
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from typing import List
 
@@ -28,6 +31,7 @@
 from pennylane.measurements import CountsMP, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
+# pylint: disable=ungrouped-imports
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
 
 
@@ -44,7 +48,7 @@ class LightningKokkosMeasurements(
 
     def __init__(
         self,
-        kokkos_state,
+        kokkos_state: LightningKokkosStateVector,  # pylint: disable=undefined-variable
     ) -> None:
         super().__init__(kokkos_state)
 
diff --git a/pennylane_lightning/lightning_kokkos/_state_vector.py b/pennylane_lightning/lightning_kokkos/_state_vector.py
index dda40ffad..9073a9dd8 100644
--- a/pennylane_lightning/lightning_kokkos/_state_vector.py
+++ b/pennylane_lightning/lightning_kokkos/_state_vector.py
@@ -14,6 +14,7 @@
 """
 Class implementation for lightning_kokkos state-vector manipulation.
 """
+from warnings import warn
 
 try:
     from pennylane_lightning.lightning_kokkos_ops import (
@@ -23,8 +24,10 @@
         allocate_aligned_array,
         print_configuration,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from typing import Union
 
 import numpy as np
 import pennylane as qml
@@ -59,17 +62,16 @@ class LightningKokkosStateVector(LightningBaseStateVector):
 
     def __init__(
         self,
-        num_wires,
-        dtype=np.complex128,
+        num_wires: int,
+        dtype: Union[np.complex128, np.complex64] = np.complex128,
         kokkos_args=None,
-        sync=True,
-    ):  # pylint: disable=too-many-arguments
+    ):
+
         super().__init__(num_wires, dtype)
 
         self._device_name = "lightning.kokkos"
 
         self._kokkos_config = {}
-        self._sync = sync
 
         # Initialize the state vector
         if kokkos_args is None:
@@ -143,7 +145,7 @@ def sync_d2h(self, state_vector):
 
         >>> dev = qml.device('lightning.kokkos', wires=1)
         >>> dev.apply([qml.PauliX(wires=[0])])
-        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
+        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.c_dtype)
         >>> dev.sync_d2h(state_vector)
         >>> print(state_vector)
         [0.+0.j 1.+0.j]
@@ -269,9 +271,10 @@ def _apply_lightning(
                 )
             elif isinstance(operation, qml.PauliRot):
                 method = getattr(state, "applyPauliRot")
+                # pylint: disable=protected-access
                 paulis = operation._hyperparameters["pauli_word"]
                 wires = [i for i, w in zip(wires, paulis) if w != "I"]
-                word = "".join(p for p in paulis if p != "I")  # pylint: disable=protected-access
+                word = "".join(p for p in paulis if p != "I")
                 method(wires, invert_param, operation.parameters, word)
             elif method is not None:  # apply specialized gate
                 param = operation.parameters
diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
index 51221dde9..b30ca1ad2 100644
--- a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
+++ b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -20,7 +20,7 @@
 from dataclasses import replace
 from functools import reduce
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union
 from warnings import warn
 
 import numpy as np
@@ -50,10 +50,6 @@
     Result_or_ResultBatch,
 )
 
-from ._adjoint_jacobian import LightningKokkosAdjointJacobian
-from ._measurements import LightningKokkosMeasurements
-from ._state_vector import LightningKokkosStateVector
-
 try:
     from pennylane_lightning.lightning_kokkos_ops import backend_info, print_configuration
 
@@ -63,6 +59,10 @@
     LK_CPP_BINARY_AVAILABLE = False
     backend_info = None
 
+from ._adjoint_jacobian import LightningKokkosAdjointJacobian
+from ._measurements import LightningKokkosMeasurements
+from ._state_vector import LightningKokkosStateVector
+
 # The set of supported operations.
 _operations = frozenset(
     {
@@ -289,13 +289,12 @@ class LightningKokkos(LightningBase):
 
     def __init__(  # pylint: disable=too-many-arguments
         self,
-        wires,
+        wires: Union[int, List],
         *,
-        c_dtype=np.complex128,
-        shots=None,
-        batch_obs=False,
+        c_dtype: Union[np.complex128, np.complex64] = np.complex128,
+        shots: Optional[Union[int, List]] = None,
+        batch_obs: bool = False,
         # Kokkos arguments
-        sync=True,
         kokkos_args=None,
     ):
         if not self._CPP_BINARY_AVAILABLE:
@@ -317,11 +316,10 @@ def __init__(  # pylint: disable=too-many-arguments
 
         # Kokkos specific options
         self._kokkos_args = kokkos_args
-        self._sync = sync
 
         # Creating the state vector
         self._statevector = self.LightningStateVector(
-            num_wires=len(self.wires), dtype=c_dtype, kokkos_args=kokkos_args, sync=sync
+            num_wires=len(self.wires), dtype=c_dtype, kokkos_args=kokkos_args
         )
 
         if not LightningKokkos.kokkos_config:
@@ -492,7 +490,7 @@ def simulate(
                     aux_circ, mid_measurements=mid_measurements, postselect_mode=postselect_mode
                 )
                 results.append(
-                    LightningKokkosMeasurements(final_state).measure_final_state(
+                    self.LightningMeasurements(final_state).measure_final_state(
                         aux_circ, mid_measurements=mid_measurements
                     )
                 )
@@ -500,7 +498,7 @@ def simulate(
 
         state.reset_state()
         final_state = state.get_final_state(circuit)
-        return LightningKokkosMeasurements(final_state).measure_final_state(circuit)
+        return self.LightningMeasurements(final_state).measure_final_state(circuit)
 
     @staticmethod
     def get_c_interface():
diff --git a/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py b/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py
index 0abc7f72f..390c0cf69 100644
--- a/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py
+++ b/pennylane_lightning/lightning_qubit/_adjoint_jacobian.py
@@ -14,6 +14,9 @@
 r"""
 Internal methods for adjoint Jacobian differentiation method.
 """
+from __future__ import annotations
+
+from warnings import warn
 
 try:
     from pennylane_lightning.lightning_qubit_ops.algorithms import (
@@ -22,8 +25,8 @@
         create_ops_listC64,
         create_ops_listC128,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from os import getenv
 
@@ -34,8 +37,6 @@
 # pylint: disable=ungrouped-imports
 from pennylane_lightning.core._adjoint_jacobian_base import LightningBaseAdjointJacobian
 
-from ._state_vector import LightningStateVector
-
 
 class LightningAdjointJacobian(
     LightningBaseAdjointJacobian
@@ -47,7 +48,12 @@ class LightningAdjointJacobian(
         batch_obs(bool): If serialized tape is to be batched or not.
     """
 
-    def __init__(self, qubit_state: LightningStateVector, batch_obs: bool = False) -> None:
+    def __init__(
+        self,
+        qubit_state: LightningStateVector,  # pylint: disable=undefined-variable
+        batch_obs: bool = False,
+    ) -> None:
+
         super().__init__(qubit_state, batch_obs)
 
         # Initialize the C++ binds
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index c1b97a118..415ce7408 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -16,10 +16,14 @@
 """
 
 # pylint: disable=import-error, no-name-in-module, ungrouped-imports
+from __future__ import annotations
+
+from warnings import warn
+
 try:
     from pennylane_lightning.lightning_qubit_ops import MeasurementsC64, MeasurementsC128
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
 
 from functools import reduce
 from typing import List
@@ -53,7 +57,7 @@ class LightningMeasurements(LightningBaseMeasurements):  # pylint: disable=too-f
 
     def __init__(
         self,
-        qubit_state,
+        qubit_state: LightningStateVector,  # pylint: disable=undefined-variable
         mcmc: bool = None,
         kernel_name: str = None,
         num_burnin: int = None,
diff --git a/pennylane_lightning/lightning_qubit/_state_vector.py b/pennylane_lightning/lightning_qubit/_state_vector.py
index b4b6ef5ff..62068dcbd 100644
--- a/pennylane_lightning/lightning_qubit/_state_vector.py
+++ b/pennylane_lightning/lightning_qubit/_state_vector.py
@@ -14,6 +14,7 @@
 """
 Class implementation for lightning_qubit state-vector manipulation.
 """
+from warnings import warn
 
 try:
     from pennylane_lightning.lightning_qubit_ops import (
@@ -21,8 +22,10 @@
         StateVectorC128,
         allocate_aligned_array,
     )
-except ImportError:
-    pass
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from typing import Union
 
 import numpy as np
 import pennylane as qml
@@ -50,7 +53,8 @@ class LightningStateVector(LightningBaseStateVector):  # pylint: disable=too-few
         device_name(string): state vector device name. Options: ["lightning.qubit"]
     """
 
-    def __init__(self, num_wires, dtype=np.complex128):
+    def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64] = np.complex128):
+
         super().__init__(num_wires, dtype)
 
         self._device_name = "lightning.qubit"
diff --git a/pennylane_lightning/lightning_qubit/lightning_qubit.py b/pennylane_lightning/lightning_qubit/lightning_qubit.py
index c317bbfba..abf080978 100644
--- a/pennylane_lightning/lightning_qubit/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit/lightning_qubit.py
@@ -18,7 +18,7 @@
 from dataclasses import replace
 from functools import reduce
 from pathlib import Path
-from typing import Optional, Sequence
+from typing import List, Optional, Sequence, Union
 from warnings import warn
 
 import numpy as np
@@ -48,10 +48,6 @@
     Result_or_ResultBatch,
 )
 
-from ._adjoint_jacobian import LightningAdjointJacobian
-from ._measurements import LightningMeasurements
-from ._state_vector import LightningStateVector
-
 try:
     from pennylane_lightning.lightning_qubit_ops import backend_info
 
@@ -60,6 +56,10 @@
     warn(str(ex), UserWarning)
     LQ_CPP_BINARY_AVAILABLE = False
 
+from ._adjoint_jacobian import LightningAdjointJacobian
+from ._measurements import LightningMeasurements
+from ._state_vector import LightningStateVector
+
 # The set of supported operations.
 _operations = frozenset(
     {
@@ -323,16 +323,16 @@ class LightningQubit(LightningBase):
 
     def __init__(  # pylint: disable=too-many-arguments
         self,
-        wires,
+        wires: Union[int, List],
         *,
-        c_dtype=np.complex128,
-        shots=None,
-        batch_obs=False,
+        c_dtype: Union[np.complex128, np.complex64] = np.complex128,
+        shots: Optional[Union[int, List]] = None,
+        batch_obs: bool = False,
         # Markov Chain Monte Carlo (MCMC) sampling method arguments
-        seed="global",
-        mcmc=False,
-        kernel_name="Local",
-        num_burnin=100,
+        seed: Union[str, int] = "global",
+        mcmc: bool = False,
+        kernel_name: str = "Local",
+        num_burnin: int = 100,
     ):
         if not self._CPP_BINARY_AVAILABLE:
             raise ImportError(
@@ -559,4 +559,4 @@ def simulate(
 
         state.reset_state()
         final_state = state.get_final_state(circuit)
-        return LightningMeasurements(final_state, **mcmc).measure_final_state(circuit)
+        return self.LightningMeasurements(final_state, **mcmc).measure_final_state(circuit)
diff --git a/tests/conftest.py b/tests/conftest.py
index a64841846..1c06ae0dc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,10 +142,15 @@ def get_device():
         from pennylane_lightning.lightning_kokkos_ops import LightningException
 elif device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
-
-    LightningAdjointJacobian = None
-    LightningMeasurements = None
-    LightningStateVector = None
+    from pennylane_lightning.lightning_gpu._adjoint_jacobian import (
+        LightningGPUAdjointJacobian as LightningAdjointJacobian,
+    )
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
 
     if hasattr(pennylane_lightning, "lightning_gpu_ops"):
         import pennylane_lightning.lightning_gpu_ops as lightning_ops
diff --git a/tests/lightning_qubit/test_adjoint_jacobian_class.py b/tests/lightning_qubit/test_adjoint_jacobian_class.py
index 236c697f1..7fbaa19b1 100644
--- a/tests/lightning_qubit/test_adjoint_jacobian_class.py
+++ b/tests/lightning_qubit/test_adjoint_jacobian_class.py
@@ -481,6 +481,7 @@ def test_hermitian_expectation(self, tol, lightning_sv):
             tape.trainable_params = {0}
 
             statevector.reset_state()
+
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)
@@ -498,6 +499,7 @@ def test_hermitian_tensor_expectation(self, tol, lightning_sv):
             tape.trainable_params = {0}
 
             statevector.reset_state()
+
             vjp = self.calculate_vjp(statevector, tape, dy)
 
             assert np.allclose(vjp, -0.8 * np.sin(x), atol=tol)
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index 471fb6de6..c5c61e054 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -669,8 +669,8 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
             assert np.allclose(r, e, atol=dtol, rtol=dtol)
 
     @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support out of order probs.",
+        device_name in ("lightning.gpu", "lightning.tensor"),
+        reason=f"{device_name} does not support out of order probs.",
     )
     @pytest.mark.parametrize(
         "cases",
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index 3918afcd5..b3baaa3ea 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -30,6 +30,9 @@
     except ImportError:
         pass
 
+if device_name == "lightning.gpu":
+    from pennylane_lightning.lightning_gpu._mpi_handler import MPIHandler
+
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
@@ -39,6 +42,7 @@
         allow_module_level=True,
     )
 
+
 if not LightningDevice._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
@@ -86,10 +90,16 @@ def test_apply_state_vector_with_lightning_handle(tol):
     state_vector_1 = LightningStateVector(2)
     state_vector_1.apply_operations([qml.BasisState(np.array([0, 1]), wires=[0, 1])])
 
     state_vector_2 = LightningStateVector(2)
-    state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+    if device_name == "lightning.gpu":
+        with pytest.raises(
+            qml.DeviceError, match="LightningGPU does not support allocate external state_vector."
+        ):
+            state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+    else:
+        state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
 
-    assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
+        assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
 
 
 @pytest.mark.parametrize(
 
 @pytest.mark.parametrize(
diff --git a/tests/new_api/test_device.py b/tests/new_api/test_device.py
index 0485f3a05..111dd3af7 100644
--- a/tests/new_api/test_device.py
+++ b/tests/new_api/test_device.py
@@ -43,8 +43,7 @@
         validate_measurements,
         validate_observables,
     )
-
-if device_name == "lightning.kokkos":
+elif device_name == "lightning.kokkos":
     from pennylane_lightning.lightning_kokkos.lightning_kokkos import (
         _add_adjoint_transforms,
         _adjoint_ops,
@@ -62,13 +61,31 @@
         validate_measurements,
         validate_observables,
     )
-
-
-if device_name == "lightning.tensor":
+elif device_name == "lightning.gpu":
+    from pennylane_lightning.lightning_gpu.lightning_gpu import (
+        _add_adjoint_transforms,
+        _adjoint_ops,
+        _supports_adjoint,
+        accepted_observables,
+        adjoint_measurements,
+        adjoint_observables,
+        decompose,
+        mid_circuit_measurements,
+        no_sampling,
+        stopping_condition,
+        stopping_condition_shots,
+        validate_adjoint_trainable_params,
+        validate_device_wires,
+        validate_measurements,
+        validate_observables,
+    )
+elif device_name == "lightning.tensor":
     from pennylane_lightning.lightning_tensor.lightning_tensor import (
         accepted_observables,
         stopping_condition,
     )
+else:
+    raise TypeError(f"The device name: {device_name} is not a valid name")
 
 if not LightningDevice._new_API:
     pytest.skip("Exclusive tests for new device API. Skipping.", allow_module_level=True)
@@ -448,6 +465,11 @@ def test_execute_single_measurement(self, theta, phi, mp, dev):
         if isinstance(mp.obs, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             mp.obs = qml.operation.convert_to_legacy_H(mp.obs)
 
+        if isinstance(mp.obs, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         qs = QuantumScript(
             [
                 qml.RX(phi, 0),
@@ -641,6 +663,12 @@ def test_supports_derivatives(self, dev, config, tape, expected, batch_obs):
             qml.Z(1) + qml.X(1),
             qml.Hamiltonian([-1.0, 1.5], [qml.Z(1), qml.X(1)]),
             qml.Hermitian(qml.Hadamard.compute_matrix(), 0),
+            qml.SparseHamiltonian(
+                qml.Hamiltonian([-1.0, 1.5], [qml.Z(1), qml.X(1)]).sparse_matrix(
+                    wire_order=[0, 1, 2]
+                ),
+                wires=[0, 1, 2],
+            ),
             qml.Projector([1], 1),
         ],
     )
@@ -649,6 +677,11 @@ def test_derivatives_single_expval(
         self, theta, phi, dev, obs, execute_and_derivatives, batch_obs
     ):
         """Test that the jacobian is correct when a tape has a single expectation value"""
+        if isinstance(obs, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         if isinstance(obs, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             obs = qml.operation.convert_to_legacy_H(obs)
 
@@ -705,6 +738,11 @@ def test_derivatives_multi_expval(
         self, theta, phi, omega, dev, obs1, obs2, execute_and_derivatives, batch_obs
     ):
         """Test that the jacobian is correct when a tape has multiple expectation values"""
+        if isinstance(obs2, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         if isinstance(obs1, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             obs1 = qml.operation.convert_to_legacy_H(obs1)
         if isinstance(obs2, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
@@ -1074,6 +1112,11 @@ def test_vjp_multi_expval(
         self, theta, phi, omega, dev, obs1, obs2, execute_and_derivatives, batch_obs
     ):
         """Test that the VJP is correct when a tape has multiple expectation values"""
+        if isinstance(obs2, qml.SparseHamiltonian) and dev.dtype == np.complex64:
+            pytest.skip(
+                reason="The conversion from qml.Hamiltonian to SparseHamiltonian is only possible with np.complex128"
+            )
+
         if isinstance(obs1, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             obs1 = qml.operation.convert_to_legacy_H(obs1)
         if isinstance(obs2, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
diff --git a/tests/test_var.py b/tests/test_var.py
index 4b4e8561f..7bdcec2c2 100644
--- a/tests/test_var.py
+++ b/tests/test_var.py
@@ -24,7 +24,6 @@
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
-
 np.random.seed(42)